From e078de03788353b220f3d501fc3607cc92db28c1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 3 Jul 2017 06:37:32 -0700 Subject: virtio_net: Remove references to NETIF_F_UFO. It is going away. Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 99a26a9efec1..99830167ea2f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2429,7 +2429,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { - dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO + dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6; } /* Individual feature bits: what can host handle? */ @@ -2439,13 +2439,11 @@ static int virtnet_probe(struct virtio_device *vdev) dev->hw_features |= NETIF_F_TSO6; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN)) dev->hw_features |= NETIF_F_TSO_ECN; - if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO)) - dev->hw_features |= NETIF_F_UFO; dev->features |= NETIF_F_GSO_ROBUST; if (gso) - dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO); + dev->features |= dev->hw_features & NETIF_F_ALL_TSO; /* (!csum && gso) case will be fixed by register_netdev() */ } if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) -- cgit v1.2.3 From 28b39bc7c57e7920b0cbba3d79ba0f134e0f76f0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 19 Jul 2017 16:54:46 +0800 Subject: virtio-net: pack headroom into ctx for mergeable buffers Pack headroom into ctx - this way when we get a buffer we can figure out the actual headroom that was allocated for the buffer. Will be helpful to optimize switching between XDP and non-XDP modes which have different headroom requirements. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 99830167ea2f..3223a36ad9a8 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -270,6 +270,23 @@ static void skb_xmit_done(struct virtqueue *vq) netif_wake_subqueue(vi->dev, vq2txq(vq)); } +#define MRG_CTX_HEADER_SHIFT 22 +static void *mergeable_len_to_ctx(unsigned int truesize, + unsigned int headroom) +{ + return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize); +} + +static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx) +{ + return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT; +} + +static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) +{ + return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1); +} + /* Called from bottom half context */ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, @@ -639,13 +656,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } rcu_read_unlock(); - if (unlikely(len > (unsigned long)ctx)) { + truesize = mergeable_ctx_to_truesize(ctx); + if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)ctx); dev->stats.rx_length_errors++; goto err_skb; } - truesize = (unsigned long)ctx; + head_skb = page_to_skb(vi, rq, page, offset, len, truesize); curr_skb = head_skb; @@ -665,13 +683,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } page = virt_to_head_page(buf); - if (unlikely(len > (unsigned long)ctx)) { + + truesize = mergeable_ctx_to_truesize(ctx); + if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)ctx); dev->stats.rx_length_errors++; goto err_skb; } - truesize = (unsigned long)ctx; num_skb_frags = skb_shinfo(curr_skb)->nr_frags; if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { @@ -889,7 +908,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; buf += headroom; /* advance address leaving hole at front of pkt */ - ctx = (void *)(unsigned long)len; + ctx = mergeable_len_to_ctx(len, headroom); get_page(alloc_frag->page); alloc_frag->offset += len + headroom; hole = alloc_frag->size - alloc_frag->offset; -- cgit v1.2.3 From 192f68cf35f5eefe28ce8acbb9a3dfc747149b64 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 19 Jul 2017 16:54:47 +0800 Subject: virtio-net: switch to use new ctx API for small buffer Use ctx API to store headroom for small buffers. Following patches will retrieve this info and use it for XDP. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3223a36ad9a8..a8a5968a8c61 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -410,7 +410,8 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi) static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, - void *buf, unsigned int len) + void *buf, void *ctx, + unsigned int len) { struct sk_buff *skb; struct bpf_prog *xdp_prog; @@ -773,7 +774,7 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len); else - skb = receive_small(dev, vi, rq, buf, len); + skb = receive_small(dev, vi, rq, buf, ctx, len); if (unlikely(!skb)) return 0; @@ -806,12 +807,18 @@ frame_err: return 0; } +/* Unlike mergeable buffers, all buffers are allocated to the + * same size, except for the headroom. For this reason we do + * not need to use mergeable_len_to_ctx here - it is enough + * to store the headroom as the context ignoring the truesize. + */ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; char *buf; unsigned int xdp_headroom = virtnet_get_headroom(vi); + void *ctx = (void *)(unsigned long)xdp_headroom; int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom; int err; @@ -825,7 +832,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, alloc_frag->offset += len; sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom, vi->hdr_len + GOOD_PACKET_LEN); - err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp); + err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) put_page(virt_to_head_page(buf)); @@ -1034,7 +1041,7 @@ static int virtnet_receive(struct receive_queue *rq, int budget) void *buf; struct virtnet_stats *stats = this_cpu_ptr(vi->stats); - if (vi->mergeable_rx_bufs) { + if (!vi->big_packets || vi->mergeable_rx_bufs) { void *ctx; while (received < budget && @@ -2202,7 +2209,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi) names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL); if (!names) goto err_names; - if (vi->mergeable_rx_bufs) { + if (!vi->big_packets || vi->mergeable_rx_bufs) { ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL); if (!ctx) goto err_ctx; -- cgit v1.2.3 From 4941d472bf95b4345d6e38906fcf354e74afa311 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 19 Jul 2017 16:54:48 +0800 Subject: virtio-net: do not reset during XDP set We currently reset the device during XDP set, the main reason is that we allocate more headroom with XDP (for header adjustment). This works but causes network downtime for users. Previous patches encoded the headroom in the buffer context, this makes it possible to detect the case where a buffer with headroom insufficient for XDP is added to the queue and XDP is enabled afterwards. Upon detection, we handle this case by copying the packet (slow, but it's a temporary condition). Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 232 ++++++++++++++++++++++------------------------- 1 file changed, 106 insertions(+), 126 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a8a5968a8c61..f894713dca2a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -407,6 +407,69 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi) return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0; } +/* We copy the packet for XDP in the following cases: + * + * 1) Packet is scattered across multiple rx buffers. + * 2) Headroom space is insufficient. + * + * This is inefficient but it's a temporary condition that + * we hit right after XDP is enabled and until queue is refilled + * with large buffers with sufficient headroom - so it should affect + * at most queue size packets. + * Afterwards, the conditions to enable + * XDP should preclude the underlying device from sending packets + * across multiple buffers (num_buf > 1), and we make sure buffers + * have enough headroom. + */ +static struct page *xdp_linearize_page(struct receive_queue *rq, + u16 *num_buf, + struct page *p, + int offset, + int page_off, + unsigned int *len) +{ + struct page *page = alloc_page(GFP_ATOMIC); + + if (!page) + return NULL; + + memcpy(page_address(page) + page_off, page_address(p) + offset, *len); + page_off += *len; + + while (--*num_buf) { + unsigned int buflen; + void *buf; + int off; + + buf = virtqueue_get_buf(rq->vq, &buflen); + if (unlikely(!buf)) + goto err_buf; + + p = virt_to_head_page(buf); + off = buf - page_address(p); + + /* guard against a misconfigured or uncooperative backend that + * is sending packet larger than the MTU. + */ + if ((page_off + buflen) > PAGE_SIZE) { + put_page(p); + goto err_buf; + } + + memcpy(page_address(page) + page_off, + page_address(p) + off, buflen); + page_off += buflen; + put_page(p); + } + + /* Headroom does not contribute to packet length */ + *len = page_off - VIRTIO_XDP_HEADROOM; + return page; +err_buf: + __free_pages(page, 0); + return NULL; +} + static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, @@ -415,12 +478,14 @@ static struct sk_buff *receive_small(struct net_device *dev, { struct sk_buff *skb; struct bpf_prog *xdp_prog; - unsigned int xdp_headroom = virtnet_get_headroom(vi); + unsigned int xdp_headroom = (unsigned long)ctx; unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom; unsigned int headroom = vi->hdr_len + header_offset; unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page *page = virt_to_head_page(buf); unsigned int delta = 0; + struct page *xdp_page; len -= vi->hdr_len; rcu_read_lock(); @@ -434,6 +499,27 @@ static struct sk_buff *receive_small(struct net_device *dev, if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) goto err_xdp; + if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) { + int offset = buf - page_address(page) + header_offset; + unsigned int tlen = len + vi->hdr_len; + u16 num_buf = 1; + + xdp_headroom = virtnet_get_headroom(vi); + header_offset = VIRTNET_RX_PAD + xdp_headroom; + headroom = vi->hdr_len + header_offset; + buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + xdp_page = xdp_linearize_page(rq, &num_buf, page, + offset, header_offset, + &tlen); + if (!xdp_page) + goto err_xdp; + + buf = page_address(xdp_page); + put_page(page); + 
page = xdp_page; + } + xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; xdp.data_end = xdp.data + len; @@ -462,7 +548,7 @@ static struct sk_buff *receive_small(struct net_device *dev, skb = build_skb(buf, buflen); if (!skb) { - put_page(virt_to_head_page(buf)); + put_page(page); goto err; } skb_reserve(skb, headroom - delta); @@ -478,7 +564,7 @@ err: err_xdp: rcu_read_unlock(); dev->stats.rx_dropped++; - put_page(virt_to_head_page(buf)); + put_page(page); xdp_xmit: return NULL; } @@ -503,66 +589,6 @@ err: return NULL; } -/* The conditions to enable XDP should preclude the underlying device from - * sending packets across multiple buffers (num_buf > 1). However per spec - * it does not appear to be illegal to do so but rather just against convention. - * So in order to avoid making a system unresponsive the packets are pushed - * into a page and the XDP program is run. This will be extremely slow and we - * push a warning to the user to fix this as soon as possible. Fixing this may - * require resolving the underlying hardware to determine why multiple buffers - * are being received or simply loading the XDP program in the ingress stack - * after the skb is built because there is no advantage to running it here - * anymore. - */ -static struct page *xdp_linearize_page(struct receive_queue *rq, - u16 *num_buf, - struct page *p, - int offset, - unsigned int *len) -{ - struct page *page = alloc_page(GFP_ATOMIC); - unsigned int page_off = VIRTIO_XDP_HEADROOM; - - if (!page) - return NULL; - - memcpy(page_address(page) + page_off, page_address(p) + offset, *len); - page_off += *len; - - while (--*num_buf) { - unsigned int buflen; - void *buf; - int off; - - buf = virtqueue_get_buf(rq->vq, &buflen); - if (unlikely(!buf)) - goto err_buf; - - p = virt_to_head_page(buf); - off = buf - page_address(p); - - /* guard against a misconfigured or uncooperative backend that - * is sending packet larger than the MTU. 
- */ - if ((page_off + buflen) > PAGE_SIZE) { - put_page(p); - goto err_buf; - } - - memcpy(page_address(page) + page_off, - page_address(p) + off, buflen); - page_off += buflen; - put_page(p); - } - - /* Headroom does not contribute to packet length */ - *len = page_off - VIRTIO_XDP_HEADROOM; - return page; -err_buf: - __free_pages(page, 0); - return NULL; -} - static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, @@ -577,6 +603,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct sk_buff *head_skb, *curr_skb; struct bpf_prog *xdp_prog; unsigned int truesize; + unsigned int headroom = mergeable_ctx_to_headroom(ctx); head_skb = NULL; @@ -589,10 +616,13 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, u32 act; /* This happens when rx buffer size is underestimated */ - if (unlikely(num_buf > 1)) { + if (unlikely(num_buf > 1 || + headroom < virtnet_get_headroom(vi))) { /* linearize data for XDP */ xdp_page = xdp_linearize_page(rq, &num_buf, - page, offset, &len); + page, offset, + VIRTIO_XDP_HEADROOM, + &len); if (!xdp_page) goto err_xdp; offset = VIRTIO_XDP_HEADROOM; @@ -835,7 +865,6 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) put_page(virt_to_head_page(buf)); - return err; } @@ -1840,7 +1869,6 @@ static void virtnet_freeze_down(struct virtio_device *vdev) } static int init_vqs(struct virtnet_info *vi); -static void _remove_vq_common(struct virtnet_info *vi); static int virtnet_restore_up(struct virtio_device *vdev) { @@ -1869,39 +1897,6 @@ static int virtnet_restore_up(struct virtio_device *vdev) return err; } -static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp) -{ - struct virtio_device *dev = vi->vdev; - int ret; - - virtio_config_disable(dev); - dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; - virtnet_freeze_down(dev); - _remove_vq_common(vi); - - virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); - virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); - - ret = virtio_finalize_features(dev); - if (ret) - goto err; - - vi->xdp_queue_pairs = xdp_qp; - ret = virtnet_restore_up(dev); - if (ret) - goto err; - ret = _virtnet_set_queues(vi, curr_qp); - if (ret) - goto err; - - virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); - virtio_config_enable(dev); - return 0; -err: - virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); - return ret; -} - static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { @@ -1948,35 +1943,29 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, return PTR_ERR(prog); } - /* Changing the headroom in buffers is a disruptive operation because - * existing buffers must be flushed and reallocated. This will happen - * when a xdp program is initially added or xdp is disabled by removing - * the xdp program resulting in number of XDP queues changing. - */ - if (vi->xdp_queue_pairs != xdp_qp) { - err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp); - if (err) { - dev_warn(&dev->dev, "XDP reset failure.\n"); - goto virtio_reset_err; - } - } + /* Make sure NAPI is not using any XDP TX queues for RX. 
*/ + for (i = 0; i < vi->max_queue_pairs; i++) + napi_disable(&vi->rq[i].napi); netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); + err = _virtnet_set_queues(vi, curr_qp + xdp_qp); + if (err) + goto err; + vi->xdp_queue_pairs = xdp_qp; for (i = 0; i < vi->max_queue_pairs; i++) { old_prog = rtnl_dereference(vi->rq[i].xdp_prog); rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); } return 0; -virtio_reset_err: - /* On reset error do our best to unwind XDP changes inflight and return - * error up to user space for resolution. The underlying reset hung on - * us so not much we can do here. - */ +err: + for (i = 0; i < vi->max_queue_pairs; i++) + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); return err; @@ -2622,15 +2611,6 @@ free: return err; } -static void _remove_vq_common(struct virtnet_info *vi) -{ - vi->vdev->config->reset(vi->vdev); - free_unused_bufs(vi); - _free_receive_bufs(vi); - free_receive_page_frags(vi); - virtnet_del_vqs(vi); -} - static void remove_vq_common(struct virtnet_info *vi) { vi->vdev->config->reset(vi->vdev); -- cgit v1.2.3 From 3f93522ffab2d46a36b57adf324a54e674fc9536 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 19 Jul 2017 16:54:49 +0800 Subject: virtio-net: switch off offloads on demand if possible on XDP set Current XDP implementation wants guest offloads feature to be disabled on device. This is inconvenient and means guest can't benefit from offloads if XDP is not used. This patch tries to address this limitation by disabling the offloads on demand through control guest offloads. Guest offloads will be disabled and enabled on demand on XDP set. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 70 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 5 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f894713dca2a..d4751ce23b4f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -57,6 +57,11 @@ DECLARE_EWMA(pkt_len, 0, 64) #define VIRTNET_DRIVER_VERSION "1.0.0" +const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_TSO4, + VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_ECN, + VIRTIO_NET_F_GUEST_UFO }; + struct virtnet_stats { struct u64_stats_sync tx_syncp; struct u64_stats_sync rx_syncp; @@ -164,10 +169,13 @@ struct virtnet_info { u8 ctrl_promisc; u8 ctrl_allmulti; u16 ctrl_vid; + u64 ctrl_offloads; /* Ethtool settings */ u8 duplex; u32 speed; + + unsigned long guest_offloads; }; struct padded_vnet_hdr { @@ -1897,6 +1905,47 @@ static int virtnet_restore_up(struct virtio_device *vdev) return err; } +static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads) +{ + struct scatterlist sg; + vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads); + + sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS, + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) { + dev_warn(&vi->dev->dev, "Fail to set guest offload. 
\n"); + return -EINVAL; + } + + return 0; +} + +static int virtnet_clear_guest_offloads(struct virtnet_info *vi) +{ + u64 offloads = 0; + + if (!vi->guest_offloads) + return 0; + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM)) + offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM; + + return virtnet_set_guest_offloads(vi, offloads); +} + +static int virtnet_restore_guest_offloads(struct virtnet_info *vi) +{ + u64 offloads = vi->guest_offloads; + + if (!vi->guest_offloads) + return 0; + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM)) + offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM; + + return virtnet_set_guest_offloads(vi, offloads); +} + static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { @@ -1906,10 +1955,11 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, u16 xdp_qp = 0, curr_qp; int i, err; - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || - virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || - virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || - virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) { + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) + && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) { NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first"); return -EOPNOTSUPP; } @@ -1956,6 +2006,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, for (i = 0; i < vi->max_queue_pairs; i++) { old_prog = rtnl_dereference(vi->rq[i].xdp_prog); rcu_assign_pointer(vi->rq[i].xdp_prog, prog); + if (i == 0) { + if (!old_prog) + virtnet_clear_guest_offloads(vi); + if (!prog) + virtnet_restore_guest_offloads(vi); + } if (old_prog) bpf_prog_put(old_prog); virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); @@ -2591,6 +2647,10 @@ static int virtnet_probe(struct virtio_device *vdev) netif_carrier_on(dev); } + for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) + if (virtio_has_feature(vi->vdev, guest_offloads[i])) + set_bit(guest_offloads[i], &vi->guest_offloads); + pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", dev->name, max_queue_pairs); @@ -2687,7 +2747,7 @@ static struct virtio_device_id id_table[] = { VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ - VIRTIO_NET_F_MTU + VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS static unsigned int features[] = { VIRTNET_FEATURES, -- cgit v1.2.3 From cfa0ebc9d6d6308564f5174ecb655b9d504b2be5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 24 Jul 2017 15:38:32 +0200 Subject: virtio-net: fix module unloading Unregister the driver before removing multi-instance hotplug callbacks. This order avoids the warning issued from __cpuhp_remove_state_cpuslocked when the number of remaining instances isn't yet zero. Fixes: 8017c279196a ("net/virtio-net: Convert to hotplug state machine") Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Jones Signed-off-by: Michael S. 
Tsirkin --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 99a26a9efec1..f41ab0ea942a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2743,9 +2743,9 @@ module_init(virtio_net_driver_init); static __exit void virtio_net_driver_exit(void) { + unregister_virtio_driver(&virtio_net_driver); cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); cpuhp_remove_multi_state(virtionet_online); - unregister_virtio_driver(&virtio_net_driver); } module_exit(virtio_net_driver_exit); -- cgit v1.2.3 From 67a75194bce07669bf11e14a24a95f64ebde8b47 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Jul 2017 17:35:50 +0200 Subject: virtio-net: mark PM functions as __maybe_unused After removing the reset function, the freeze and restore functions are now unused when CONFIG_PM_SLEEP is disabled: drivers/net/virtio_net.c:1881:12: error: 'virtnet_restore_up' defined but not used [-Werror=unused-function] static int virtnet_restore_up(struct virtio_device *vdev) drivers/net/virtio_net.c:1859:13: error: 'virtnet_freeze_down' defined but not used [-Werror=unused-function] static void virtnet_freeze_down(struct virtio_device *vdev) A more robust way to do this is to remove the #ifdef around the callers and instead mark them as __maybe_unused. The compiler will now just silently drop the unused code. Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set") Signed-off-by: Arnd Bergmann Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d4751ce23b4f..1902701e15a9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2702,8 +2702,7 @@ static void virtnet_remove(struct virtio_device *vdev) free_netdev(vi->dev); } -#ifdef CONFIG_PM_SLEEP -static int virtnet_freeze(struct virtio_device *vdev) +static __maybe_unused int virtnet_freeze(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; @@ -2714,7 +2713,7 @@ static int virtnet_freeze(struct virtio_device *vdev) return 0; } -static int virtnet_restore(struct virtio_device *vdev) +static __maybe_unused int virtnet_restore(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; int err; @@ -2730,7 +2729,6 @@ static int virtnet_restore(struct virtio_device *vdev) return 0; } -#endif static struct virtio_device_id id_table[] = { { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, -- cgit v1.2.3 From 1daa8790d0280d2c719658e39bd59fce65efa909 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 31 Jul 2017 21:49:49 +0300 Subject: virtio_net: fix truesize for mergeable buffers Seth Forshee noticed a performance degradation with some workloads. This turns out to be due to packet drops. Euan Kemp noticed that this is because we drop all packets where length exceeds the truesize, but for some packets we add in extra memory without updating the truesize. This in turn was kept around unchanged from ab7db91705e95 ("virtio-net: auto-tune mergeable rx buffer size for improved performance"). That commit had an internal reason not to account for the extra space: not enough bits to do it. No longer true so let's account for the allocated length exactly. Many thanks to Seth Forshee for the report and bisecting and Euan Kemp for debugging the issue. 
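For illustration, a condensed sketch of the corrected ordering in add_recvbuf_mergeable() (simplified from the patch below; the surrounding alloc_frag bookkeeping is elided):

	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + headroom) {
		/* remaining fragment space is folded into this buffer... */
		len += hole;
		alloc_frag->offset += hole;
	}
	sg_init_one(rq->sg, buf, len);
	/* ...so record the truesize in ctx only after that adjustment */
	ctx = (void *)(unsigned long)len;

With ctx derived from the final length, the receive path's len > truesize comparison no longer drops packets that legitimately occupy the extra space.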
Fixes: 680557cf79f8 ("virtio_net: rework mergeable buffer handling") Reported-by: Euan Kemp Tested-by: Euan Kemp Reported-by: Seth Forshee Tested-by: Seth Forshee Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 99a26a9efec1..075dc18829ab 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -889,21 +889,20 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; buf += headroom; /* advance address leaving hole at front of pkt */ - ctx = (void *)(unsigned long)len; get_page(alloc_frag->page); alloc_frag->offset += len + headroom; hole = alloc_frag->size - alloc_frag->offset; if (hole < len + headroom) { /* To avoid internal fragmentation, if there is very likely not * enough space for another buffer, add the remaining space to - * the current buffer. This extra space is not included in - * the truesize stored in ctx. + * the current buffer. */ len += hole; alloc_frag->offset += hole; } sg_init_one(rq->sg, buf, len); + ctx = (void *)(unsigned long)len; err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) put_page(virt_to_head_page(buf)); -- cgit v1.2.3 From 7acd43296889a71ecf9c20498465f672e667a9ad Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 12 Aug 2017 22:45:53 +0100 Subject: virtio-net: make array guest_offloads static The array guest_offloads is local to the source and does not need to be in global scope, so make it static. Also tweak formatting. Cleans up sparse warnings: symbol 'guest_offloads' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index e90de2186ffc..a3f3c66b4530 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -57,10 +57,12 @@ DECLARE_EWMA(pkt_len, 0, 64) #define VIRTNET_DRIVER_VERSION "1.0.0" -const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_TSO4, - VIRTIO_NET_F_GUEST_TSO6, - VIRTIO_NET_F_GUEST_ECN, - VIRTIO_NET_F_GUEST_UFO }; +static const unsigned long guest_offloads[] = { + VIRTIO_NET_F_GUEST_TSO4, + VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_ECN, + VIRTIO_NET_F_GUEST_UFO +}; struct virtnet_stats { struct u64_stats_sync tx_syncp; -- cgit v1.2.3 From a4a765031d007191c36a25edb9df2fa23c8e6496 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 15 Aug 2017 10:29:17 -0700 Subject: virtio: put paren around sizeof Kernel coding style is to put paren around operand of sizeof. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a3f3c66b4530..4302f313d9a7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -319,7 +319,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, hdr_len = vi->hdr_len; if (vi->mergeable_rx_bufs) - hdr_padded_len = sizeof *hdr; + hdr_padded_len = sizeof(*hdr); else hdr_padded_len = sizeof(struct padded_vnet_hdr); -- cgit v1.2.3 From 718ad681eff47d3d04879ff5f5290bdab0b8bad6 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 18 Aug 2017 13:46:24 -0700 Subject: net: drop unused attribute argument from sysfs queue funcs The show and store functions don't need/use the attribute. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- include/linux/netdevice.h | 5 ++--- net/core/net-sysfs.c | 37 +++++++++++-------------------------- 3 files changed, 14 insertions(+), 30 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4302f313d9a7..52ae78ca3d38 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2376,7 +2376,7 @@ err: #ifdef CONFIG_SYSFS static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) + char *buf) { struct virtnet_info *vi = netdev_priv(queue->dev); unsigned int queue_index = get_netdev_rx_queue_index(queue); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0c928598dab..c5475b37a631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -694,10 +694,9 @@ struct netdev_rx_queue { */ struct rx_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); + ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); + const char *buf, size_t len); }; #ifdef CONFIG_XPS diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 76ec74d4a65b..48714c8024f3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -661,7 +661,7 @@ static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, @@ -673,7 +673,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops rx_queue_sysfs_ops = { @@ -682,8 +682,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { }; #ifdef CONFIG_RPS -static ssize_t show_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) +static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; @@ -706,8 +705,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, } static ssize_t store_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, - const char *buf, size_t len) + const char *buf, size_t len) { struct rps_map *old_map, *map; cpumask_var_t mask; @@ -765,7 
+763,6 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf) { struct rps_dev_flow_table *flow_table; @@ -788,8 +785,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; @@ -975,10 +971,9 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf); + ssize_t (*show)(struct netdev_queue *queue, char *buf); ssize_t (*store)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, const char *buf, size_t len); + const char *buf, size_t len); }; #define to_netdev_queue_attr(_attr) container_of(_attr, \ struct netdev_queue_attribute, attr) @@ -994,7 +989,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1007,7 +1002,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1016,7 +1011,6 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { }; static ssize_t show_trans_timeout(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { unsigned long trans_timeout; @@ -1040,7 +1034,6 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) } static ssize_t show_traffic_class(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { struct net_device *dev = queue->dev; @@ -1055,14 +1048,12 @@ static ssize_t show_traffic_class(struct netdev_queue *queue, #ifdef CONFIG_XPS static ssize_t show_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { return sprintf(buf, "%lu\n", queue->tx_maxrate); } static ssize_t set_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1130,7 +1121,6 @@ static ssize_t bql_set(const char *buf, const size_t count, } static ssize_t bql_show_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1139,7 +1129,6 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, } static ssize_t bql_set_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct dql *dql = &queue->dql; @@ -1160,7 +1149,6 @@ static struct netdev_queue_attribute bql_hold_time_attribute = bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1173,14 +1161,12 @@ static struct netdev_queue_attribute bql_inflight_attribute = #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ char *buf) \ 
{ \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1211,7 +1197,7 @@ static const struct attribute_group dql_group = { #ifdef CONFIG_XPS static ssize_t show_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) + char *buf) { struct net_device *dev = queue->dev; int cpu, len, num_tc = 1, tc = 0; @@ -1258,8 +1244,7 @@ static ssize_t show_xps_map(struct netdev_queue *queue, } static ssize_t store_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) + const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned long index; -- cgit v1.2.3 From dadc0736f7be553a25ad34dc437ae379c5ab4a68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Aug 2017 09:02:49 -0700 Subject: virtio_net: be drop monitor friendly This change is needed to not fool drop monitor. (perf record ... -e skb:kfree_skb ) Packets were properly sent and are consumed after TX completion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 98f17b05c68b..b06169ea60dc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1058,7 +1058,7 @@ static void free_old_xmit_skbs(struct send_queue *sq) bytes += skb->len; packets++; - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); } /* Avoid overhead when no packets have been processed -- cgit v1.2.3 From 9457642a405fd68d5ce90f5c432e462277fb666e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:41 +0800 Subject: virtio-net: remove unnecessary parameter of virtnet_xdp_xmit() CC: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 511f8339fa96..a0ef4b08f0e9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -373,7 +373,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, } static bool virtnet_xdp_xmit(struct virtnet_info *vi, - struct receive_queue *rq, struct xdp_buff *xdp) { struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -542,7 +541,7 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp))) + if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); rcu_read_unlock(); goto xdp_xmit; @@ -677,7 +676,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp))) + if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); if (unlikely(xdp_page != page)) -- cgit v1.2.3 From 312403453532b209879aa7faeff296bd3dea78af Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:42 +0800 Subject: virtio-net: add packet len average only when needed during XDP There's no need to add packet len average in the case of XDP_PASS since it will be done soon after skb is created. 
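As background, a minimal sketch of the EWMA helpers involved (generated by the driver's DECLARE_EWMA(pkt_len, 0, 64); the variable names here are illustrative, not driver code):

	struct ewma_pkt_len avg;
	unsigned long est;

	ewma_pkt_len_init(&avg);
	ewma_pkt_len_add(&avg, len);	/* one sample per received packet */
	est = ewma_pkt_len_read(&avg);	/* later used to size rx buffers */

Because the estimate sizes future mergeable receive buffers, each packet should contribute exactly one sample; hoisting the act != XDP_PASS check lets the XDP_PASS case defer that sample to the ordinary skb path.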
Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a0ef4b08f0e9..db5924c085aa 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -656,6 +656,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); + if (act != XDP_PASS) + ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); + switch (act) { case XDP_PASS: /* recalculate offset to account for any header @@ -671,14 +674,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, put_page(page); head_skb = page_to_skb(vi, rq, xdp_page, offset, len, PAGE_SIZE); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); return head_skb; } break; case XDP_TX: if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); if (unlikely(xdp_page != page)) goto err_xdp; rcu_read_unlock(); @@ -690,7 +691,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, case XDP_DROP: if (unlikely(xdp_page != page)) __free_pages(xdp_page, 0); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); goto err_xdp; } } -- cgit v1.2.3 From 186b3c998c50fc241b51b905081c748455d16b4a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:43 +0800 Subject: virtio-net: support XDP_REDIRECT This patch tries to add XDP_REDIRECT for virtio-net. The changes are not complex as we could use exist XDP_TX helpers for most of the work. The rest is passing the XDP_TX to NAPI handler for implementing batching. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net. | 0 drivers/net/virtio_net.c | 77 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 drivers/net/virtio_net. (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net. new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index db5924c085aa..f6c1f135a024 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static int napi_weight = NAPI_POLL_WEIGHT; @@ -372,8 +373,20 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } -static bool virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_buff *xdp) +static void virtnet_xdp_flush(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct send_queue *sq; + unsigned int qp; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; + + virtqueue_kick(sq->vq); +} + +static bool __virtnet_xdp_xmit(struct virtnet_info *vi, + struct xdp_buff *xdp) { struct virtio_net_hdr_mrg_rxbuf *hdr; unsigned int len; @@ -407,10 +420,19 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi, return false; } - virtqueue_kick(sq->vq); return true; } +static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +{ + struct virtnet_info *vi = netdev_priv(dev); + bool sent = __virtnet_xdp_xmit(vi, xdp); + + if (!sent) + return -ENOSPC; + return 0; +} + static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { return vi->xdp_queue_pairs ? 
VIRTIO_XDP_HEADROOM : 0; @@ -483,7 +505,8 @@ static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, - unsigned int len) + unsigned int len, + bool *xdp_xmit) { struct sk_buff *skb; struct bpf_prog *xdp_prog; @@ -493,7 +516,7 @@ static struct sk_buff *receive_small(struct net_device *dev, unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct page *page = virt_to_head_page(buf); - unsigned int delta = 0; + unsigned int delta = 0, err; struct page *xdp_page; len -= vi->hdr_len; @@ -541,8 +564,16 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) + if (unlikely(!__virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); + else + *xdp_xmit = true; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + err = xdp_do_redirect(dev, &xdp, xdp_prog); + if (!err) + *xdp_xmit = true; rcu_read_unlock(); goto xdp_xmit; default: @@ -603,7 +634,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct receive_queue *rq, void *buf, void *ctx, - unsigned int len) + unsigned int len, + bool *xdp_xmit) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); @@ -613,6 +645,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct bpf_prog *xdp_prog; unsigned int truesize; unsigned int headroom = mergeable_ctx_to_headroom(ctx); + int err; head_skb = NULL; @@ -678,12 +711,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) + if (unlikely(!__virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); + else + *xdp_xmit = true; if (unlikely(xdp_page != page)) goto err_xdp; rcu_read_unlock(); goto xdp_xmit; + case XDP_REDIRECT: + err = xdp_do_redirect(dev, &xdp, xdp_prog); + if (err) + *xdp_xmit = true; + rcu_read_unlock(); + goto xdp_xmit; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: @@ -788,7 +829,7 @@ xdp_xmit: } static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, - void *buf, unsigned int len, void **ctx) + void *buf, unsigned int len, void **ctx, bool *xdp_xmit) { struct net_device *dev = vi->dev; struct sk_buff *skb; @@ -809,11 +850,11 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, } if (vi->mergeable_rx_bufs) - skb = receive_mergeable(dev, vi, rq, buf, ctx, len); + skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit); else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len); else - skb = receive_small(dev, vi, rq, buf, ctx, len); + skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit); if (unlikely(!skb)) return 0; @@ -1071,7 +1112,7 @@ static void refill_work(struct work_struct *work) } } -static int virtnet_receive(struct receive_queue *rq, int budget) +static int virtnet_receive(struct receive_queue *rq, int budget, bool *xdp_xmit) { struct virtnet_info *vi = rq->vq->vdev->priv; unsigned int len, received = 0, bytes = 0; @@ -1083,13 +1124,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget) while (received < budget && (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) { - bytes += receive_buf(vi, rq, buf, len, ctx); + bytes += receive_buf(vi, rq, buf, len, ctx, xdp_xmit); received++; } } else { while (received < budget && (buf = 
virtqueue_get_buf(rq->vq, &len)) != NULL) { - bytes += receive_buf(vi, rq, buf, len, NULL); + bytes += receive_buf(vi, rq, buf, len, NULL, xdp_xmit); received++; } } @@ -1161,15 +1202,19 @@ static int virtnet_poll(struct napi_struct *napi, int budget) struct receive_queue *rq = container_of(napi, struct receive_queue, napi); unsigned int received; + bool xdp_xmit = false; virtnet_poll_cleantx(rq); - received = virtnet_receive(rq, budget); + received = virtnet_receive(rq, budget, &xdp_xmit); /* Out of packets? */ if (received < budget) virtqueue_napi_complete(napi, rq->vq, received); + if (xdp_xmit) + xdp_do_flush_map(); + return received; } @@ -2069,6 +2114,8 @@ static const struct net_device_ops virtnet_netdev = { .ndo_poll_controller = virtnet_netpoll, #endif .ndo_xdp = virtnet_xdp, + .ndo_xdp_xmit = virtnet_xdp_xmit, + .ndo_xdp_flush = virtnet_xdp_flush, .ndo_features_check = passthru_features_check, }; -- cgit v1.2.3 From dd5437974964c759570d68e50ce13c313808f79a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 22 Sep 2017 14:38:58 +0800 Subject: virtio-net: correctly set xdp_xmit for mergeable buffer We should set xdp_xmit only when xdp_do_redirect() succeed. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f6c1f135a024..dd14a4547932 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -721,7 +721,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto xdp_xmit; case XDP_REDIRECT: err = xdp_do_redirect(dev, &xdp, xdp_prog); - if (err) + if (!err) *xdp_xmit = true; rcu_read_unlock(); goto xdp_xmit; -- cgit v1.2.3 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). 
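As a usage illustration (not part of this patch; SEC() and the helper declaration are assumed to come from the usual BPF sample headers), an XDP program reserving four bytes of metadata might look like:

	SEC("xdp")
	int xdp_set_meta(struct xdp_md *ctx)
	{
		void *data, *data_meta;
		__u32 *meta;

		/* grow the metadata area by 4 bytes in front of xdp->data */
		if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
			return XDP_PASS;

		data      = (void *)(long)ctx->data;
		data_meta = (void *)(long)ctx->data_meta;
		meta      = data_meta;
		if ((void *)(meta + 1) > data)	/* bounds check the verifier demands */
			return XDP_PASS;

		*meta = 0xcafe;	/* e.g. copied into skb->mark by a clsact program */
		return XDP_PASS;
	}
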
The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 + drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + .../net/ethernet/netronome/nfp/nfp_net_common.c | 1 + drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 + drivers/net/tun.c | 1 + drivers/net/virtio_net.c | 2 + include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++- include/linux/skbuff.h | 68 +++++++++++- include/uapi/linux/bpf.h | 13 ++- kernel/bpf/verifier.c | 114 ++++++++++++++++----- net/bpf/test_run.c | 1 + net/core/dev.c | 31 +++++- net/core/filter.c | 77 +++++++++++++- net/core/skbuff.c | 2 + 19 files changed, 297 insertions(+), 42 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index d8f0c837b72c..06ce63c00821 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp.data_hard_start = *data_ptr - offset; xdp.data = *data_ptr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; orig_data = xdp.data; mapping = rx_buf->mapping - bp->rx_dma_offset; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 49b80da51ba7..d68478afccbf 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 1519dfb851d0..f426762bd83a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d962368d08d0..04bb03bda1cd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..8f9cb8abc497 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp.data_hard_start = va - frags[0].page_offset; xdp.data = va; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + length; orig_data = xdp.data; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..30b3f3fbd719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, return false; xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.data_hard_start = va; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1c0187f0af51..e3a38be3600a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, xdp.data_hard_start = hard_start; xdp.data = data + *off; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = data + *off + *len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 6fc854b120b0..48ec4c56cddf 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; /* Queues always have a full reset currently, so for the time diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c36f6ebad79..a6e0bffe3d29 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index dd14a4547932..fc059f193e7d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); act = 
bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859e79e7..2b672c50f160 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -137,6 +137,7 @@ enum bpf_reg_type { PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 052bab3d62e7..911d454af107 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -487,12 +487,14 @@ struct sk_filter { struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; + void *data_meta; void *data_end; }; struct xdp_buff { void *data; void *data_end; + void *data_meta; void *data_hard_start; }; @@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); - cb->data_end = skb->data + skb_headlen(skb); + cb->data_meta = skb->data - skb_metadata_len(skb); + cb->data_end = skb->data + skb_headlen(skb); } static inline u8 *bpf_skb_cb(struct sk_buff *skb) @@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + void bpf_warn_invalid_xdp_action(u32 act); -void bpf_warn_invalid_xdp_redirect(u32 ifindex); struct sock *do_sk_redirect_map(void); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9db5539a6fb..19e64bfb1a66 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - unsigned short _unused; - unsigned char nr_frags; + __u8 __unused; + __u8 meta_len; + __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ @@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } +static inline u8 skb_metadata_len(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->meta_len; +} + +static inline void *skb_metadata_end(const struct sk_buff *skb) +{ + return skb_mac_header(skb); +} + +static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b, + u8 meta_len) +{ + const void *a = skb_metadata_end(skb_a); + const void *b = skb_metadata_end(skb_b); + /* Using more efficient variant than plain call to memcmp().
*/ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? + true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..e43491ac4823 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) 
va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index df672517b4fd..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, 
const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..e350c768d4b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3864,8 +3864,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index c468e7cfad19..9b6e7e84aafd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 
0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, 
data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..d98c2e3ce2bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). -- cgit v1.2.3
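To make the facility above concrete, here is a minimal sketch of an XDP program that uses the bpf_xdp_adjust_meta() helper introduced by this commit. It is not part of the commit; the program name, the SEC() definition, the helper declaration style and the 4-byte mark payload are illustrative assumptions in the spirit of the kernel's BPF selftests.

/* Sketch only: reserve 4 bytes of metadata in front of xdp->data and
 * store a mark there for a later consumer, e.g. a tc classifier that
 * reads __sk_buff->data_meta.
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
	(void *) BPF_FUNC_xdp_adjust_meta;

SEC("xdp")
int xdp_mark(struct xdp_md *ctx)
{
	void *data, *data_meta;
	__u32 *mark;

	/* A negative delta grows the meta area towards data_hard_start.
	 * Per the helper above, this fails with -ENOTSUPP on drivers that
	 * called xdp_set_data_meta_invalid(), and with -EACCES when the
	 * resulting length is not a multiple of 4 or exceeds 32 bytes.
	 */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*mark)) < 0)
		return XDP_PASS;

	data      = (void *)(unsigned long)ctx->data;
	data_meta = (void *)(unsigned long)ctx->data_meta;
	mark      = data_meta;

	/* The verifier only permits the store after a bounds check
	 * against the unmodified xdp->data pointer from ctx access.
	 */
	if ((void *)(mark + 1) > data)
		return XDP_PASS;

	*mark = 42;
	return XDP_PASS;
}

On XDP_PASS through the generic XDP hook, netif_receive_generic_xdp() above copies xdp.data - xdp.data_meta into skb_shinfo(skb)->meta_len, so the same region stays visible at the skb layer.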
From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +-- .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 +-- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- drivers/net/ethernet/qlogic/qede/qede_filter.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 +-- drivers/net/tun.c | 4 +-- drivers/net/virtio_net.c | 4 +-- include/linux/netdevice.h | 23 ++++++++------- net/core/dev.c | 34 +++++++++++----------- net/core/rtnetlink.c | 4 +-- 17 files changed, 56 insertions(+), 55 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4e3d569bf32e..96416f5d97f3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7775,7 +7775,7 @@ static const struct net_device_ops bnxt_netdev_ops = { #endif .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, - .ndo_xdp = bnxt_xdp, + .ndo_bpf = bnxt_xdp, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, .ndo_get_phys_port_name = bnxt_get_phys_port_name diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 06ce63c00821..261e5847557a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -208,7 +208,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) return 0; } -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct bnxt *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 12a5ad66b564..414b748038ca 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -16,6 +16,6 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, struct page *page, u8 **data_ptr, unsigned int *len, u8 *event); -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); #endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 71989e180289..a063c36c4c58 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1741,7 +1741,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) return 0; } -static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nicvf *nic = netdev_priv(netdev); @@ -1774,7 +1774,7 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_tx_timeout = nicvf_tx_timeout, .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, - .ndo_xdp = nicvf_xdp, + .ndo_bpf = nicvf_xdp, }; static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git
a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dfecaeda0654..05b94d87a6c3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11648,12 +11648,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, } /** - * i40e_xdp - implements ndo_xdp for i40e + * i40e_xdp - implements ndo_bpf for i40e * @dev: netdevice * @xdp: XDP command **/ static int i40e_xdp(struct net_device *dev, - struct netdev_xdp *xdp) + struct netdev_bpf *xdp) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; @@ -11705,7 +11705,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_features_check = i40e_features_check, .ndo_bridge_getlink = i40e_ndo_bridge_getlink, .ndo_bridge_setlink = i40e_ndo_bridge_setlink, - .ndo_xdp = i40e_xdp, + .ndo_bpf = i40e_xdp, }; /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 507977994a03..e5dcb25be398 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10004,7 +10004,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return 0; } -static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -10113,7 +10113,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port, .ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port, .ndo_features_check = ixgbe_features_check, - .ndo_xdp = ixgbe_xdp, + .ndo_bpf = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, .ndo_xdp_flush = ixgbe_xdp_flush, }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d611df2f274d..736a6ccaf05e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2916,7 +2916,7 @@ static u32 mlx4_xdp_query(struct net_device *dev) return prog_id; } -static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2958,7 +2958,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -2995,7 +2995,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 28ae00b3eb88..3b7b7bb84eb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3831,7 +3831,7 @@ static u32 mlx5e_xdp_query(struct net_device *dev) return prog_id; } -static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -3883,7 +3883,7 @@ static const struct net_device_ops mlx5e_netdev_ops = { 
.ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif .ndo_tx_timeout = mlx5e_tx_timeout, - .ndo_xdp = mlx5e_xdp, + .ndo_bpf = mlx5e_xdp, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx5e_netpoll, #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 185a3dd35a3f..f6c6ad4e8a59 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3378,7 +3378,7 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return 0; } -static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nfp_net *nn = netdev_priv(netdev); @@ -3441,7 +3441,7 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_phys_port_name = nfp_port_get_phys_port_name, .ndo_udp_tunnel_add = nfp_net_add_vxlan_port, .ndo_udp_tunnel_del = nfp_net_del_vxlan_port, - .ndo_xdp = nfp_net_xdp, + .ndo_bpf = nfp_net_xdp, }; /** diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index adb700512baa..a3a70ade411f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -503,7 +503,7 @@ void qede_fill_rss_params(struct qede_dev *edev, void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti); void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti); -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp); #ifdef CONFIG_DCB void qede_set_dcbnl_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index f79e36e4060a..c1a0708a7d7c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1065,7 +1065,7 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) return 0; } -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct qede_dev *edev = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index e5ee9f274a71..8f9b3eb82137 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -556,7 +556,7 @@ static const struct net_device_ops qede_netdev_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif @@ -594,7 +594,7 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, }; /* ------------------------------------------------------------------------- diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8125956f62a1..1a326b697221 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1141,7 +1141,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ 
-1185,7 +1185,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_xdp = tun_xdp, + .ndo_bpf = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fc059f193e7d..edf984406ba0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2088,7 +2088,7 @@ static u32 virtnet_xdp_query(struct net_device *dev) return 0; } -static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2115,7 +2115,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif - .ndo_xdp = virtnet_xdp, + .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_xdp_flush = virtnet_xdp_flush, .ndo_features_check = passthru_features_check, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7de7656550c2..9af9feaaeb64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,10 +779,10 @@ enum tc_setup_type { TC_SETUP_CBS, }; -/* These structures hold the attributes of xdp state that are being passed - * to the netdevice through the xdp op. +/* These structures hold the attributes of bpf state that are being passed + * to the netdevice through the bpf op. */ -enum xdp_netdev_command { +enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -801,8 +801,8 @@ enum xdp_netdev_command { struct netlink_ext_ack; -struct netdev_xdp { - enum xdp_netdev_command command; +struct netdev_bpf { + enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -1124,9 +1124,10 @@ struct dev_ifalias { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the - * netdevice. See definition of enum xdp_netdev_command for details. + * netdevice and manage BPF offload. See definition of + * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
@@ -1315,8 +1316,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_xdp)(struct net_device *dev, - struct netdev_xdp *xdp); + int (*ndo_bpf)(struct net_device *dev, + struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -3311,10 +3312,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1423cf4d695c..10cde58d3275 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,7 +4545,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -7090,26 +7090,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(xdp_op(dev, &xdp) < 0); + WARN_ON(bpf_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7120,7 +7120,7 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, xdp.flags = flags; xdp.prog = prog; - return xdp_op(dev, &xdp); + return bpf_op(dev, &xdp); } /** @@ -7137,24 +7137,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - xdp_op_t xdp_op, xdp_chk; + bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); - xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + bpf_op = bpf_chk = ops->ndo_bpf; + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) - xdp_op = generic_xdp_install; - if (xdp_op == xdp_chk) - xdp_chk = generic_xdp_install; + if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) + bpf_op = generic_xdp_install; + if (bpf_op == bpf_chk) + bpf_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op, NULL)) + __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7162,7 +7162,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, flags, prog); + err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8a8c51937edf..dc5ad84ac096 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1270,10 +1270,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_xdp) + if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); + return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3
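As a usage illustration of the renamed hook, a driver implements .ndo_bpf and dispatches on the command field of struct netdev_bpf. The sketch below is not taken from any driver in this series; the foo_* names are hypothetical stand-ins for driver internals:

#include <linux/netdevice.h>

/* Hypothetical driver internals, stubbed out for the sketch. */
static int foo_xdp_setup(struct net_device *dev, struct bpf_prog *prog,
			 struct netlink_ext_ack *extack)
{
	return 0;	/* swap prog into the driver's rx path here */
}

static u32 foo_xdp_prog_id(struct net_device *dev)
{
	return 0;	/* id of the attached program, 0 when none */
}

static int foo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case XDP_SETUP_PROG:
		return foo_xdp_setup(dev, bpf->prog, bpf->extack);
	case XDP_QUERY_PROG:
		bpf->prog_id = foo_xdp_prog_id(dev);
		bpf->prog_attached = !!bpf->prog_id;
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_bpf	= foo_bpf,
};

The core then reaches the driver through dev_xdp_install() above, regardless of whether the command is an XDP setup or, later, another form of BPF offload.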
From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Judging from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter, which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account: THP has little or no chance of getting a cache-hot page, as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no significant difference, which is not surprising given that the __GFP_COLD branches are a minuscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 2 +- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 3 +-- .../net/ethernet/cavium/liquidio/octeon_network.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 5 ++--- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 3 +-- drivers/net/ethernet/sfc/falcon/rx.c | 2 +- drivers/net/ethernet/sfc/rx.c | 2 +- drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/virtio_net.c | 1 - drivers/staging/lustre/lustre/mdc/mdc_request.c | 2 +- fs/cachefiles/rdwr.c | 6 ++---- include/linux/gfp.h | 5 ----- include/linux/pagemap.h | 8 +------- include/linux/skbuff.h | 2 +- include/linux/slab.h | 3 --- include/trace/events/mmflags.h | 1 - kernel/power/snapshot.c | 4 ++-- mm/filemap.c | 6 +++--- mm/page_alloc.c | 20 ++++++-------------- mm/percpu-vm.c | 2 +- net/core/skbuff.c | 4 ++-- tools/perf/builtin-kmem.c | 1 - 25 files changed, 32 insertions(+), 62 deletions(-) (limited to 'drivers/net/virtio_net.c') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index c6bd5e24005d..fbbbd8b3eb45 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_page(rx_ring, rx_info, - __GFP_COLD | GFP_ATOMIC | __GFP_COMP); + GFP_ATOMIC | __GFP_COMP); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "failed to alloc buffer for rx queue %d\n", diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 45d92304068e..cc1e4f820e64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -295,7 +295,7 @@ again: order = alloc_order; /* Try to obtain pages, decreasing order if necessary */ - gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages_node(node, gfp, order); if (pages) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 0654e0c76bc2..519ca6534b85 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self) buff->flags = 0U; buff->len = AQ_CFG_RX_FRAME_MAX; - buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | - __GFP_COMP, pages_order); + buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order); if (!buff->page) { err = -ENOMEM; goto err_exit; } diff
--git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index 9e36319cead6..57853eead4b5 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -195,7 +195,7 @@ static inline void struct sk_buff *skb; struct octeon_skb_page_info *skb_pg_info; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); if (unlikely(!page)) return NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..ffead38cf5da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) if (mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size, - GFP_KERNEL | __GFP_COLD)) { + GFP_KERNEL)) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate enough rx buffers\n"); return -ENOMEM; @@ -552,8 +552,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, do { if (mlx4_en_prepare_rx_desc(priv, ring, ring->prod & ring->size_mask, - GFP_ATOMIC | __GFP_COLD | - __GFP_MEMALLOC)) + GFP_ATOMIC | __GFP_MEMALLOC)) break; ring->prod++; } while (likely(--missing)); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index e118b5f23996..ffb12dc13a5a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_KERNEL | __GFP_COLD); + page = alloc_page(GFP_KERNEL); frag = page ? page_address(page) : NULL; } if (!frag) { @@ -1212,7 +1212,7 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); frag = page ? page_address(page) : NULL; } if (!frag) { diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 29fea74bff2e..7b97a9969046 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, { if (!rx_ring->pg_chunk.page) { u64 map; - rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | - GFP_ATOMIC, + rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, qdev->lbq_buf_order); if (unlikely(!rx_ring->pg_chunk.page)) { netif_err(qdev, drv, qdev->ndev, diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c index 6a8406dc0c2b..91097aea6c41 100644 --- a/drivers/net/ethernet/sfc/falcon/rx.c +++ b/drivers/net/ethernet/sfc/falcon/rx.c @@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic) do { page = ef4_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? 
 					    GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 42443f434569..0004c50d3c83 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
 	do {
 		page = efx_reuse_page(rx_queue);
 		if (page == NULL) {
-			page = alloc_pages(__GFP_COLD | __GFP_COMP |
+			page = alloc_pages(__GFP_COMP |
 					   (atomic ?
 					    GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
index e9672b1f9968..031cf9c3435a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
@@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata,
 	dma_addr_t pages_dma;
 
 	/* Try to obtain pages, decreasing order if necessary */
-	gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN;
+	gfp |= __GFP_COMP | __GFP_NOWARN;
 	while (order >= 0) {
 		pages = alloc_pages(gfp, order);
 		if (pages)
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 437d36289786..50d2b76771b5 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
 		sw_data[0] = (u32)bufptr;
 	} else {
 		/* Allocate a secondary receive queue entry */
-		page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD);
+		page = alloc_page(GFP_ATOMIC | GFP_DMA);
 		if (unlikely(!page)) {
 			dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n");
 			goto fail;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 511f8339fa96..5eec09d63fc0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -988,7 +988,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 	int err;
 	bool oom;
 
-	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(vi, rq, gfp);
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 9e538a59f09d..03e55bca4ada 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0)
 	}
 
 	for (npages = 1; npages < max_pages; npages++) {
-		page = page_cache_alloc_cold(inode->i_mapping);
+		page = page_cache_alloc(inode->i_mapping);
 		if (!page)
 			break;
 		page_pool[npages] = page;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 23097cca2674..883bc7bb12c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = __page_cache_alloc(cachefiles_gfp |
-						     __GFP_COLD);
+			newpage = __page_cache_alloc(cachefiles_gfp);
 			if (!newpage)
 				goto nomem_monitor;
 		}
@@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = __page_cache_alloc(cachefiles_gfp |
-						     __GFP_COLD);
+			newpage = __page_cache_alloc(cachefiles_gfp);
 			if (!newpage)
 				goto nomem;
 		}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f7e62d9096fe..1a4582b44d32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,7 +24,6 @@ struct vm_area_struct;
 #define ___GFP_HIGH		0x20u
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
-#define ___GFP_COLD		0x100u
 #define ___GFP_NOWARN		0x200u
 #define ___GFP_RETRY_MAYFAIL	0x400u
 #define ___GFP_NOFAIL		0x800u
@@ -192,16 +191,12 @@ struct vm_area_struct;
 /*
  * Action modifiers
  *
- * __GFP_COLD indicates that the caller does not expect to be used in the near
- * future. Where possible, a cache-cold page will be returned.
- *
  * __GFP_NOWARN suppresses allocation failure reports.
  *
  * __GFP_COMP address compound page metadata.
  *
  * __GFP_ZERO returns a zeroed page on success.
  */
-#define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4c6790bb7afb..34ce3ebf97d5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x)
 	return __page_cache_alloc(mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
-}
-
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
 {
-	return mapping_gfp_mask(x) |
-				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
+	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
 }
 
 typedef int filler_t(void *, struct page *);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aa1341474916..7c46fd0b8b64 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 	 * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
 	 *     code in gfp_to_alloc_flags that should be enforcing this.
 	 */
-	gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC;
+	gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
 
 	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
 }
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 79f6532f8a0b..50697a1d6621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * Also it is possible to set different flags by OR'ing
  * in one or more of the following additional @flags:
  *
- * %__GFP_COLD - Request cache-cold pages instead of
- *   trying to return cache-warm pages.
- *
  * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
  *
  * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 72162f3a03fa..dbe1bb058c09 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -32,7 +32,6 @@
 	{(unsigned long)__GFP_ATOMIC,		"__GFP_ATOMIC"},	\
 	{(unsigned long)__GFP_IO,		"__GFP_IO"},		\
 	{(unsigned long)__GFP_FS,		"__GFP_FS"},		\
-	{(unsigned long)__GFP_COLD,		"__GFP_COLD"},		\
 	{(unsigned long)__GFP_NOWARN,		"__GFP_NOWARN"},	\
 	{(unsigned long)__GFP_RETRY_MAYFAIL,	"__GFP_RETRY_MAYFAIL"},	\
 	{(unsigned long)__GFP_NOFAIL,		"__GFP_NOFAIL"},	\
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a917a301e201..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
  */
 static inline int get_highmem_buffer(int safe_needed)
 {
-	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	buffer = get_image_page(GFP_ATOMIC, safe_needed);
 	return buffer ? 0 : -ENOMEM;
 }
@@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
 	while (nr_pages-- > 0) {
 		struct page *page;
 
-		page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+		page = alloc_image_page(GFP_ATOMIC);
 		if (!page)
 			goto err_out;
 		memory_bm_set_bit(copy_bm, page_to_pfn(page));
diff --git a/mm/filemap.c b/mm/filemap.c
index 90a9f261f85f..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,7 +2272,7 @@ no_cached_page:
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
 		 */
-		page = page_cache_alloc_cold(mapping);
+		page = page_cache_alloc(mapping);
 		if (!page) {
 			error = -ENOMEM;
 			goto out;
@@ -2384,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 	int ret;
 
 	do {
-		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return -ENOMEM;
@@ -2788,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
 repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
-		page = __page_cache_alloc(gfp | __GFP_COLD);
+		page = __page_cache_alloc(gfp);
 		if (!page)
 			return ERR_PTR(-ENOMEM);
 		err = add_to_page_cache_lru(page, mapping, index, gfp);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f265d37b3152..370b64d03e3f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2336,7 +2336,7 @@ retry:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype, bool cold)
+			int migratetype)
 {
 	int i, alloced = 0;
 
@@ -2358,10 +2358,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		if (likely(!cold))
-			list_add(&page->lru, list);
-		else
-			list_add_tail(&page->lru, list);
+		list_add(&page->lru, list);
 		list = &page->lru;
 		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
@@ -2795,7 +2792,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
-			bool cold, struct per_cpu_pages *pcp,
+			struct per_cpu_pages *pcp,
 			struct list_head *list)
 {
 	struct page *page;
@@ -2804,16 +2801,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 	do {
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
-					migratetype, cold);
+					migratetype);
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
 
-		if (cold)
-			page = list_last_entry(list, struct page, lru);
-		else
-			page = list_first_entry(list, struct page, lru);
-
+		page = list_first_entry(list, struct page, lru);
 		list_del(&page->lru);
 		pcp->count--;
 	} while (check_new_pcp(page));
@@ -2828,14 +2821,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+	page = __rmqueue_pcplist(zone, migratetype, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 			    struct page **pages, int page_start, int page_end)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
 	unsigned int cpu, tcpu;
 	int i;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6cd057b41f34..9c68555bb906 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  */
 void *netdev_alloc_frag(unsigned int fragsz)
 {
-	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
@@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 
 void *napi_alloc_frag(unsigned int fragsz)
 {
-	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(napi_alloc_frag);
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index cbf70738ef5f..ae11e4c3516a 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -641,7 +641,6 @@ static const struct {
 	{ "__GFP_ATOMIC",		"_A" },
 	{ "__GFP_IO",			"I" },
 	{ "__GFP_FS",			"F" },
-	{ "__GFP_COLD",			"CO" },
 	{ "__GFP_NOWARN",		"NWR" },
 	{ "__GFP_RETRY_MAYFAIL",	"R" },
 	{ "__GFP_NOFAIL",		"NF" },
--
cgit v1.2.3
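
Taken together, the hunks above are a mechanical conversion: every caller simply drops __GFP_COLD from its gfp mask, and __rmqueue_pcplist() now unconditionally takes the head (most recently freed, cache-hot) entry of the per-cpu free list instead of choosing head or tail based on the flag. A minimal caller-side sketch of the post-series pattern follows; my_alloc_rx_page() is a hypothetical helper invented for illustration and is not part of any patch in this series.

/*
 * Illustrative sketch only -- not from the series above.
 * my_alloc_rx_page() is a made-up helper showing how a driver-style
 * page allocation looks once __GFP_COLD no longer exists.
 */
#include <linux/gfp.h>

static struct page *my_alloc_rx_page(bool atomic, unsigned int order)
{
	/* Pre-series equivalent: gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; */
	gfp_t gfp = __GFP_COMP | __GFP_NOWARN |
		    (atomic ? GFP_ATOMIC : GFP_KERNEL);

	/*
	 * With the flag gone there is no hot/cold hint to pass: the
	 * allocator always returns list_first_entry() of the pcp list,
	 * so every page is treated as cache-hot.
	 */
	return alloc_pages(gfp, order);
}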