From 95c7e0e4d4792f9c9629ad759c9be047a3ddbcc5 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Wed, 9 Jan 2013 07:19:37 +0000 Subject: ipv4: Use FIELD_SIZEOF() in inet_init(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24b384b7903e..4fdf96710d45 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1705,12 +1705,11 @@ static struct packet_type ip_packet_type __read_mostly = { static int __init inet_init(void) { - struct sk_buff *dummy_skb; struct inet_protosw *q; struct list_head *r; int rc = -EINVAL; - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!sysctl_local_reserved_ports) -- cgit v1.2.3 From 50c3a487d507568359ccbce1b15bcdfc01edf7ef Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Tue, 22 Jan 2013 06:32:49 +0000 Subject: ipv4: Use IS_ERR_OR_NULL(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/udp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4fdf96710d45..4b7053919976 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1333,7 +1333,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ops->callbacks.gso_segment(skb, features); rcu_read_unlock(); - if (!segs || IS_ERR(segs)) + if (IS_ERR_OR_NULL(segs)) goto out; skb = segs; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2aa69c8ae60c..52271947a471 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3243,7 +3243,7 @@ __tcp_alloc_md5sig_pool(struct sock *sk) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (!hash || IS_ERR(hash)) + if (IS_ERR_OR_NULL(hash)) goto out_free; per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 79c8dbe59b54..cf6158f1f46b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -971,7 +971,7 @@ back_from_confirm: sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); - if (skb && !IS_ERR(skb)) + if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4); goto out; } -- cgit v1.2.3 From cef401de7be8c4e155c6746bfccf721a4fa5fab9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Jan 2013 20:34:37 +0000 Subject: net: fix possible wrong checksum generation Pravin Shelar mentioned that GSO could potentially generate wrong TX checksum if skb has fragments that are overwritten by the user between the checksum computation and transmit. He suggested to linearize skbs but this extra copy can be avoided for normal tcp skbs cooked by tcp_sendmsg(). This patch introduces a new SKB_GSO_SHARED_FRAG flag, set in skb_shinfo(skb)->gso_type if at least one frag can be modified by the user. Typical sources of such possible overwrites are {vm}splice(), sendfile(), and macvtap/tun/virtio_net drivers. Tested: $ netperf -H 7.7.8.84 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3959.52 $ netperf -H 7.7.8.84 -t TCP_SENDFILE TCP SENDFILE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3216.80 Performance of the SENDFILE is impacted by the extra allocation and copy, and because we use order-0 pages, while the TCP_STREAM uses bigger pages. Reported-by: Pravin Shelar Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 3 ++- drivers/net/tun.c | 12 ++++++++---- drivers/net/virtio_net.c | 12 ++++++++---- include/linux/skbuff.h | 19 +++++++++++++++++++ net/core/dev.c | 9 +++++++++ net/core/skbuff.c | 4 ++++ net/ipv4/af_inet.c | 1 + net/ipv4/ip_gre.c | 4 +++- net/ipv4/ipip.c | 4 +++- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 4 ++-- net/ipv6/ip6_offload.c | 1 + 13 files changed, 65 insertions(+), 15 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0f0f9ce3a776..b181dfb3d6d6 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -543,6 +543,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, skb->data_len += len; skb->len += len; skb->truesize += truesize; + skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; @@ -598,7 +599,7 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb, if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { skb_shinfo(skb)->gso_size = vnet_hdr->gso_size; - skb_shinfo(skb)->gso_type = gso_type; + skb_shinfo(skb)->gso_type |= gso_type; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c81680dc10eb..293ce8dfc9e6 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1005,6 +1005,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, skb->data_len += len; skb->len += len; skb->truesize += truesize; + skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; @@ -1150,16 +1151,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + unsigned short gso_type = 0; + pr_debug("GSO!\n"); switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_TCPV6: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + gso_type = SKB_GSO_UDP; break; default: tun->dev->stats.rx_frame_errors++; @@ -1168,9 +1171,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = gso.gso_size; + skb_shinfo(skb)->gso_type |= gso_type; if (skb_shinfo(skb)->gso_size == 0) { tun->dev->stats.rx_frame_errors++; kfree_skb(skb); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 701408a1ded6..58914c8ea68f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -220,6 +220,7 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page, skb->len += size; skb->truesize += PAGE_SIZE; skb_shinfo(skb)->nr_frags++; + skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; *len -= size; } @@ -379,16 +380,18 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) ntohs(skb->protocol), skb->len, skb->pkt_type); if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + unsigned short gso_type = 0; + pr_debug("GSO!\n"); switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_UDP: - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + gso_type = SKB_GSO_UDP; break; case VIRTIO_NET_HDR_GSO_TCPV6: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + gso_type = SKB_GSO_TCPV6; break; default: net_warn_ratelimited("%s: bad gso type %u.\n", @@ -397,7 +400,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) } if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = hdr->hdr.gso_size; if (skb_shinfo(skb)->gso_size == 0) { @@ -405,6 +408,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) goto frame_err; } + skb_shinfo(skb)->gso_type |= gso_type; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8b2256e880e0..0259b719bebf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -307,6 +307,13 @@ enum { SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, + + /* This indicates at least one fragment might be overwritten + * (as in vmsplice(), sendfile() ...) + * If we need to compute a TX checksum, we'll need to copy + * all frags to avoid possible bad checksum + */ + SKB_GSO_SHARED_FRAG = 1 << 6, }; #if BITS_PER_LONG > 32 @@ -2200,6 +2207,18 @@ static inline int skb_linearize(struct sk_buff *skb) return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; } +/** + * skb_has_shared_frag - can any frag be overwritten + * @skb: buffer to test + * + * Return true if the skb has at least one frag that might be modified + * by an external entity (as in vmsplice()/sendfile()) + */ +static inline bool skb_has_shared_frag(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_type & SKB_GSO_SHARED_FRAG; +} + /** * skb_linearize_cow - make sure skb is linear and writable * @skb: buffer to process diff --git a/net/core/dev.c b/net/core/dev.c index c69cd8721b28..a83375d3af72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2271,6 +2271,15 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (skb_has_shared_frag(skb)) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2568c449fe36..bddc1dd2e7f2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2340,6 +2340,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + skb_shinfo(skb1)->gso_type = skb_shinfo(skb)->gso_type; + if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2845,6 +2847,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_copy_from_linear_data_offset(skb, offset, skb_put(nskb, hsize), hsize); + skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type; + while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; __skb_frag_ref(frag); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4b7053919976..49ddca31c4da 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1306,6 +1306,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_SHARED_FRAG | 0))) goto out; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 303012adf9e6..af6be70821c4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -738,7 +738,7 @@ drop: static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph; const struct iphdr *tiph; struct flowi4 fl4; u8 tos; @@ -756,6 +756,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev skb_checksum_help(skb)) goto tx_error; + old_iph = ip_hdr(skb); + if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 191fc24a745a..8f024d41eefa 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -472,7 +472,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; @@ -486,6 +486,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_checksum_help(skb)) goto tx_error; + old_iph = ip_hdr(skb); + if (tos & 1) tos = old_iph->tos; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 52271947a471..3ec1f69c5ceb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -896,6 +896,8 @@ new_segment: skb_fill_page_desc(skb, i, page, offset, copy); } + skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; + skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -3032,6 +3034,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | + SKB_GSO_SHARED_FRAG | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0905997e5873..492c7cfe1453 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1240,13 +1240,13 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ if (!skb_shinfo(prev)->gso_size) { skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; + skb_shinfo(prev)->gso_type |= sk->sk_gso_type; } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (skb_shinfo(skb)->gso_segs <= 1) { skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 667a6adfccf8..367e2ec01da1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1133,6 +1133,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { + skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; if (skb->len <= mss_now || !sk_can_gso(sk) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal @@ -1140,11 +1141,10 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ skb_shinfo(skb)->gso_segs = 1; skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; } else { skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = sk->sk_gso_type; + skb_shinfo(skb)->gso_type |= sk->sk_gso_type; } } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index f26f0da7f095..d141fc32a2ea 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -100,6 +100,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | + SKB_GSO_SHARED_FRAG | 0))) goto out; -- cgit v1.2.3 From 547472b8e1da72ae226430c0c4273e36fc8ca768 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Feb 2013 14:42:23 -0500 Subject: ipv4: Disallow non-namespace aware protocols to register. All in-tree ipv4 protocol implementations are now namespace aware. Therefore all the run-time checks are superfluous. Reject registry of any non-namespace aware ipv4 protocol. Eventually we'll remove prot->netns_ok and this registry time check as well. Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 19 ------------------- net/ipv4/ip_input.c | 7 ------- net/ipv4/protocol.c | 6 ++++++ 3 files changed, 6 insertions(+), 26 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 49ddca31c4da..1aec92bf8018 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -263,21 +263,6 @@ void build_ehash_secret(void) } EXPORT_SYMBOL(build_ehash_secret); -static inline int inet_netns_ok(struct net *net, __u8 protocol) -{ - const struct net_protocol *ipprot; - - if (net_eq(net, &init_net)) - return 1; - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot == NULL) { - /* raw IP is OK */ - return 1; - } - return ipprot->netns_ok; -} - /* * Create an inet socket. */ @@ -350,10 +335,6 @@ lookup_protocol: !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; - err = -EAFNOSUPPORT; - if (!inet_netns_ok(net, protocol)) - goto out_rcu_unlock; - sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f1395a6fb35f..87abd3e2bd32 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -208,13 +208,6 @@ static int ip_local_deliver_finish(struct sk_buff *skb) if (ipprot != NULL) { int ret; - if (!net_eq(net, &init_net) && !ipprot->netns_ok) { - net_info_ratelimited("%s: proto %d isn't netns-ready\n", - __func__, protocol); - kfree_skb(skb); - goto out; - } - if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0f9d09f54bd9..ce848461acbb 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -37,6 +37,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { + if (!prot->netns_ok) { + pr_err("Protocol %u is not namespace aware, cannot register.\n", + protocol); + return -EINVAL; + } + return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } -- cgit v1.2.3 From c9af6db4c11ccc6c3e7f19bbc15d54023956f97c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 11 Feb 2013 09:27:41 +0000 Subject: net: Fix possible wrong checksum generation. Patch cef401de7be8c4e (net: fix possible wrong checksum generation) fixed wrong checksum calculation but it broke TSO by defining new GSO type but not a netdev feature for that type. net_gso_ok() would not allow hardware checksum/segmentation offload of such packets without the feature. Following patch fixes TSO and wrong checksum. This patch uses same logic that Eric Dumazet used. Patch introduces new flag SKBTX_SHARED_FRAG if at least one frag can be modified by the user. but SKBTX_SHARED_FRAG flag is kept in skb shared info tx_flags rather than gso_type. tx_flags is better compared to gso_type since we can have skb with shared frag without gso packet. It does not link SHARED_FRAG to GSO, So there is no need to define netdev feature for this. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 4 ++-- drivers/net/tun.c | 13 +++++-------- drivers/net/virtio_net.c | 13 +++++-------- include/linux/skbuff.h | 17 +++++++++-------- net/core/skbuff.c | 5 ++--- net/ipv4/af_inet.c | 1 - net/ipv4/ip_output.c | 1 + net/ipv4/tcp.c | 4 +--- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 4 ++-- net/ipv6/ip6_offload.c | 1 - 11 files changed, 29 insertions(+), 38 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index b181dfb3d6d6..97243011d319 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -543,7 +543,6 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, skb->data_len += len; skb->len += len; skb->truesize += truesize; - skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; @@ -599,7 +598,7 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb, if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { skb_shinfo(skb)->gso_size = vnet_hdr->gso_size; - skb_shinfo(skb)->gso_type |= gso_type; + skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; @@ -743,6 +742,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (zerocopy) { skb_shinfo(skb)->destructor_arg = m->msg_control; skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } if (vlan) macvlan_start_xmit(skb, vlan->dev); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b1038c0e2240..b6f45c5d84d5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1019,7 +1019,6 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, skb->data_len += len; skb->len += len; skb->truesize += truesize; - skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; @@ -1165,18 +1164,16 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - unsigned short gso_type = 0; - pr_debug("GSO!\n"); switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: - gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_TCPV6: - gso_type = SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; default: tun->dev->stats.rx_frame_errors++; @@ -1185,10 +1182,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) - gso_type |= SKB_GSO_TCP_ECN; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = gso.gso_size; - skb_shinfo(skb)->gso_type |= gso_type; if (skb_shinfo(skb)->gso_size == 0) { tun->dev->stats.rx_frame_errors++; kfree_skb(skb); @@ -1204,6 +1200,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } skb_reset_network_header(skb); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 381a2d8d8a81..192c91c8e799 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -227,7 +227,7 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page, skb->len += size; skb->truesize += PAGE_SIZE; skb_shinfo(skb)->nr_frags++; - skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; *len -= size; } @@ -387,18 +387,16 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) ntohs(skb->protocol), skb->len, skb->pkt_type); if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - unsigned short gso_type = 0; - pr_debug("GSO!\n"); switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: - gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; case VIRTIO_NET_HDR_GSO_TCPV6: - gso_type = SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; break; default: net_warn_ratelimited("%s: bad gso type %u.\n", @@ -407,7 +405,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) } if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) - gso_type |= SKB_GSO_TCP_ECN; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = hdr->hdr.gso_size; if (skb_shinfo(skb)->gso_size == 0) { @@ -415,7 +413,6 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) goto frame_err; } - skb_shinfo(skb)->gso_type |= gso_type; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d7573c37a51d..9da99520ccd5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -230,6 +230,13 @@ enum { /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, + + /* This indicates at least one fragment might be overwritten + * (as in vmsplice(), sendfile() ...) + * If we need to compute a TX checksum, we'll need to copy + * all frags to avoid possible bad checksum + */ + SKBTX_SHARED_FRAG = 1 << 5, }; /* @@ -307,13 +314,6 @@ enum { SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, - - /* This indicates at least one fragment might be overwritten - * (as in vmsplice(), sendfile() ...) - * If we need to compute a TX checksum, we'll need to copy - * all frags to avoid possible bad checksum - */ - SKB_GSO_SHARED_FRAG = 1 << 6, }; #if BITS_PER_LONG > 32 @@ -2220,7 +2220,8 @@ static inline int skb_linearize(struct sk_buff *skb) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_type & SKB_GSO_SHARED_FRAG; + return skb_is_nonlinear(skb) && + skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 21a22cce6e53..6c1ad09f8796 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2326,8 +2326,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); - skb_shinfo(skb1)->gso_type = skb_shinfo(skb)->gso_type; - + skb_shinfo(skb)->tx_flags = skb_shinfo(skb1)->tx_flags & SKBTX_SHARED_FRAG; if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2833,7 +2832,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_copy_from_linear_data_offset(skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type; + skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1aec92bf8018..e6e5d8506336 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1287,7 +1287,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | - SKB_GSO_SHARED_FRAG | 0))) goto out; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e98ed2bff55..5e12dca7b3dd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -598,6 +598,7 @@ slow_path: /* for offloaded checksums cleanup checksum before fragmentation */ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) goto fail; + iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 801b07b796f0..1f0bedb8622f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -897,8 +897,7 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } - - skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -3044,7 +3043,6 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | - SKB_GSO_SHARED_FRAG | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9bfaea34322..a759e19496d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1239,13 +1239,13 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ if (!skb_shinfo(prev)->gso_size) { skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type |= sk->sk_gso_type; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (skb_shinfo(skb)->gso_segs <= 1) { skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; + skb_shinfo(skb)->gso_type = 0; } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 564bf89d9fd3..6182d90e97b0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1133,7 +1133,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; if (skb->len <= mss_now || !sk_can_gso(sk) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal @@ -1141,10 +1140,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ skb_shinfo(skb)->gso_segs = 1; skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type |= sk->sk_gso_type; + skb_shinfo(skb)->gso_type = sk->sk_gso_type; } } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d141fc32a2ea..f26f0da7f095 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -100,7 +100,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | - SKB_GSO_SHARED_FRAG | 0))) goto out; -- cgit v1.2.3 From 68c331631143f5f039baac99a650e0b9e1ea02b6 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 14 Feb 2013 14:02:41 +0000 Subject: v4 GRE: Add TCP segmentation offload for GRE Following patch adds GRE protocol offload handler so that skb_gso_segment() can segment GRE packets. SKB GSO CB is added to keep track of total header length so that skb_segment can push entire header. e.g. in case of GRE, skb_segment need to push inner and outer headers to every segment. New NETIF_F_GRE_GSO feature is added for devices which support HW GRE TSO offload. Currently none of devices support it therefore GRE GSO always fall backs to software GSO. [ Compute pkt_len before ip_local_out() invocation. -DaveM ] Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 3 +- include/linux/skbuff.h | 17 ++++++ net/core/dev.c | 1 + net/core/ethtool.c | 1 + net/core/skbuff.c | 6 +- net/ipv4/af_inet.c | 1 + net/ipv4/gre.c | 118 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_gre.c | 82 +++++++++++++++++++++++++--- net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 3 +- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 3 +- 12 files changed, 226 insertions(+), 11 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 5ac32123035a..3dd39340430e 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -41,7 +41,7 @@ enum { NETIF_F_TSO_ECN_BIT, /* ... TCP ECN support */ NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ - NETIF_F_GSO_RESERVED1, /* ... free (fill GSO_MASK to 8 bits) */ + NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ /**/NETIF_F_GSO_LAST, /* [can't be last bit, see GSO_MASK] */ NETIF_F_GSO_RESERVED2 /* ... free (fill GSO_MASK to 8 bits) */ = NETIF_F_GSO_LAST, @@ -102,6 +102,7 @@ enum { #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) +#define NETIF_F_GRE_GSO __NETIF_F(GSO_GRE) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca6ee7d93edb..821c7f45d2a7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,6 +314,8 @@ enum { SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, + + SKB_GSO_GRE = 1 << 6, }; #if BITS_PER_LONG > 32 @@ -2732,6 +2734,21 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb) } #endif +/* Keeps track of mac header offset relative to skb->head. + * It is useful for TSO of Tunneling protocol. e.g. GRE. + * For non-tunnel skb it points to skb_mac_header() and for + * tunnel skb it points to outer mac header. */ +struct skb_gso_cb { + int mac_offset; +}; +#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb) + +static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) +{ + return (skb_mac_header(inner_skb) - inner_skb->head) - + SKB_GSO_CB(inner_skb)->mac_offset; +} + static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/core/dev.c b/net/core/dev.c index 67deae60214c..1cd6297fd34b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2413,6 +2413,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_mac_len(skb); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d9d55209db67..3e9b2c3e30f0 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -77,6 +77,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c1ad09f8796..2a3ca33c30aa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2738,6 +2738,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) unsigned int mss = skb_shinfo(skb)->gso_size; unsigned int doffset = skb->data - skb_mac_header(skb); unsigned int offset = doffset; + unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int headroom; unsigned int len; int sg = !!(features & NETIF_F_SG); @@ -2814,7 +2815,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + skb_network_header_len(skb)); - skb_copy_from_linear_data(skb, nskb->data, doffset); + + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + doffset + tnl_hlen); if (fskb != skb_shinfo(skb)->frag_list) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e6e5d8506336..e225a4e5b572 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1287,6 +1287,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_GRE | 0))) goto out; diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 42a491055c76..7a4c710c4cdd 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,11 @@ static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; static DEFINE_SPINLOCK(gre_proto_lock); +struct gre_base_hdr { + __be16 flags; + __be16 protocol; +}; +#define GRE_HEADER_SECTION 4 int gre_add_protocol(const struct gre_protocol *proto, u8 version) { @@ -112,12 +118,117 @@ static void gre_err(struct sk_buff *skb, u32 info) rcu_read_unlock(); } +static struct sk_buff *gre_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t enc_features; + int ghl = GRE_HEADER_SECTION; + struct gre_base_hdr *greh; + int mac_len = skb->mac_len; + int tnl_hlen; + bool csum; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_TCPV6 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + goto out; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + + if (greh->flags & GRE_KEY) + ghl += GRE_HEADER_SECTION; + if (greh->flags & GRE_SEQ) + ghl += GRE_HEADER_SECTION; + if (greh->flags & GRE_CSUM) { + ghl += GRE_HEADER_SECTION; + csum = true; + } else + csum = false; + + /* setup inner skb. */ + if (greh->protocol == htons(ETH_P_TEB)) { + struct ethhdr *eth = eth_hdr(skb); + skb->protocol = eth->h_proto; + } else { + skb->protocol = greh->protocol; + } + + skb->encapsulation = 0; + + if (unlikely(!pskb_may_pull(skb, ghl))) + goto out; + __skb_pull(skb, ghl); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) + goto out; + + skb = segs; + tnl_hlen = skb_tnl_header_len(skb); + do { + __skb_push(skb, ghl); + if (csum) { + __be32 *pcsum; + + if (skb_has_shared_frag(skb)) { + int err; + + err = __skb_linearize(skb); + if (err) { + kfree_skb(segs); + segs = ERR_PTR(err); + goto out; + } + } + + greh = (struct gre_base_hdr *)(skb->data); + pcsum = (__be32 *)(greh + 1); + *pcsum = 0; + *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + } + __skb_push(skb, tnl_hlen - ghl); + + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb->mac_len = mac_len; + } while ((skb = skb->next)); +out: + return segs; +} + +static int gre_gso_send_check(struct sk_buff *skb) +{ + if (!skb->encapsulation) + return -EINVAL; + return 0; +} + static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, .netns_ok = 1, }; +static const struct net_offload gre_offload = { + .callbacks = { + .gso_send_check = gre_gso_send_check, + .gso_segment = gre_gso_segment, + }, +}; + static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); @@ -127,11 +238,18 @@ static int __init gre_init(void) return -EAGAIN; } + if (inet_add_offload(&gre_offload, IPPROTO_GRE)) { + pr_err("can't add protocol offload\n"); + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); + return -EAGAIN; + } + return 0; } static void __exit gre_exit(void) { + inet_del_offload(&gre_offload, IPPROTO_GRE); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 00a14b9864ea..a56f1182c176 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -735,8 +735,33 @@ drop: return 0; } +static struct sk_buff *handle_offloads(struct sk_buff *skb) +{ + int err; + + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; + return skb; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } + skb->ip_summed = CHECKSUM_NONE; + + return skb; + +error: + kfree_skb(skb); + return ERR_PTR(err); +} + static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats); struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *old_iph; const struct iphdr *tiph; @@ -751,10 +776,19 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev __be32 dst; int mtu; u8 ttl; + int err; + int pkt_len; - if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb)) - goto tx_error; + skb = handle_offloads(skb); + if (IS_ERR(skb)) { + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + + if (!skb->encapsulation) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } old_iph = ip_hdr(skb); @@ -855,7 +889,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); - if ((old_iph->frag_off&htons(IP_DF)) && + if (!skb_is_gso(skb) && + (old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -875,7 +910,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } } - if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { + if (!skb_is_gso(skb) && + mtu >= IPV6_MIN_MTU && + mtu < skb->len - tunnel->hlen + gre_hlen) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; @@ -936,6 +973,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ttl; + iph->id = 0; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) @@ -964,9 +1002,19 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev *ptr = tunnel->parms.o_key; ptr--; } - if (tunnel->parms.o_flags&GRE_CSUM) { + /* Skip GRE checksum if skb is getting offloaded. */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE) && + (tunnel->parms.o_flags&GRE_CSUM)) { int offset = skb_transport_offset(skb); + if (skb_has_shared_frag(skb)) { + err = __skb_linearize(skb); + if (err) { + ip_rt_put(rt); + goto tx_error; + } + } + *ptr = 0; *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset, skb->len - offset, @@ -974,7 +1022,19 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } } - iptunnel_xmit(skb, dev); + nf_reset(skb); + + pkt_len = skb->len - skb_transport_offset(skb); + err = ip_local_out(skb); + if (likely(net_xmit_eval(err) == 0)) { + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += pkt_len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + dev->stats.tx_errors++; + dev->stats.tx_aborted_errors++; + } return NETDEV_TX_OK; #if IS_ENABLED(CONFIG_IPV6) @@ -1044,6 +1104,11 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) mtu = 68; tunnel->hlen = addend; + /* TCP offload with GRE SEQ is not supported. */ + if (!(tunnel->parms.o_flags & GRE_SEQ)) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } return mtu; } @@ -1593,6 +1658,9 @@ static void ipgre_tap_setup(struct net_device *dev) dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; + + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; } static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f0bedb8622f..7a5ba48c2cc9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3043,6 +3043,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | + SKB_GSO_GRE | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6791aac06ea9..39a5e7a9a77f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2305,7 +2305,8 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, /* Packet is from an untrusted source, reset gso_segs. */ int type = skb_shinfo(skb)->gso_type; - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index f26f0da7f095..8234c1dcdf72 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -99,6 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_GRE | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 0c8934a317c2..cf05cf073c51 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -56,7 +56,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Packet is from an untrusted source, reset gso_segs. */ int type = skb_shinfo(skb)->gso_type; - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; -- cgit v1.2.3 From 08dcdbf6a7b9d14c2302c5bd0c5390ddf122f664 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Feb 2013 12:18:52 +0000 Subject: ipv6: use a stronger hash for tcp It looks like its possible to open thousands of TCP IPv6 sessions on a server, all landing in a single slot of TCP hash table. Incoming packets have to lookup sockets in a very long list. We should hash all bits from foreign IPv6 addresses, using a salt and hash mix, not a simple XOR. inet6_ehashfn() can also separately use the ports, instead of xoring them. Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 8 ++++---- include/net/inet_sock.h | 1 + include/net/ipv6.h | 12 ++++++++++++ net/ipv4/af_inet.c | 9 +++++++-- 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 7ca75cbbf75e..fd4ee016ba5c 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -28,16 +28,16 @@ struct inet_hashinfo; -/* I have no idea if this is a good hash for v6 or not. -DaveM */ static inline unsigned int inet6_ehashfn(struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { - u32 ports = (lport ^ (__force u16)fport); + u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words((__force u32)laddr->s6_addr32[3], - (__force u32)faddr->s6_addr32[3], - ports, inet_ehash_secret + net_hash_mix(net)); + ipv6_addr_jhash(faddr), + ports, + inet_ehash_secret + net_hash_mix(net)); } static inline int inet6_sk_ehashfn(const struct sock *sk) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a4196cbc84ec..7235ae73a1e8 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -203,6 +203,7 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, extern int inet_sk_rebuild_header(struct sock *sk); extern u32 inet_ehash_secret; +extern u32 ipv6_hash_secret; extern void build_ehash_secret(void); static inline unsigned int inet_ehashfn(struct net *net, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 851d5412a299..64d12e77719a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -514,6 +515,17 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) #endif } +/* more secured version of ipv6_addr_hash() */ +static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +{ + u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; + + return jhash_3words(v, + (__force u32)a->s6_addr32[2], + (__force u32)a->s6_addr32[3], + ipv6_hash_secret); +} + static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e225a4e5b572..9cbcb94a4c6d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + /* - * inet_ehash_secret must be set exactly once + * inet_ehash_secret must be set exactly once, and to a non nul value + * ipv6_hash_secret must be set exactly once. */ void build_ehash_secret(void) { @@ -259,7 +263,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - cmpxchg(&inet_ehash_secret, 0, rnd); + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); } EXPORT_SYMBOL(build_ehash_secret); -- cgit v1.2.3 From 5b0520425e5ea81ba95ec486dd6bbb59a09fff0e Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 21 Feb 2013 22:18:44 +0000 Subject: ipv4: fix error handling in icmp_protocol. Now we handle icmp errors in each transport protocol's err_handler, for icmp protocols, that is ping_err. Since this handler only care of those icmp errors triggered by echo request, errors triggered by echo reply(which sent by kernel) are sliently ignored. So wrap ping_err() with icmp_err() to deal with those icmp errors. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- include/net/icmp.h | 1 + net/ipv4/af_inet.c | 2 +- net/ipv4/icmp.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/icmp.h b/include/net/icmp.h index 9ac2524d1402..081439fd070e 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -41,6 +41,7 @@ struct net; extern void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info); extern int icmp_rcv(struct sk_buff *skb); +extern void icmp_err(struct sk_buff *, u32 info); extern int icmp_init(void); extern void icmp_out_count(struct net *net, unsigned char type); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9cbcb94a4c6d..15847e19b7dd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1577,7 +1577,7 @@ static const struct net_offload udp_offload = { static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, - .err_handler = ping_err, + .err_handler = icmp_err, .no_policy = 1, .netns_ok = 1, }; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 17ff9fd7cdda..3ac5dff79627 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,6 +934,29 @@ error: goto drop; } +void icmp_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those + * triggered by ICMP_ECHOREPLY which sent from kernel. + */ + if (icmph->type != ICMP_ECHOREPLY) { + ping_err(skb, info); + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + else if (type == ICMP_REDIRECT) + ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} + /* * This table is the definition of how we handle ICMP. */ -- cgit v1.2.3 From 490ab08127cebc25e3a260a74556b56ce5f47c0f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 22 Feb 2013 07:30:30 +0000 Subject: IP_GRE: Fix IP-Identification. GRE-GSO generates ip fragments with id 0,2,3,4... for every GSO packet, which is not correct. Following patch fixes it by setting ip-header id unique id of fragments are allowed. As Eric Dumazet suggested it is optimized by using inner ip-header whenever inner packet is ipv4. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ipip.h | 17 +++++++++++++++++ net/ipv4/af_inet.c | 6 ++++-- net/ipv4/ip_gre.c | 3 ++- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/ipip.h b/include/net/ipip.h index 21947cf4fa46..fd19625ff99d 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -71,4 +71,21 @@ static inline void iptunnel_xmit(struct sk_buff *skb, struct net_device *dev) } } +static inline void tunnel_ip_select_ident(struct sk_buff *skb, + const struct iphdr *old_iph, + struct dst_entry *dst) +{ + struct iphdr *iph = ip_hdr(skb); + + if (iph->frag_off & htons(IP_DF)) + iph->id = 0; + else { + /* Use inner packet iph-id if possible. */ + if (skb->protocol == htons(ETH_P_IP) && old_iph->id) + iph->id = old_iph->id; + else + __ip_select_ident(iph, dst, + (skb_shinfo(skb)->gso_segs ?: 1) - 1); + } +} #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15847e19b7dd..68f6a94f7661 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1332,8 +1332,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (skb->next != NULL) iph->frag_off |= htons(IP_MF); offset += (skb->len - skb->mac_len - iph->ihl * 4); - } else - iph->id = htons(id++); + } else { + if (!(iph->frag_off & htons(IP_DF))) + iph->id = htons(id++); + } iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5ef4da780ac1..b8bada00d516 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -970,7 +970,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ttl; - iph->id = 0; + + tunnel_ip_select_ident(skb, old_iph, &rt->dst); if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) -- cgit v1.2.3