From d5a9e24afb4ab38110ebb777588ea0bd0eacbd0a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 16:22:11 -0800 Subject: net: Allow RX queue selection to seed TX queue hashing. The idea is that drivers which implement multiqueue RX pre-seed the SKB by recording the RX queue selected by the hardware. If such a seed is found on TX, we'll use that to select the outgoing TX queue. This helps get more consistent load balancing on router and firewall loads. Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5379b0c1190a..b21ad0b47aae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1722,6 +1722,13 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) simple_tx_hashrnd_initialized = 1; } + if (skb_rx_queue_recorded(skb)) { + u32 val = skb_get_rx_queue(skb); + + hash = jhash_1word(val, simple_tx_hashrnd); + goto out; + } + switch (skb->protocol) { case htons(ETH_P_IP): if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) @@ -1759,6 +1766,7 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); +out: return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } -- cgit v1.2.3 From f7105d63940899ece79bda024f668e6c761cfebf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 16:27:48 -0800 Subject: net: If SKB has attached socket, use socket's hash for TX queue selection. Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b21ad0b47aae..cb8caa93caca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1729,6 +1729,13 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) goto out; } + if (skb->sk && skb->sk->sk_hash) { + u32 val = skb->sk->sk_hash; + + hash = jhash_1word(val, simple_tx_hashrnd); + goto out; + } + switch (skb->protocol) { case htons(ETH_P_IP): if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) -- cgit v1.2.3 From 7019298a2a5058c4e324494d6c8d0598214c28f4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 16:34:47 -0800 Subject: net: Get rid of by-hand TX queue hashing. We now only TX hash on pre-computed SKB properties. The thinking is: 1) High performance routing and firewalling setups will have a multiqueue capable card used for receive, and therefore would have RX queue recordings made into the SKB which can be used for the TX side hash. 2) Locally generated packets will have an attached socket and thus a valid sk->sk_hash to make use of. Signed-off-by: David S. 
Miller --- net/core/dev.c | 73 +++++++++++----------------------------------------------- 1 file changed, 14 insertions(+), 59 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cb8caa93caca..e61b95c11fc0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1708,72 +1708,27 @@ out_kfree_skb: return 0; } -static u32 simple_tx_hashrnd; -static int simple_tx_hashrnd_initialized = 0; +static u32 skb_tx_hashrnd; +static int skb_tx_hashrnd_initialized = 0; -static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) +static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) { - u32 addr1, addr2, ports; - u32 hash, ihl; - u8 ip_proto = 0; + u32 hash; - if (unlikely(!simple_tx_hashrnd_initialized)) { - get_random_bytes(&simple_tx_hashrnd, 4); - simple_tx_hashrnd_initialized = 1; + if (unlikely(!skb_tx_hashrnd_initialized)) { + get_random_bytes(&skb_tx_hashrnd, 4); + skb_tx_hashrnd_initialized = 1; } if (skb_rx_queue_recorded(skb)) { - u32 val = skb_get_rx_queue(skb); - - hash = jhash_1word(val, simple_tx_hashrnd); - goto out; - } - - if (skb->sk && skb->sk->sk_hash) { - u32 val = skb->sk->sk_hash; - - hash = jhash_1word(val, simple_tx_hashrnd); - goto out; - } - - switch (skb->protocol) { - case htons(ETH_P_IP): - if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) - ip_proto = ip_hdr(skb)->protocol; - addr1 = ip_hdr(skb)->saddr; - addr2 = ip_hdr(skb)->daddr; - ihl = ip_hdr(skb)->ihl; - break; - case htons(ETH_P_IPV6): - ip_proto = ipv6_hdr(skb)->nexthdr; - addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; - addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; - ihl = (40 >> 2); - break; - default: - return 0; - } - - - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); - break; - - default: - ports = 0; - break; - } + hash = skb_get_rx_queue(skb); + } else if (skb->sk && skb->sk->sk_hash) { + hash = skb->sk->sk_hash; + } else + hash = skb->protocol; - hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); + hash = jhash_1word(hash, skb_tx_hashrnd); -out: return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } @@ -1786,7 +1741,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb); else if (dev->real_num_tx_queues > 1) - queue_index = simple_tx_hash(dev, skb); + queue_index = skb_tx_hash(dev, skb); skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); -- cgit v1.2.3 From 5d0d9be8ef456afc6c3fb5f8aad06ef19b704b05 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:48 +0000 Subject: gro: Move common completion code into helpers Currently VLAN still has a bit of common code handling the aftermath of GRO that's shared with the common path. This patch moves them into shared helpers to reduce code duplication. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 ++ net/8021q/vlan_core.c | 39 +++--------------------- net/core/dev.c | 76 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 59 insertions(+), 59 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd8a35b3e8b2..20419508eec1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1375,12 +1375,15 @@ extern int netif_receive_skb(struct sk_buff *skb); extern void napi_gro_flush(struct napi_struct *napi); extern int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb); +extern int napi_skb_finish(int ret, struct sk_buff *skb); extern int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb); extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi, struct napi_gro_fraginfo *info); +extern int napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, int ret); extern int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info); extern void netif_nit_deliver(struct sk_buff *skb); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e9db889d6222..2eb057a74654 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -98,22 +98,7 @@ drop: int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); - - case 2: - err = NET_RX_DROP; - /* fall through */ - - case 1: - kfree_skb(skb); - break; - } - - return err; + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); @@ -121,27 +106,11 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct napi_gro_fraginfo *info) { struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; if (!skb) - goto out; - - err = NET_RX_SUCCESS; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); - - case 2: - err = NET_RX_DROP; - /* fall through */ - - case 1: - napi_reuse_skb(napi, skb); - break; - } + return NET_RX_DROP; -out: - return err; + return napi_frags_finish(napi, skb, + vlan_gro_common(napi, grp, vlan_tci, skb)); } EXPORT_SYMBOL(vlan_gro_frags); diff --git a/net/core/dev.c b/net/core/dev.c index e61b95c11fc0..cd23ae15a1d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,14 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +enum { + GRO_MERGED, + GRO_MERGED_FREE, + GRO_HELD, + GRO_NORMAL, + GRO_DROP, +}; + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -2369,7 +2377,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) int count = 0; int same_flow; int mac_len; - int free; + int ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2412,7 +2420,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) goto normal; same_flow = NAPI_GRO_CB(skb)->same_flow; - free = NAPI_GRO_CB(skb)->free; + ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; if (pp) { struct sk_buff *nskb = *pp; @@ -2435,12 +2443,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_shinfo(skb)->gso_size = skb->len; skb->next = napi->gro_list; napi->gro_list = skb; + ret = GRO_HELD; ok: - return free; + return ret; normal: - return -1; + return GRO_NORMAL; } EXPORT_SYMBOL(dev_gro_receive); @@ -2456,18 +2465,30 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +int napi_skb_finish(int ret, struct sk_buff *skb) { - switch (__napi_gro_receive(napi, skb)) { - case -1: + int err = NET_RX_SUCCESS; + + switch (ret) { + case GRO_NORMAL: return netif_receive_skb(skb); - case 1: + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ + + case GRO_MERGED_FREE: kfree_skb(skb); break; } - return NET_RX_SUCCESS; + return err; +} +EXPORT_SYMBOL(napi_skb_finish); + +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2520,29 +2541,36 @@ out: } EXPORT_SYMBOL(napi_fraginfo_skb); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; - - if (!skb) - goto out; + int err = NET_RX_SUCCESS; - err = NET_RX_SUCCESS; - - switch (__napi_gro_receive(napi, skb)) { - case -1: + switch (ret) { + case GRO_NORMAL: return netif_receive_skb(skb); - case 0: - goto out; - } + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ - napi_reuse_skb(napi, skb); + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; + } -out: return err; } +EXPORT_SYMBOL(napi_frags_finish); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + + if (!skb) + return NET_RX_DROP; + + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +} EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) -- cgit v1.2.3 From 86911732d3996a9da07914b280621450111bb6da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:50 +0000 Subject: gro: Avoid copying headers of unmerged packets Unfortunately simplicity isn't always the best. The fraginfo interface turned out to be suboptimal. The problem was quite obvious. For every packet, we have to copy the headers from the frags structure into skb->head, even though for 99% of the packets this part is immediately thrown away after the merge. LRO didn't have this problem because it directly read the headers from the frags structure. This patch attempts to address this by creating an interface that allows GRO to access the headers in the first frag without having to copy it. Because all drivers that use frags place the headers in the first frag this optimisation should be enough. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 26 ++++++++++++++++++ include/linux/skbuff.h | 2 -- net/8021q/vlan_core.c | 2 ++ net/core/dev.c | 70 +++++++++++++++++++++++++++++++++++++++-------- net/core/skbuff.c | 23 ++++++++++------ net/ipv4/af_inet.c | 10 +++---- net/ipv4/tcp.c | 16 +++++------ net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/af_inet6.c | 30 +++++++++++++------- net/ipv6/tcp_ipv6.c | 2 +- 10 files changed, 137 insertions(+), 46 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20419508eec1..7a5057fbb7cd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -984,6 +984,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, void netif_napi_del(struct napi_struct *napi); struct napi_gro_cb { + /* This indicates where we are processing relative to skb->data. */ + int data_offset; + /* This is non-zero if the packet may be of the same flow. */ int same_flow; @@ -1087,6 +1090,29 @@ extern int dev_restart(struct net_device *dev); #ifdef CONFIG_NETPOLL_TRAP extern int netpoll_trap(void); #endif +extern void *skb_gro_header(struct sk_buff *skb, unsigned int hlen); +extern int skb_gro_receive(struct sk_buff **head, + struct sk_buff *skb); + +static inline unsigned int skb_gro_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->data_offset; +} + +static inline unsigned int skb_gro_len(const struct sk_buff *skb) +{ + return skb->len - NAPI_GRO_CB(skb)->data_offset; +} + +static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) +{ + NAPI_GRO_CB(skb)->data_offset += len; +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; +} static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a2c2378a9c58..08670d017479 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1687,8 +1687,6 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); -extern int skb_gro_receive(struct sk_buff **head, - struct sk_buff *skb); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2eb057a74654..378fa69d625a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -98,6 +98,8 @@ drop: int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); diff --git a/net/core/dev.c b/net/core/dev.c index cd23ae15a1d5..df406dcf7482 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -215,6 +215,13 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; } +static inline void *skb_gro_mac_header(struct sk_buff *skb) +{ + return skb_headlen(skb) ? 
skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} + /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2350,7 +2357,6 @@ static int napi_gro_complete(struct sk_buff *skb) out: skb_shinfo(skb)->gso_size = 0; - __skb_push(skb, -skb_network_offset(skb)); return netif_receive_skb(skb); } @@ -2368,6 +2374,25 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); +void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) +{ + unsigned int offset = skb_gro_offset(skb); + + hlen += offset; + if (hlen <= skb_headlen(skb)) + return skb->data + offset; + + if (unlikely(!skb_shinfo(skb)->nr_frags || + skb_shinfo(skb)->frags[0].size <= + hlen - skb_headlen(skb) || + PageHighMem(skb_shinfo(skb)->frags[0].page))) + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + + return page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset + offset; +} +EXPORT_SYMBOL(skb_gro_header); + int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2388,11 +2413,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { struct sk_buff *p; + void *mac; if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; - skb_reset_network_header(skb); + skb_set_network_header(skb, skb_gro_offset(skb)); + mac = skb_gro_mac_header(skb); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; @@ -2406,8 +2433,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) continue; if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len)) + memcmp(skb_mac_header(p), mac, mac_len)) NAPI_GRO_CB(p)->same_flow = 0; } @@ -2434,13 +2460,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { - __skb_push(skb, -skb_network_offset(skb)); + if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) goto normal; - } NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = skb->len; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; @@ -2488,6 +2512,8 @@ EXPORT_SYMBOL(napi_skb_finish); int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2506,6 +2532,7 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; + struct ethhdr *eth; napi->skb = NULL; @@ -2525,13 +2552,23 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, skb->len += info->len; skb->truesize += info->len; - if (!pskb_may_pull(skb, ETH_HLEN)) { + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { napi_reuse_skb(napi, skb); skb = NULL; goto out; } - skb->protocol = eth_type_trans(skb, dev); + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. 
+ */ + skb->protocol = eth->h_proto; skb->ip_summed = info->ip_summed; skb->csum = info->csum; @@ -2544,10 +2581,21 @@ EXPORT_SYMBOL(napi_fraginfo_skb); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; + int may; switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + case GRO_HELD: + may = pskb_may_pull(skb, skb_gro_offset(skb)); + BUG_ON(!may); + + skb->protocol = eth_type_trans(skb, napi->dev); + + if (ret == GRO_NORMAL) + return netif_receive_skb(skb); + + skb_gro_pull(skb, -ETH_HLEN); + break; case GRO_DROP: err = NET_RX_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2e5f2ca3bdcd..f9f4065a7e9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2584,17 +2584,21 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *p = *head; struct sk_buff *nskb; unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; + unsigned int len = skb_gro_len(skb); - if (hlen + p->len + len >= 65536) + if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + else if (skb_headlen(skb) <= skb_gro_offset(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <= MAX_SKB_FRAGS) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -2611,7 +2615,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); + nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); if (unlikely(!nskb)) return -ENOMEM; @@ -2619,12 +2623,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) nskb->mac_len = p->mac_len; skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); - skb_set_mac_header(nskb, -hlen); + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); skb_set_network_header(nskb, skb_network_offset(p)); skb_set_transport_header(nskb, skb_transport_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..d6770f295d5b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1253,10 +1253,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, int proto; int id; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); @@ -1270,7 +1270,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb->len || + flush = ntohs(iph->tot_len) != skb_gro_len(skb) || iph->frag_off != htons(IP_DF); id = ntohs(iph->id); @@ -1298,8 +1298,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, } NAPI_GRO_CB(skb)->flush |= flush; - __skb_pull(skb, sizeof(*iph)); - skb_reset_transport_header(skb); + skb_gro_pull(skb, 
sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); pp = ops->gro_receive(head, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..1cd608253940 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,19 +2481,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int mss = 1; int flush = 1; - if (!pskb_may_pull(skb, sizeof(*th))) + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - if (!pskb_may_pull(skb, thlen)) + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); - __skb_pull(skb, thlen); + skb_gro_pull(skb, thlen); flags = tcp_flag_word(th); @@ -2521,10 +2521,10 @@ found: flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); - total = p->len; + total = skb_gro_len(p); mss = skb_shinfo(p)->gso_size; - flush |= skb->len > mss || skb->len <= 0; + flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb); flush |= ntohl(th2->seq) + total != ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { @@ -2537,7 +2537,7 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = skb->len < mss; + flush = skb_gro_len(skb) < mss; flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 19d7b429a262..f6b962f56ab4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2355,7 +2355,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c802bc1658a8..bd91eadcbe3f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -799,24 +799,34 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, int proto; __wsum csum; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*iph)); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); - flush += ntohs(iph->payload_len) != skb->len; + flush += ntohs(iph->payload_len) != skb_gro_len(skb); rcu_read_lock(); - proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr); - iph = ipv6_hdr(skb); - IPV6_GRO_CB(skb)->proto = proto; + proto = iph->nexthdr; ops = rcu_dereference(inet6_protos[proto]); - if (!ops || !ops->gro_receive) - goto out_unlock; + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; flush--; - skb_reset_transport_header(skb); nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5b85d45bee8..00f1269e11e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -948,7 +948,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case 
CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr, + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; -- cgit v1.2.3 From 80595d59ba9917227856e663da249c2276a8628d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:52 +0000 Subject: gro: Open-code memcpy in napi_fraginfo_skb This patch optimises napi_fraginfo_skb to only copy the bits necessary. We also open-code the memcpy so that the alignment information is always available to gcc. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index df406dcf7482..ec5be1c7f2f1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2533,6 +2533,8 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; struct ethhdr *eth; + skb_frag_t *frag; + int i; napi->skb = NULL; @@ -2545,8 +2547,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, } BUG_ON(info->nr_frags > MAX_SKB_FRAGS); + frag = &info->frags[info->nr_frags - 1]; + + for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { + skb_fill_page_desc(skb, i, frag->page, frag->page_offset, + frag->size); + frag++; + } skb_shinfo(skb)->nr_frags = info->nr_frags; - memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags)); skb->data_len = info->len; skb->len += info->len; -- cgit v1.2.3 From ad0f9904444de1309dedd2b9e365cae8af77d9b1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 1 Feb 2009 01:24:55 -0800 Subject: gro: Fix handling of imprecisely split packets The commit 89a1b249edcf9be884e71f92df84d48355c576aa (gro: Avoid copying headers of unmerged packets) only worked for packets which are either completely linear, completely non-linear, or packets which exactly split at the boundary between headers and payload. Anything else would cause bits in the header to go missing if the packet is held by GRO. This may have broken drivers such as ixgbe. This patch fixes the places that assumed or only worked with the above cases. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ec5be1c7f2f1..220f52a1001e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -217,7 +217,7 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void *skb_gro_mac_header(struct sk_buff *skb) { - return skb_headlen(skb) ? skb_mac_header(skb) : + return skb_mac_header(skb) < skb->data ? 
skb_mac_header(skb) : page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; } @@ -2469,11 +2469,19 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) napi->gro_list = skb; ret = GRO_HELD; +pull: + if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { + if (napi->gro_list == skb) + napi->gro_list = skb->next; + ret = GRO_DROP; + } + ok: return ret; normal: - return GRO_NORMAL; + ret = GRO_NORMAL; + goto pull; } EXPORT_SYMBOL(dev_gro_receive); @@ -2589,14 +2597,10 @@ EXPORT_SYMBOL(napi_fraginfo_skb); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; - int may; switch (ret) { case GRO_NORMAL: case GRO_HELD: - may = pskb_may_pull(skb, skb_gro_offset(skb)); - BUG_ON(!may); - skb->protocol = eth_type_trans(skb, napi->dev); if (ret == GRO_NORMAL) -- cgit v1.2.3 From 9a279bcbe347496799711155ed41a89bc40f79c5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Feb 2009 16:55:27 -0800 Subject: net: Partially allow skb destructors to be used on receive path As it currently stands, skb destructors are forbidden on the receive path because the protocol end-points will overwrite any existing destructor with their own. This is the reason why we have to call skb_orphan in the loopback driver before we reinject the packet back into the stack, thus creating a period during which loopback traffic isn't charged to any socket. With virtualisation, we have a similar problem in that traffic is reinjected into the stack without being associated with any socket entity, thus providing no natural congestion push-back for those poor folks still stuck with UDP. Now had we been consistent in telling them that UDP simply has no congestion feedback, I could just fob them off. Unfortunately, we appear to have gone to some length in catering for this on the standard UDP path, with skb/socket accounting so that has created a very unhealthy dependency. Alas habits are difficult to break out of, so we may just have to allow skb destructors on the receive path. It turns out that making skb destructors useable on the receive path isn't as easy as it seems. For instance, simply adding skb_orphan to skb_set_owner_r isn't enough. This is because we assume all over the IP stack that skb->sk is an IP socket if present. The new transparent proxy code goes one step further and assumes that skb->sk is the receiving socket if present. Now all of this can be dealt with by adding simple checks such as only treating skb->sk as an IP socket if skb->sk->sk_family matches. However, it turns out that for bridging at least we don't need to do all of this work. This is of interest because most virtualisation setups use bridging so we don't actually go through the IP stack on the host (with the exception of our old nemesis the bridge netfilter, but that's easily taken care of). So this patch simply adds skb_orphan to the point just before we enter the IP stack, but after we've gone through the bridge on the receive path. It also adds an skb_orphan to the one place in netfilter that touches skb->sk/skb->destructor, that is, tproxy. One word of caution, because of the internal code structure, anyone wishing to deploy this must use skb_set_owner_w as opposed to skb_set_owner_r since many functions that create a new skb from an existing one will invoke skb_set_owner_w on the new skb. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 ++ net/netfilter/nf_tproxy_core.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 220f52a1001e..3337cf98f231 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2288,6 +2288,8 @@ ncls: if (!skb) goto out; + skb_orphan(skb); + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index cdc97f3105a3..5490fc37c92d 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -71,6 +71,7 @@ int nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) { if (inet_sk(sk)->transparent) { + skb_orphan(skb); skb->sk = sk; skb->destructor = nf_tproxy_destructor; return 1; -- cgit v1.2.3 From 56035022d86fff45299288cb372a42f752ba23fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 5 Feb 2009 21:26:52 -0800 Subject: gro: Fix frag_list merging on imprecisely split packets The previous fix ad0f9904444de1309dedd2b9e365cae8af77d9b1 (gro: Fix handling of imprecisely split packets) only fixed the case of frags merging; frag_list merging in the same circumstances was still broken. In particular, the packet headers end up in the data stream. This patch fixes this plus another issue where an imprecisely split packet header may be read incorrectly (this is mostly harmless since it'll simply cause the packet to not match and be rejected for GRO). Thanks to Emil Tantilov and Jeff Kirsher for helping to track this down. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- net/core/skbuff.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3337cf98f231..247f1614a796 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2391,7 +2391,8 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + offset; + skb_shinfo(skb)->frags[0].page_offset + + offset - skb_headlen(skb); } EXPORT_SYMBOL(skb_gro_header); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e55d1ef5690d..67f2a2f85827 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2678,6 +2678,17 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: + if (skb_gro_offset(skb) > skb_headlen(skb)) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + skb_gro_reset_offset(skb); + skb_gro_pull(skb, skb_headlen(skb)); + } + + __skb_pull(skb, skb_gro_offset(skb)); + p->prev->next = skb; p->prev = skb; skb_header_release(skb); -- cgit v1.2.3 From 4ae5544f9a33e4ae306e337f96951eb3ff2df6d9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 8 Feb 2009 18:00:36 +0000 Subject: gro: Remember number of held packets instead of counting every time This patch prepares for the move of the same_flow checks out of dev_gro_receive. As such we need to remember the number of held packets since doing a loop just to count them every time is silly. Signed-off-by: Herbert Xu Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 12 +++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 864519e585fc..9ee344bc6c13 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -314,6 +314,9 @@ struct napi_struct { spinlock_t poll_lock; int poll_owner; #endif + + unsigned int gro_count; + struct net_device *dev; struct list_head dev_list; struct sk_buff *gro_list; diff --git a/net/core/dev.c b/net/core/dev.c index 709a9a922258..ae0b66936abe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2372,6 +2372,7 @@ void napi_gro_flush(struct napi_struct *napi) napi_gro_complete(skb); } + napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); @@ -2402,7 +2403,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct packet_type *ptype; __be16 type = skb->protocol; struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; - int count = 0; int same_flow; int mac_len; int ret; @@ -2430,8 +2430,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = 0; for (p = napi->gro_list; p; p = p->next) { - count++; - if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -2457,15 +2455,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) *pp = nskb->next; nskb->next = NULL; napi_gro_complete(nskb); - count--; + napi->gro_count--; } if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) goto normal; + napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; @@ -2713,6 +2712,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; @@ -2741,6 +2741,7 @@ void netif_napi_del(struct napi_struct *napi) } napi->gro_list = NULL; + napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5246,6 +5247,7 @@ static int __init net_dev_init(void) queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; queue->backlog.gro_list = NULL; + queue->backlog.gro_count = 0; } dev_boot_phase = 0; -- cgit v1.2.3 From aa4b9f533ed5a22952e038b9fac2447ccc682124 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 8 Feb 2009 18:00:37 +0000 Subject: gro: Optimise Ethernet header comparison This patch optimises the Ethernet header comparison to use 2-byte and 4-byte xors instead of memcmp. In order to facilitate this, the actual comparison is now carried out by the callers of the shared dev_gro_receive function. This has a significant impact when receiving 1500B packets through 10GbE. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 21 +++++++++++++++++++++ include/linux/netdevice.h | 7 +++++++ net/8021q/vlan_core.c | 4 +++- net/core/dev.c | 23 ++--------------------- 4 files changed, 33 insertions(+), 22 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 1cb0f0b90926..a1f17abba7dc 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -184,4 +184,25 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2], } #endif /* __KERNEL__ */ +/** + * compare_ether_header - Compare two Ethernet headers + * @a: Pointer to Ethernet header + * @b: Pointer to Ethernet header + * + * Compare two ethernet headers, returns 0 if equal. + * This assumes that the network header (i.e., IP header) is 4-byte + * aligned OR the platform can handle unaligned access. This is the + * case for all packets coming into netif_receive_skb or similar + * entry points. + */ + +static inline int compare_ether_header(const void *a, const void *b) +{ + u32 *a32 = (u32 *)((u8 *)a + 2); + u32 *b32 = (u32 *)((u8 *)b + 2); + + return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | + (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); +} + #endif /* _LINUX_ETHERDEVICE_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ee344bc6c13..355662aac940 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1117,6 +1117,13 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->data_offset = 0; } +static inline void *skb_gro_mac_header(struct sk_buff *skb) +{ + return skb_mac_header(skb) < skb->data ? skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 378fa69d625a..70435af153f2 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -85,7 +85,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, goto drop; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev; + NAPI_GRO_CB(p)->same_flow = + p->dev == skb->dev && !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/dev.c b/net/core/dev.c index ae0b66936abe..1e27a67df242 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -215,13 +215,6 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; } -static inline void *skb_gro_mac_header(struct sk_buff *skb) -{ - return skb_mac_header(skb) < skb->data ? 
skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset; -} - /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2415,29 +2408,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - struct sk_buff *p; - void *mac; - if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); - mac = skb_gro_mac_header(skb); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - for (p = napi->gro_list; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), mac, mac_len)) - NAPI_GRO_CB(p)->same_flow = 0; - } - pp = ptype->gro_receive(&napi->gro_list, skb); break; } @@ -2492,7 +2472,8 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct sk_buff *p; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->same_flow = !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3 From ac45f602ee3d1b6f326f68bc0c2591ceebf05ba4 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:37 +0000 Subject: net: infrastructure for hardware time stamping The additional per-packet information (16 bytes for time stamps, 1 byte for flags) is stored for all packets in the skb_shared_info struct. This implementation detail is hidden from users of that information via skb_* accessor functions. A separate struct and union, respectively, are used for the additional information so that it can be stored/copied easily outside of skb_shared_info. Compared to previous implementations (reusing the tstamp field depending on the context, optional additional structures) this is the simplest solution. It does not extend sk_buff itself. TX time stamping is implemented in software if the device driver doesn't support hardware time stamping. The new semantic for hardware/software time stamping around ndo_start_xmit() is based on two assumptions about existing network device drivers which don't support hardware time stamping and know nothing about it: - they leave the new skb_shared_tx unmodified - they keep the connection to the originating socket in skb->sk alive, i.e., don't call skb_orphan() Given that skb_shared_tx is new, the first assumption is safe. The second is only true for some drivers. As a result, software TX time stamping currently works with the bnx2 driver, but not with the unmodified igb driver (the two drivers this patch series was tested with).
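As an aside, a minimal user-space model of the new flags union may help; the layout below mirrors skb_shared_tx, but the type name and the printf are purely illustrative (the kernel reaches this union only through the skb_tx() accessor):

#include <stdio.h>

/* Stand-alone model of union skb_shared_tx; illustrative only. */
union shared_tx_model {
	struct {
		unsigned char hardware:1,
			      software:1,
			      in_progress:1;
	};
	unsigned char flags;
};

int main(void)
{
	union shared_tx_model tx = { .flags = 0 };

	tx.hardware = 1;	/* caller requests a hardware time stamp */
	tx.in_progress = 1;	/* driver will deliver the stamp asynchronously */
	printf("flags = 0x%02x\n", tx.flags);	/* all bits visible through .flags */
	return 0;
}

Signed-off-by: Patrick Ohly Signed-off-by: David S.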
Miller --- include/linux/skbuff.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- net/core/dev.c | 32 ++++++++++++++++-- net/core/skbuff.c | 41 +++++++++++++++++++++++ 3 files changed, 161 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 924700844580..f96bc91bf0a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,57 @@ struct skb_frag_struct { __u32 size; }; +#define HAVE_HW_TIME_STAMP + +/** + * skb_shared_hwtstamps - hardware time stamps + * + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @syststamp: hwtstamp transformed to system time base + * + * Software time stamps generated by ktime_get_real() are stored in + * skb->tstamp. The relation between the different kinds of time + * stamps is as follows: + * + * syststamp and tstamp can be compared against each other in + * arbitrary combinations. The accuracy of a + * syststamp/tstamp/"syststamp from other device" comparison is + * limited by the accuracy of the transformation into system time + * base. This depends on the device driver and its underlying + * hardware. + * + * hwtstamps can only be compared against other hwtstamps from + * the same device. + * + * This structure is attached to packets as part of the + * &skb_shared_info. Use skb_hwtstamps() to get a pointer. + */ +struct skb_shared_hwtstamps { + ktime_t hwtstamp; + ktime_t syststamp; +}; + +/** + * skb_shared_tx - instructions for time stamping of outgoing packets + * + * @hardware: generate hardware time stamp + * @software: generate software time stamp + * @in_progress: device driver is going to provide + * hardware time stamp + * + * These flags are attached to packets as part of the + * &skb_shared_info. Use skb_tx() to get a pointer. + */ +union skb_shared_tx { + struct { + __u8 hardware:1, + software:1, + in_progress:1; + }; + __u8 flags; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -143,10 +194,12 @@ struct skb_shared_info { unsigned short gso_segs; unsigned short gso_type; __be32 ip6_frag_id; + union skb_shared_tx tx_flags; #ifdef CONFIG_HAS_DMA unsigned int num_dma_maps; #endif struct sk_buff *frag_list; + struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; #ifdef CONFIG_HAS_DMA dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; @@ -465,6 +518,16 @@ static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) +static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->hwtstamps; +} + +static inline union skb_shared_tx *skb_tx(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->tx_flags; +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -1730,6 +1793,11 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, extern void skb_init(void); +static inline ktime_t skb_get_ktime(const struct sk_buff *skb) +{ + return skb->tstamp; +} + /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from @@ -1739,11 +1807,18 @@ extern void skb_init(void); * This function converts the offset back to a struct timeval and stores * it in stamp. 
*/ -static inline void skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp) +static inline void skb_get_timestamp(const struct sk_buff *skb, + struct timeval *stamp) { *stamp = ktime_to_timeval(skb->tstamp); } +static inline void skb_get_timestampns(const struct sk_buff *skb, + struct timespec *stamp) +{ + *stamp = ktime_to_timespec(skb->tstamp); +} + static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); @@ -1759,6 +1834,20 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +/** + * skb_tstamp_tx - queue clone of skb with send time stamps + * @orig_skb: the original outgoing packet + * @hwtstamps: hardware time stamps, may be NULL if not available + * + * If the skb has a socket associated, then this function clones the + * skb (thus sharing the actual data and optional structures), stores + * the optional hardware time stamping information (if non NULL) or + * generates a software time stamp (otherwise), then queues the clone + * to the error queue of the socket. Errors are silently ignored. + */ +extern void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps); + extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1e27a67df242..d20c28e839d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1672,10 +1672,21 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +static void tstamp_tx(struct sk_buff *skb) +{ + union skb_shared_tx *shtx = + skb_tx(skb); + if (unlikely(shtx->software && + !shtx->in_progress)) { + skb_tstamp_tx(skb, NULL); + } +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { @@ -1689,13 +1700,29 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return ops->ndo_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); + /* + * TODO: if skb_orphan() was called by + * dev->hard_start_xmit() (for example, the unmodified + * igb driver does that; bnx2 doesn't), then + * skb_tx_software_timestamp() will be unable to send + * back the time stamp. + * + * How can this be prevented? Always create another + * reference to the socket before calling + * dev->hard_start_xmit()? Prevent that skb_orphan() + * does anything in dev->hard_start_xmit() by clearing + * the skb destructor before the call and restoring it + * afterwards, then doing the skb_orphan() ourselves? 
+ */ + if (likely(!rc)) + tstamp_tx(skb); + return rc; } gso: do { struct sk_buff *nskb = skb->next; - int rc; skb->next = nskb->next; nskb->next = NULL; @@ -1705,6 +1732,7 @@ gso: skb->next = nskb; return rc; } + tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab7d2e9f02fa..e5a8351ff12d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -2945,6 +2948,44 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the skb_shared_tx and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + + /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set -- cgit v1.2.3 From d24fff22d8dba13cc21034144f68f213415cb7c8 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:40 +0000 Subject: net: pass new SIOCSHWTSTAMP through to device drivers Signed-off-by: Patrick Ohly Signed-off-by: David S. 
Miller --- fs/compat_ioctl.c | 6 ++++++ net/core/dev.c | 2 ++ 2 files changed, 8 insertions(+) (limited to 'net/core/dev.c') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 64f406593c0e..763fe69ef351 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -522,6 +522,11 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg) if (err) return -EFAULT; break; + case SIOCSHWTSTAMP: + if (copy_from_user(&ifr, uifr32, sizeof(*uifr32))) + return -EFAULT; + ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data); + break; default: if (copy_from_user(&ifr, uifr32, sizeof(*uifr32))) return -EFAULT; @@ -2563,6 +2568,7 @@ HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc) HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc) HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc) HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc) +HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc) /* ioctls used by appletalk ddp.c */ HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc) diff --git a/net/core/dev.c b/net/core/dev.c index d20c28e839d3..d393fc997cd9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4019,6 +4019,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) cmd == SIOCSMIIREG || cmd == SIOCBRADDIF || cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || cmd == SIOCWANDEV) { err = -EOPNOTSUPP; if (ops->ndo_do_ioctl) { @@ -4173,6 +4174,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCBONDCHANGEACTIVE: case SIOCBRADDIF: case SIOCBRDELIF: + case SIOCSHWTSTAMP: if (!capable(CAP_NET_ADMIN)) return -EPERM; /* fall through */ -- cgit v1.2.3 From e88721f87d8caa679e62d6004a9a5661f1ac7b0e Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Wed, 18 Feb 2009 17:55:02 -0800 Subject: net: Optimize skb_tx_hash() by eliminating a comparison Optimize skb_tx_hash() by eliminating a comparison that executes for every packet. skb_tx_hashrnd initialization is moved to a later part of the startup sequence, namely after the "random" driver is initialized. Rebooted the system three times and verified that the code generates different random numbers each time. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d393fc997cd9..5493394118fb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1745,17 +1745,11 @@ out_kfree_skb: } static u32 skb_tx_hashrnd; -static int skb_tx_hashrnd_initialized = 0; static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) { u32 hash; - if (unlikely(!skb_tx_hashrnd_initialized)) { - get_random_bytes(&skb_tx_hashrnd, 4); - skb_tx_hashrnd_initialized = 1; - } - if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); } else if (skb->sk && skb->sk->sk_hash) { @@ -5291,6 +5285,14 @@ out: subsys_initcall(net_dev_init); +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); + EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); -- cgit v1.2.3 From cd4d8fdad1f13205c769266dfa99015e226b6e07 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Sat, 21 Feb 2009 02:42:18 -0800 Subject: net: kernel panic in dev_hard_start_xmit: remove faulty software TX time stamping The current implementation of the TX software time stamping fallback is faulty because it accesses the skb after ndo_start_xmit() returns successfully. 
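A minimal user-space model of the ownership rule being violated makes this concrete; all names here are illustrative, none of this is kernel API:

#include <stdlib.h>

struct buf { int len; };

/* Models a successful ndo_start_xmit(): on return the callee owns the
 * buffer and may already have freed it, possibly on another CPU.
 */
static int hand_off(struct buf *b)
{
	free(b);	/* buffer consumed */
	return 0;	/* "NETDEV_TX_OK" */
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));

	if (!b)
		return 1;
	b->len = 100;
	if (hand_off(b) == 0) {
		/* The removed fallback effectively dereferenced b at this
		 * point, after ownership had passed: a use-after-free.
		 */
	}
	return 0;
}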
This patch removes the fallback, which fixes kernel panics seen during stress tests. Hardware time stamping is not affected by this removal. Signed-off-by: Patrick Ohly Signed-off-by: Emil Tantilov Signed-off-by: David S. Miller --- net/core/dev.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5493394118fb..88dc082b47d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1672,16 +1672,6 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } -static void tstamp_tx(struct sk_buff *skb) -{ - union skb_shared_tx *shtx = - skb_tx(skb); - if (unlikely(shtx->software && - !shtx->in_progress)) { - skb_tstamp_tx(skb, NULL); - } -} - int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -1715,8 +1705,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * the skb destructor before the call and restoring it * afterwards, then doing the skb_orphan() ourselves? */ - if (likely(!rc)) - tstamp_tx(skb); return rc; } @@ -1732,7 +1720,6 @@ gso: skb->next = nskb; return rc; } - tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); -- cgit v1.2.3 From ce16c5337ab0d165f95c88aa857207efd7c01139 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Feb 2009 00:11:09 -0800 Subject: netns: Remove net_alive It turns out that net_alive is unnecessary, and the original problem that led to it being added was simply that the icmp code thought it was a network device and wound up being unable to handle packets while there were still packets in the network namespace. Now that icmp and tcp have been fixed to properly register themselves this problem is no longer present and we have a stronger guarantee that packets will not arrive in a network namespace then that provided by net_alive in netif_receive_skb. So remove net_alive allowing packet reception run a little faster. Additionally document the strong reason why network namespace cleanup is safe so that if something happens again someone else will have a chance of figuring it out. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 27 +++++++++++++++++---------- net/core/dev.c | 6 ------ net/core/net_namespace.c | 3 --- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6fc13d905c5f..ded434b032a4 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -109,11 +109,6 @@ extern struct list_head net_namespace_list; #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); -static inline int net_alive(struct net *net) -{ - return net && atomic_read(&net->count); -} - static inline struct net *get_net(struct net *net) { atomic_inc(&net->count); @@ -145,11 +140,6 @@ int net_eq(const struct net *net1, const struct net *net2) } #else -static inline int net_alive(struct net *net) -{ - return 1; -} - static inline struct net *get_net(struct net *net) { return net; @@ -234,6 +224,23 @@ struct pernet_operations { void (*exit)(struct net *net); }; +/* + * Use these carefully. If you implement a network device and it + * needs per network namespace operations use device pernet operations, + * otherwise use pernet subsys operations. + * + * This is critically important. 
Most of the network code cleanup + * runs with the assumption that dev_remove_pack has been called so no + * new packets will arrive during and after the cleanup functions have + * been called. dev_remove_pack is not per namespace so instead the + * guarantee of no more packets arriving in a network namespace is + * provided by ensuring that all network devices and all sockets have + * left the network namespace before the cleanup methods are called. + * + * For the longest time the ipv4 icmp code was registered as a pernet + * device which caused kernel oops, and panics during network + * namespace cleanup. So please don't get this wrong. + */ extern int register_pernet_subsys(struct pernet_operations *); extern void unregister_pernet_subsys(struct pernet_operations *); extern int register_pernet_gen_subsys(int *id, struct pernet_operations *); diff --git a/net/core/dev.c b/net/core/dev.c index 88dc082b47d1..ac6ab12d3297 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2254,12 +2254,6 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); - /* Don't receive packets in an exiting network namespace */ - if (!net_alive(dev_net(skb->dev))) { - kfree_skb(skb); - goto out; - } - #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 55151faaf90c..516c7b154327 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -140,9 +140,6 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; - /* Be very certain incoming network packets will not find us */ - rcu_barrier(); - net = container_of(work, struct net, work); mutex_lock(&net_mutex); -- cgit v1.2.3 From 1c8dbcf6496c2612d883a8bc6bccc38000e14866 Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Fri, 27 Feb 2009 14:06:54 -0800 Subject: [SCSI] net: add NETIF_F_FCOE_CRC to can_checksum_protocol Add FC CRC offload check for ETH_P_FCOE. Signed-off-by: Yi Zou Acked-by: David Miller Signed-off-by: James Bottomley --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 72b0d26fd46d..3d3670640c2d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1457,7 +1457,9 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) ((features & NETIF_F_IP_CSUM) && protocol == htons(ETH_P_IP)) || ((features & NETIF_F_IPV6_CSUM) && - protocol == htons(ETH_P_IPV6))); + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); } static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3 From d1c76af9e2434fac3add561e26c61b06503de986 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 16 Mar 2009 10:50:02 -0700 Subject: GRO: Move netpoll checks to correct location As my netpoll fix for net doesn't really work for net-next, we need this update to move the checks into the right place. As it stands we may pass freed skbs to netpoll_receive_skb. This patch also introduces a netpoll_rx_on function to avoid GRO completely if we're invoked through netpoll. This might seem paranoid but as netpoll may have an external receive hook it's better to be safe than sorry. I don't think we need this for 2.6.29 though since there's nothing immediately broken by it. 
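The shape of the fix, sketched (simplified; gro_entry is an invented stand-in for __napi_gro_receive and vlan_gro_common in the hunks below): test for netpoll before any GRO processing, so an skb can no longer be merged or freed by GRO and then handed to netpoll afterwards.

	static int gro_entry(struct napi_struct *napi, struct sk_buff *skb)
	{
		if (netpoll_rx_on(skb))
			return GRO_NORMAL;	/* netpoll active: bypass GRO entirely */

		/* only now is it safe to let GRO merge or free the skb */
		return dev_gro_receive(napi, skb);
	}
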
This patch also moves the GRO_* return values to netdevice.h since VLAN needs them too (I tried to avoid this originally but alas this seems to be the easiest way out). This fixes a bug in VLAN where it continued to use the old return value 2 instead of the correct GRO_DROP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ include/linux/netpoll.h | 11 +++++++++++ net/8021q/vlan_core.c | 11 ++++------- net/core/dev.c | 17 +++-------------- 4 files changed, 26 insertions(+), 21 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 493b065f76d7..be3ebd7e8ce5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -330,6 +330,14 @@ enum NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ }; +enum { + GRO_MERGED, + GRO_MERGED_FREE, + GRO_HELD, + GRO_NORMAL, + GRO_DROP, +}; + extern void __napi_schedule(struct napi_struct *n); static inline int napi_disable_pending(struct napi_struct *n) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e38d3c9dccda..de99025f2c5d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,6 +63,13 @@ static inline int netpoll_rx(struct sk_buff *skb) return ret; } +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = skb->dev->npinfo; + + return npinfo && (npinfo->rx_np || npinfo->rx_flags); +} + static inline int netpoll_receive_skb(struct sk_buff *skb) { if (!list_empty(&skb->dev->napi_list)) @@ -99,6 +106,10 @@ static inline int netpoll_rx(struct sk_buff *skb) { return 0; } +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + return 0; +} static inline int netpoll_receive_skb(struct sk_buff *skb) { return 0; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2d6e405fc498..6227248597c4 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -79,6 +79,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + if (skb_bond_should_drop(skb)) goto drop; @@ -98,7 +101,7 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, return dev_gro_receive(napi, skb); drop: - return 2; + return GRO_DROP; } int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, @@ -106,9 +109,6 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, { skb_gro_reset_offset(skb); - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); @@ -121,9 +121,6 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, if (!skb) return NET_RX_DROP; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - return napi_frags_finish(napi, skb, vlan_gro_common(napi, grp, vlan_tci, skb)); } diff --git a/net/core/dev.c b/net/core/dev.c index 033d7ca28e6e..7bd3c29c5a78 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,14 +135,6 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) -enum { - GRO_MERGED, - GRO_MERGED_FREE, - GRO_HELD, - GRO_NORMAL, - GRO_DROP, -}; - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. 
@@ -2474,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + for (p = napi->gro_list; p; p = p->next) { NAPI_GRO_CB(p)->same_flow = !compare_ether_header( skb_mac_header(p), skb_gro_mac_header(skb)); @@ -2487,9 +2482,6 @@ int napi_skb_finish(int ret, struct sk_buff *skb) { int err = NET_RX_SUCCESS; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - switch (ret) { case GRO_NORMAL: return netif_receive_skb(skb); @@ -2587,9 +2579,6 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - switch (ret) { case GRO_NORMAL: case GRO_HELD: -- cgit v1.2.3 From 303c6a0251852ecbdc5c15e466dcaff5971f7517 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 17 Mar 2009 13:11:29 -0700 Subject: gro: Fix legacy path napi_complete crash On the legacy netif_rx path, I incorrectly tried to optimise the napi_complete call by using __napi_complete before we reenable IRQs. This simply doesn't work since we need to flush the held GRO packets first. This patch fixes it by doing the obvious thing of reenabling IRQs first and then calling napi_complete. Reported-by: Frank Blaschka Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f1129706ce7b..2565f6d1d661 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2588,9 +2588,9 @@ static int process_backlog(struct napi_struct *napi, int quota) local_irq_disable(); skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { - __napi_complete(napi); local_irq_enable(); - break; + napi_complete(napi); + goto out; } local_irq_enable(); @@ -2599,6 +2599,7 @@ static int process_backlog(struct napi_struct *napi, int quota) napi_gro_flush(napi); +out: return work; } -- cgit v1.2.3 From e4a389a9b5c892446b5de2038bdc0cca8703c615 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 18 Mar 2009 23:12:13 -0700 Subject: net: kfree(napi->skb) => kfree_skb struct sk_buff pointers should be freed with kfree_skb. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2565f6d1d661..e3fe5c705606 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2672,7 +2672,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree(napi->skb); + kfree_skb(napi->skb); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; -- cgit v1.2.3 From 9247744e5eaa29aecee5342a0c8694187a6aadcd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 21 Mar 2009 13:39:26 -0700 Subject: skb: expose and constify hash primitives Some minor changes to queue hashing: 1. Use const on accessor functions 2. Export skb_tx_hash for use in drivers (see ixgbe) Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 ++++++--- net/core/dev.c | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1fbab2ae613c..bb1981fd60f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1969,7 +1969,7 @@ static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) skb->queue_mapping = queue_mapping; } -static inline u16 skb_get_queue_mapping(struct sk_buff *skb) +static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } @@ -1984,16 +1984,19 @@ static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) skb->queue_mapping = rx_queue + 1; } -static inline u16 skb_get_rx_queue(struct sk_buff *skb) +static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } -static inline bool skb_rx_queue_recorded(struct sk_buff *skb) +static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return (skb->queue_mapping != 0); } +extern u16 skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb); + #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index ca212acd3348..fdb9973b82a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1725,7 +1725,7 @@ out_kfree_skb: static u32 skb_tx_hashrnd -static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; @@ -1740,6 +1740,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } +EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3 From ed734a97c6a81b644bd648afd7a337deb0ccd7e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 21 Mar 2009 13:42:55 -0700 Subject: net: remove useless prefetch() call There is no gain in using prefetch() in dev_hard_start_xmit(), since we already had to read the ops->ndo_select_queue pointer in dev_pick_tx(), and both pointers are probably located in the same cache line. This prefetch call slows down the fast path because of a stall in address computation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fdb9973b82a6..052dd478d3e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1670,7 +1670,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; int rc; - prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); -- cgit v1.2.3 From 8f1ead2d1a626ed0c85b3d2c2046a49081d5933f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Mar 2009 00:59:10 -0700 Subject: GRO: Disable GRO on legacy netif_rx path When I fixed the GRO crash in the legacy receive path, I used napi_complete to replace __napi_complete. Unfortunately they're not the same when NETPOLL is enabled, which may result in us not calling __napi_complete at all. What's more, we really do need to keep the __napi_complete call within the IRQ-off section since in theory an IRQ can occur in between and fill up the backlog to the maximum, causing us to lock up.
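The window in question, sketched against the process_backlog code from earlier in this series (the note on netif_rx scheduling is an editor's reading of the code of that era, not text from the patch):

	local_irq_disable();
	skb = __skb_dequeue(&queue->input_pkt_queue);
	if (!skb) {
		local_irq_enable();
		/* IRQs may fire here and refill input_pkt_queue up to
		 * netdev_max_backlog; netif_rx() finds NAPI_STATE_SCHED
		 * still set, so its napi_schedule() is a no-op ...
		 */
		napi_complete(napi);
		/* ... and once SCHED is cleared, later packets see a
		 * non-empty queue and never reschedule: the backlog is
		 * never drained again.
		 */
	}
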
Since we can't seem to find a fix that works properly right now, this patch reverts all the GRO support from the netif_rx path. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 052dd478d3e1..63ec4bf89b29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2627,18 +2627,15 @@ static int process_backlog(struct napi_struct *napi, int quota) local_irq_disable(); skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { + __napi_complete(napi); local_irq_enable(); - napi_complete(napi); - goto out; + break; } local_irq_enable(); - napi_gro_receive(napi, skb); + netif_receive_skb(skb); } while (++work < quota && jiffies == start_time); - napi_gro_flush(napi); - -out: return work; } -- cgit v1.2.3
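As a closing note on the skb_tx_hash export above (Stephen Hemminger's "expose and constify hash primitives" patch): a multiqueue driver can now defer to the core TX hash from its queue-selection hook. A hypothetical sketch follows; foo_select_queue and foo_netdev_ops are invented names, and the real ixgbe code may well differ.

	static u16 foo_select_queue(struct net_device *dev, struct sk_buff *skb)
	{
		/* driver-specific steering could go here; otherwise
		 * fall back to the core TX hash
		 */
		return skb_tx_hash(dev, skb);
	}

	static const struct net_device_ops foo_netdev_ops = {
		.ndo_select_queue	= foo_select_queue,
		/* ... */
	};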