diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 130 | ||||
-rw-r--r-- | net/core/dev.c | 514 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 139 | ||||
-rw-r--r-- | net/core/fib_rules.c | 4 | ||||
-rw-r--r-- | net/core/gen_estimator.c | 15 | ||||
-rw-r--r-- | net/core/iovec.c | 33 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 9 | ||||
-rw-r--r-- | net/core/net-traces.c | 7 | ||||
-rw-r--r-- | net/core/net_namespace.c | 54 | ||||
-rw-r--r-- | net/core/netpoll.c | 15 | ||||
-rw-r--r-- | net/core/pktgen.c | 3 | ||||
-rw-r--r-- | net/core/skbuff.c | 101 | ||||
-rw-r--r-- | net/core/stream.c | 3 |
13 files changed, 785 insertions, 242 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378d..e2a36f05cdf7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} /* * Wait for a packet.. */ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); @@ -248,7 +260,9 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) spin_unlock_bh(&sk->sk_receive_queue.lock); } - skb_free_datagram(sk, skb); + kfree_skb(skb); + sk_mem_reclaim_partial(sk); + return err; } @@ -339,17 +353,111 @@ fault: } /** + * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @to_offset: offset in the io vector to start copying to + * @len: amount of data to copy from buffer to iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, + const struct iovec *to, int to_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovecend(to, vaddr + frag->page_offset + + offset - start, to_offset, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(list, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_const_iovec); + +/** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: io vector to copy to + * @from_offset: offset in the io vector to start copying from * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is modified during the copy. + * Note: the iovec is not modified during the copy. */ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - struct iovec *from, int len) + const struct iovec *from, int from_offset, + int len) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -358,11 +466,12 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovec(skb->data + offset, from, copy)) + if (memcpy_fromiovecend(skb->data + offset, from, 0, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ @@ -381,8 +490,9 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap(page); - err = memcpy_fromiovec(vaddr + frag->page_offset + - offset - start, from, copy); + err = memcpy_fromiovecend(vaddr + frag->page_offset + + offset - start, + from, from_offset, copy); kunmap(page); if (err) goto fault; @@ -390,6 +500,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (!(len -= copy)) return 0; offset += copy; + from_offset += copy; } start = end; } @@ -408,11 +519,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, copy = len; if (skb_copy_datagram_from_iovec(list, offset - start, - from, copy)) + from, + from_offset, + copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } start = end; } diff --git a/net/core/dev.c b/net/core/dev.c index 343883f65ea7..ed4550fd9ece 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -1336,7 +1337,12 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; +#ifdef CONFIG_NET_CLS_ACT + if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) + net_timestamp(skb); +#else net_timestamp(skb); +#endif rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1683,7 +1689,17 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } + /* + * If device doesnt need skb->dst, release it right now while + * its hot in this cpu cache + */ + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } rc = ops->ndo_start_xmit(skb, dev); + if (rc == 0) + txq_trans_update(txq); /* * TODO: if skb_orphan() was called by * dev->hard_start_xmit() (for example, the unmodified @@ -1713,6 +1729,7 @@ gso: skb->next = nskb; return rc; } + txq_trans_update(txq); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -1732,9 +1749,14 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - } else if (skb->sk && skb->sk->sk_hash) { + while (unlikely (hash >= dev->real_num_tx_queues)) + hash -= dev->real_num_tx_queues; + return hash; + } + + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; - } else + else hash = skb->protocol; hash = jhash_1word(hash, skb_tx_hashrnd); @@ -2368,26 +2390,6 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) -{ - unsigned int offset = skb_gro_offset(skb); - - hlen += offset; - if (hlen <= skb_headlen(skb)) - return skb->data + offset; - - if (unlikely(!skb_shinfo(skb)->nr_frags || - skb_shinfo(skb)->frags[0].size <= - hlen - skb_headlen(skb) || - PageHighMem(skb_shinfo(skb)->frags[0].page))) - return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; - - return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + - offset - skb_headlen(skb); -} -EXPORT_SYMBOL(skb_gro_header); - int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2450,10 +2452,25 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) ret = GRO_HELD; pull: - if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { - if (napi->gro_list == skb) - napi->gro_list = skb->next; - ret = GRO_DROP; + if (skb_headlen(skb) < skb_gro_offset(skb)) { + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->tail += grow; + skb->data_len -= grow; + + skb_shinfo(skb)->frags[0].page_offset += grow; + skb_shinfo(skb)->frags[0].size -= grow; + + if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + put_page(skb_shinfo(skb)->frags[0].page); + memmove(skb_shinfo(skb)->frags, + skb_shinfo(skb)->frags + 1, + --skb_shinfo(skb)->nr_frags); + } } ok: @@ -2503,6 +2520,22 @@ int napi_skb_finish(int ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb->mac_header == skb->tail && + !PageHighMem(skb_shinfo(skb)->frags[0].page)) { + NAPI_GRO_CB(skb)->frag0 = + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + } +} +EXPORT_SYMBOL(skb_gro_reset_offset); + int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); @@ -2520,16 +2553,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_reuse_skb); -struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info) +struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - skb_frag_t *frag; - int i; - - napi->skb = NULL; if (!skb) { skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); @@ -2537,47 +2564,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, goto out; skb_reserve(skb, NET_IP_ALIGN); - } - - BUG_ON(info->nr_frags > MAX_SKB_FRAGS); - frag = &info->frags[info->nr_frags - 1]; - - for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { - skb_fill_page_desc(skb, i, frag->page, frag->page_offset, - frag->size); - frag++; - } - skb_shinfo(skb)->nr_frags = info->nr_frags; - - skb->data_len = info->len; - skb->len += info->len; - skb->truesize += info->len; - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + napi->skb = skb; } - skb_gro_pull(skb, sizeof(*eth)); - - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. - */ - skb->protocol = eth->h_proto; - - skb->ip_summed = info->ip_summed; - skb->csum = info->csum; - out: return skb; } -EXPORT_SYMBOL(napi_fraginfo_skb); +EXPORT_SYMBOL(napi_get_frags); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { @@ -2607,9 +2601,46 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) } EXPORT_SYMBOL(napi_frags_finish); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +struct sk_buff *napi_frags_skb(struct napi_struct *napi) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + unsigned int hlen; + unsigned int off; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. + */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +int napi_gro_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return NET_RX_DROP; @@ -2713,7 +2744,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree_skb(napi->skb); + napi_free_frags(napi); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; @@ -2767,8 +2798,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. */ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); @@ -3438,6 +3471,252 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } +/* hw addresses list handling functions */ + +static int __hw_addr_add(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + list_add_tail_rcu(&ha->list, list); + return 0; +} + +static void ha_rcu_free(struct rcu_head *head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + int i = 0; + + list_for_each_entry(ha, list, list) { + if (i++ != ignore_index && + !memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_add_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, from_list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, + ignore_index); + } + return err; +} + +static void __hw_addr_del_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, + ignore_index); + } +} + +static void __hw_addr_flush(struct list_head *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } +} + +/* Device addresses handling functions */ + +static void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addr_list); + dev->dev_addr = NULL; +} + +static int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->dev_addr_list); + memset(addr, 0, sizeof(*addr)); + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addr_list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, + addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device by which addresses the addresses will be deleted + * @addr_type: address type - 0 means type will used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* unicast and multicast addresses handling functions */ + int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int glbl) { @@ -4327,39 +4606,6 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); -/* Some devices need to (re-)set their netdev_ops inside - * ->init() or similar. If that happens, we have to setup - * the compat pointers again. - */ -void netdev_resync_ops(struct net_device *dev) -{ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif -#endif -} -EXPORT_SYMBOL(netdev_resync_ops); - /** * register_netdevice - register a network device * @dev: device to register @@ -4399,23 +4645,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - /* Netdevice_ops API compatiability support. - * This is temporary until all network devices are converted. - */ - if (dev->netdev_ops) { - netdev_resync_ops(dev); - } else { - char drivername[64]; - pr_info("%s (%s): not using net_device_ops yet\n", - dev->name, netdev_drivername(dev, drivername, 64)); - - /* This works only because net_device_ops and the - compatiablity structure are the same. */ - dev->netdev_ops = (void *) &(dev->init); - } -#endif - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -4701,13 +4930,30 @@ void netdev_run_todo(void) * the internal statistics structure is used. */ const struct net_device_stats *dev_get_stats(struct net_device *dev) - { +{ const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else - return &dev->stats; + else { + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + struct net_device_stats *stats = &dev->stats; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } + return stats; + } } EXPORT_SYMBOL(dev_get_stats); @@ -4765,13 +5011,16 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (!tx) { printk(KERN_ERR "alloc_netdev: Unable to allocate " "tx qdiscs.\n"); - kfree(p); - return NULL; + goto free_p; } dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; + + if (dev_addr_init(dev)) + goto free_tx; + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -4783,9 +5032,17 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); return dev; + +free_tx: + kfree(tx); + +free_p: + kfree(p); + return NULL; } EXPORT_SYMBOL(alloc_netdev_mq); @@ -4805,6 +5062,9 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + /* Flush device addresses */ + dev_addr_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca99..defbbf093f08 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> -#include <trace/skb.h> +#include <trace/events/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,11 +50,18 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "NET_DM", - .version = 1, + .version = 2, .maxattr = NET_DM_CMD_MAX, }; @@ -59,19 +69,24 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; + struct nlattr *nla; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + data->skb = genlmsg_new(al, GFP_KERNEL); genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); - msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg)); + nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); + msg = nla_data(nla); memset(msg, 0, al); atomic_set(&data->dm_hit_count, dm_hit_limit); } @@ -111,10 +126,11 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; + struct nlattr *nla; int i; struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); @@ -127,7 +143,8 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) } nlh = (struct nlmsghdr *)data->skb->data; - msg = genlmsg_data(nlmsg_data(nlh)); + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { msg->points[i].count++; @@ -139,6 +156,7 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) * We need to create a new entry */ __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); msg->points[msg->entries].count = 1; msg->entries++; @@ -152,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -197,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -213,6 +325,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -236,12 +352,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -252,6 +374,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 98691e1466b8..bd309384f8b8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -299,7 +299,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } else if (rule->action == FR_ACT_GOTO) goto errout_free; - err = ops->configure(rule, skb, nlh, frh, tb); + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -500,7 +500,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->target) NLA_PUT_U32(skb, FRA_GOTO, rule->target); - if (ops->fill(rule, skb, nlh, frh) < 0) + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9cc9f95b109e..78e5bfc454ae 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -66,9 +66,9 @@ NOTES. - * The stored value for avbps is scaled by 2^5, so that maximal - rate is ~1Gbit, avpps is scaled by 2^10. - + * avbps is scaled by 2^5, avpps is scaled by 2^10. + * both values are reported as 32 bit unsigned values. bps can + overflow for fast links : max speed being 34360Mbit/sec * Minimal interval is HZ/4=250msec (it is the greatest common divisor for HZ=100 and HZ=1024 8)), maximal interval is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals @@ -86,9 +86,9 @@ struct gen_estimator spinlock_t *stats_lock; int ewma_log; u64 last_bytes; + u64 avbps; u32 last_packets; u32 avpps; - u32 avbps; struct rcu_head e_rcu; struct rb_node node; }; @@ -115,6 +115,7 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { u64 nbytes; + u64 brate; u32 npackets; u32 rate; @@ -125,14 +126,14 @@ static void est_timer(unsigned long arg) nbytes = e->bstats->bytes; npackets = e->bstats->packets; - rate = (nbytes - e->last_bytes)<<(7 - idx); + brate = (nbytes - e->last_bytes)<<(7 - idx); e->last_bytes = nbytes; - e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; rate = (npackets - e->last_packets)<<(12 - idx); e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: read_unlock(&est_lock); diff --git a/net/core/iovec.c b/net/core/iovec.c index 4c9c0121c9da..40a76ce19d9f 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -98,6 +98,31 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) } /* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + offset = 0; + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + } + + return 0; +} + +/* * Copy iovec to kernel. Returns -EFAULT on error. * * Note: this modifies the original iovec. @@ -122,10 +147,11 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) } /* - * For use with ip_build_xmit + * Copy iovec from kernel. Returns -EFAULT on error. */ -int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, - int len) + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) { /* Skip over the finished iovecs */ while (offset >= iov->iov_len) { @@ -236,3 +262,4 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(memcpy_fromiovec); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(memcpy_toiovec); +EXPORT_SYMBOL(memcpy_toiovecend); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 2da59a0ac4ac..3994680c08b9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -78,7 +78,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, goto err; if (!rtnl_trylock()) - return -ERESTARTSYS; + return restart_syscall(); if (dev_isalive(net)) { if ((ret = (*set)(net, new)) == 0) @@ -225,7 +225,8 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); ret = dev_set_alias(netdev, buf, count); rtnl_unlock(); @@ -238,7 +239,8 @@ static ssize_t show_ifalias(struct device *dev, const struct net_device *netdev = to_net_dev(dev); ssize_t ret = 0; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); if (netdev->ifalias) ret = sprintf(buf, "%s\n", netdev->ifalias); rtnl_unlock(); @@ -497,7 +499,6 @@ int netdev_register_kobject(struct net_device *net) dev->platform_data = net; dev->groups = groups; - BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4f..2785d7aa119e 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,11 +19,14 @@ #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> -#include <trace/skb.h> #include <asm/unaligned.h> #include <asm/bitops.h> +#define CREATE_TRACE_POINTS +#include <trace/events/skb.h> +#include <trace/napi.h> -DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e3bebd36f053..b7292a2719dc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -115,41 +115,34 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) +static struct net *net_create(void) { - struct net *new_net = NULL; - int err; - - get_net(old_net); - - if (!(flags & CLONE_NEWNET)) - return old_net; - - err = -ENOMEM; - new_net = net_alloc(); - if (!new_net) - goto out_err; + struct net *net; + int rv; + net = net_alloc(); + if (!net) + return ERR_PTR(-ENOMEM); mutex_lock(&net_mutex); - err = setup_net(new_net); - if (!err) { + rv = setup_net(net); + if (rv == 0) { rtnl_lock(); - list_add_tail(&new_net->list, &net_namespace_list); + list_add_tail(&net->list, &net_namespace_list); rtnl_unlock(); } mutex_unlock(&net_mutex); + if (rv < 0) { + net_free(net); + return ERR_PTR(rv); + } + return net; +} - if (err) - goto out_free; -out: - put_net(old_net); - return new_net; - -out_free: - net_free(new_net); -out_err: - new_net = ERR_PTR(err); - goto out; +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + return net_create(); } static void cleanup_net(struct work_struct *work) @@ -203,9 +196,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) static int __init net_ns_init(void) { struct net_generic *ng; - int err; - printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, @@ -224,15 +215,14 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); mutex_lock(&net_mutex); - err = setup_net(&init_net); + if (setup_net(&init_net)) + panic("Could not setup the initial network namespace"); rtnl_lock(); list_add_tail(&init_net.list, &net_namespace_list); rtnl_unlock(); mutex_unlock(&net_mutex); - if (err) - panic("Could not setup the initial network namespace"); return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bdff612..7ab31a7576a1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); @@ -175,9 +177,13 @@ static void service_arp_queue(struct netpoll_info *npi) void netpoll_poll(struct netpoll *np) { struct net_device *dev = np->dev; - const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_ops *ops; + + if (!dev || !netif_running(dev)) + return; - if (!dev || !netif_running(dev) || !ops->ndo_poll_controller) + ops = dev->netdev_ops; + if (!ops->ndo_poll_controller) return; /* Process pending work on NIC */ @@ -296,8 +302,11 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { - if (!netif_tx_queue_stopped(txq)) + if (!netif_tx_queue_stopped(txq)) { status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + } __netif_tx_unlock(txq); if (status == NETDEV_TX_OK) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 3779c1438c11..b8ccd3c88d63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2447,7 +2447,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i = 0; - for (; i < pkt_dev->nflows; i++){ + for (; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); @@ -3438,6 +3438,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) retry_now: ret = (*xmit)(pkt_dev->skb, odev); if (likely(ret == NETDEV_TX_OK)) { + txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f71..5cb51b2ffe87 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,7 +65,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -#include <trace/skb.h> +#include <trace/events/skb.h> #include "kmap_skb.h" @@ -502,7 +502,9 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size) shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); memset(skb, 0, offsetof(struct sk_buff, tail)); skb->data = skb->head + NET_SKB_PAD; @@ -524,8 +526,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->sp = secpath_get(old->sp); #endif memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum_start = old->csum_start; - new->csum_offset = old->csum_offset; + new->csum = old->csum; new->local_df = old->local_df; new->pkt_type = old->pkt_type; new->ip_summed = old->ip_summed; @@ -536,6 +537,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->protocol = old->protocol; new->mark = old->mark; + new->iif = old->iif; __nf_copy(new, old); #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) @@ -548,10 +550,18 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #endif new->vlan_tci = old->vlan_tci; +#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) + new->do_not_encrypt = old->do_not_encrypt; + new->requeue = old->requeue; +#endif skb_copy_secmark(new, old); } +/* + * You should not add any new code to this function. Add it to + * __copy_skb_header above instead. + */ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x @@ -567,16 +577,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->destructor = NULL; - C(iif); C(tail); C(end); C(head); C(data); C(truesize); -#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) - C(do_not_encrypt); - C(requeue); -#endif atomic_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); @@ -1365,9 +1370,8 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) static inline struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, - struct sk_buff *skb) + struct sk_buff *skb, struct sock *sk) { - struct sock *sk = skb->sk; struct page *p = sk->sk_sndmsg_page; unsigned int off; @@ -1405,13 +1409,14 @@ new_page: */ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, unsigned int *len, unsigned int offset, - struct sk_buff *skb, int linear) + struct sk_buff *skb, int linear, + struct sock *sk) { if (unlikely(spd->nr_pages == PIPE_BUFFERS)) return 1; if (linear) { - page = linear_to_page(page, len, &offset, skb); + page = linear_to_page(page, len, &offset, skb, sk); if (!page) return 1; } else @@ -1442,7 +1447,8 @@ static inline void __segment_seek(struct page **page, unsigned int *poff, static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, - struct splice_pipe_desc *spd, int linear) + struct splice_pipe_desc *spd, int linear, + struct sock *sk) { if (!*len) return 1; @@ -1465,7 +1471,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear)) + if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1481,8 +1487,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * pipe is full or if we already spliced the requested length. */ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, - struct splice_pipe_desc *spd) + unsigned int *len, struct splice_pipe_desc *spd, + struct sock *sk) { int seg; @@ -1492,7 +1498,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1)) + offset, len, skb, spd, 1, sk)) return 1; /* @@ -1502,7 +1508,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0)) + offset, len, skb, spd, 0, sk)) return 1; } @@ -1528,12 +1534,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, }; + struct sock *sk = skb->sk; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. */ - if (__skb_splice_bits(skb, &offset, &tlen, &spd)) + if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) goto done; else if (!tlen) goto done; @@ -1545,14 +1552,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct sk_buff *list = skb_shinfo(skb)->frag_list; for (; list && tlen; list = list->next) { - if (__skb_splice_bits(list, &offset, &tlen, &spd)) + if (__skb_splice_bits(list, &offset, &tlen, &spd, sk)) break; } } done: if (spd.nr_pages) { - struct sock *sk = skb->sk; int ret; /* @@ -2285,7 +2291,7 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; - if (abs_offset < block_limit) { + if (abs_offset < block_limit && !st->frag_data) { *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } @@ -2658,30 +2664,40 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff *p = *head; struct sk_buff *nskb; + struct skb_shared_info *skbinfo = skb_shinfo(skb); + struct skb_shared_info *pinfo = skb_shinfo(p); unsigned int headroom; unsigned int len = skb_gro_len(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); if (p->len + len >= 65536) return -E2BIG; - if (skb_shinfo(p)->frag_list) + if (pinfo->frag_list) goto merge; - else if (skb_headlen(skb) <= skb_gro_offset(skb)) { - if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > - MAX_SKB_FRAGS) + else if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + offset -= headlen; + + if (nr_frags > MAX_SKB_FRAGS) return -E2BIG; - skb_shinfo(skb)->frags[0].page_offset += - skb_gro_offset(skb) - skb_headlen(skb); - skb_shinfo(skb)->frags[0].size -= - skb_gro_offset(skb) - skb_headlen(skb); + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; - memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, - skb_shinfo(skb)->frags, - skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); - skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; + frag->page_offset += offset; + frag->size -= offset; skb->truesize -= skb->data_len; skb->len -= skb->data_len; @@ -2712,7 +2728,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size; + skb_shinfo(nskb)->gso_size = pinfo->gso_size; skb_header_release(p); nskb->prev = p; @@ -2727,16 +2743,13 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: - if (skb_gro_offset(skb) > skb_headlen(skb)) { - skb_shinfo(skb)->frags[0].page_offset += - skb_gro_offset(skb) - skb_headlen(skb); - skb_shinfo(skb)->frags[0].size -= - skb_gro_offset(skb) - skb_headlen(skb); - skb_gro_reset_offset(skb); - skb_gro_pull(skb, skb_headlen(skb)); + if (offset > headlen) { + skbinfo->frags[0].page_offset += offset - headlen; + skbinfo->frags[0].size -= offset - headlen; + offset = headlen; } - __skb_pull(skb, skb_gro_offset(skb)); + __skb_pull(skb, offset); p->prev->next = skb; p->prev = skb; diff --git a/net/core/stream.c b/net/core/stream.c index 8727cead64ad..a37debfeb1b2 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -33,7 +33,8 @@ void sk_stream_write_space(struct sock *sk) clear_bit(SOCK_NOSPACE, &sock->flags); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | + POLLWRNORM | POLLWRBAND); if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); } |