Diffstat (limited to 'net/core')
 net/core/datagram.c      | 130
 net/core/dev.c           | 514
 net/core/drop_monitor.c  | 139
 net/core/fib_rules.c     |   4
 net/core/gen_estimator.c |  15
 net/core/iovec.c         |  33
 net/core/net-sysfs.c     |   9
 net/core/net-traces.c    |   7
 net/core/net_namespace.c |  54
 net/core/netpoll.c       |  15
 net/core/pktgen.c        |   3
 net/core/skbuff.c        | 101
 net/core/stream.c        |   3
 13 files changed, 785 insertions(+), 242 deletions(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d0de644b378d..e2a36f05cdf7 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
+static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ unsigned long bits = (unsigned long)key;
+
+ /*
+ * Avoid a wakeup if the event is not interesting to us
+ */
+ if (bits && !(bits & (POLLIN | POLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
/*
* Wait for a packet..
*/
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
int error;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, receiver_wake_function);
prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
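
The wake function installed above filters wakeups by the poll key: the sleeper is only woken when the waker's event mask intersects POLLIN | POLLERR (a NULL key still wakes unconditionally, preserving plain wake_up() behaviour). The key is supplied by wakers that use the *_poll() wakeup variants, as the stream.c hunk at the end of this diff does for write space. A minimal sketch of the matching writer-side pattern, assuming the same autoremove semantics (sender_wake_function is illustrative, not part of this patch):

    static int sender_wake_function(wait_queue_t *wait, unsigned mode, int sync,
                                    void *key)
    {
            unsigned long bits = (unsigned long)key;

            /* a writer waiting for buffer space only cares about output events */
            if (bits && !(bits & (POLLOUT | POLLERR)))
                    return 0;
            return autoremove_wake_function(wait, mode, sync, key);
    }

    /* the waker passes its event mask as the key: */
    wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | POLLWRNORM | POLLWRBAND);
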
@@ -248,7 +260,9 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
spin_unlock_bh(&sk->sk_receive_queue.lock);
}
- skb_free_datagram(sk, skb);
+ kfree_skb(skb);
+ sk_mem_reclaim_partial(sk);
+
return err;
}
@@ -339,17 +353,111 @@ fault:
}
/**
+ * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @to_offset: offset in the io vector to start copying to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Returns 0 or -EFAULT.
+ * Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
+ const struct iovec *to, int to_offset,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ int err;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_toiovecend(to, vaddr + frag->page_offset +
+ offset - start, to_offset, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_const_iovec(list,
+ offset - start,
+ to, to_offset,
+ copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
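
A minimal sketch of a caller for the new helper - copying a datagram's payload into a user iovec that has already received hdr_len bytes, without consuming the iovec (example_recv and hdr_len are illustrative):

    static int example_recv(struct sk_buff *skb, struct msghdr *msg,
                            int hdr_len, int len)
    {
            /* payload lands hdr_len bytes into the unmodified iovec */
            return skb_copy_datagram_const_iovec(skb, 0, msg->msg_iov,
                                                 hdr_len, len);
    }
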
+
+/**
* skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
* @from: io vector to copy from
+ * @from_offset: offset in the io vector to start copying from
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
- * Note: the iovec is modified during the copy.
+ * Note: the iovec is not modified during the copy.
*/
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
- struct iovec *from, int len)
+ const struct iovec *from, int from_offset,
+ int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -358,11 +466,12 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_fromiovec(skb->data + offset, from, copy))
+ if (memcpy_fromiovecend(skb->data + offset, from, from_offset, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
+ from_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -381,8 +490,9 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
vaddr = kmap(page);
- err = memcpy_fromiovec(vaddr + frag->page_offset +
- offset - start, from, copy);
+ err = memcpy_fromiovecend(vaddr + frag->page_offset +
+ offset - start,
+ from, from_offset, copy);
kunmap(page);
if (err)
goto fault;
@@ -390,6 +500,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (!(len -= copy))
return 0;
offset += copy;
+ from_offset += copy;
}
start = end;
}
@@ -408,11 +519,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
copy = len;
if (skb_copy_datagram_from_iovec(list,
offset - start,
- from, copy))
+ from,
+ from_offset,
+ copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
+ from_offset += copy;
}
start = end;
}
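
The new from_offset parameter lets tun/vhost-style transmit paths skip a device header at the front of the user iovec without advancing or cloning the iovec. A hedged sketch (iov, hdr_len and len are illustrative):

    /* build the skb from user data that follows a hdr_len-byte header */
    if (skb_copy_datagram_from_iovec(skb, 0, iov, hdr_len, len)) {
            kfree_skb(skb);
            return -EFAULT;
    }
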
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f65ea7..ed4550fd9ece 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -1336,7 +1337,12 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
+#ifdef CONFIG_NET_CLS_ACT
+ if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
+ net_timestamp(skb);
+#else
net_timestamp(skb);
+#endif
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1683,7 +1689,17 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
goto gso;
}
+ /*
+ * If the device doesn't need skb->dst, release it right now while
+ * it's hot in this CPU's cache
+ */
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) {
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ }
rc = ops->ndo_start_xmit(skb, dev);
+ if (rc == 0)
+ txq_trans_update(txq);
/*
* TODO: if skb_orphan() was called by
* dev->hard_start_xmit() (for example, the unmodified
@@ -1713,6 +1729,7 @@ gso:
skb->next = nskb;
return rc;
}
+ txq_trans_update(txq);
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -1732,9 +1749,14 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- } else if (skb->sk && skb->sk->sk_hash) {
+ while (unlikely(hash >= dev->real_num_tx_queues))
+ hash -= dev->real_num_tx_queues;
+ return hash;
+ }
+
+ if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
- } else
+ else
hash = skb->protocol;
hash = jhash_1word(hash, skb_tx_hashrnd);
@@ -2368,26 +2390,6 @@ void napi_gro_flush(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_gro_flush);
-void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
-{
- unsigned int offset = skb_gro_offset(skb);
-
- hlen += offset;
- if (hlen <= skb_headlen(skb))
- return skb->data + offset;
-
- if (unlikely(!skb_shinfo(skb)->nr_frags ||
- skb_shinfo(skb)->frags[0].size <=
- hlen - skb_headlen(skb) ||
- PageHighMem(skb_shinfo(skb)->frags[0].page)))
- return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
-
- return page_address(skb_shinfo(skb)->frags[0].page) +
- skb_shinfo(skb)->frags[0].page_offset +
- offset - skb_headlen(skb);
-}
-EXPORT_SYMBOL(skb_gro_header);
-
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
@@ -2450,10 +2452,25 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
ret = GRO_HELD;
pull:
- if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
- if (napi->gro_list == skb)
- napi->gro_list = skb->next;
- ret = GRO_DROP;
+ if (skb_headlen(skb) < skb_gro_offset(skb)) {
+ int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->tail += grow;
+ skb->data_len -= grow;
+
+ skb_shinfo(skb)->frags[0].page_offset += grow;
+ skb_shinfo(skb)->frags[0].size -= grow;
+
+ if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
+ put_page(skb_shinfo(skb)->frags[0].page);
+ memmove(skb_shinfo(skb)->frags,
+ skb_shinfo(skb)->frags + 1,
+ --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+ }
}
ok:
@@ -2503,6 +2520,22 @@ int napi_skb_finish(int ret, struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_skb_finish);
+void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb->mac_header == skb->tail &&
+ !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
+ NAPI_GRO_CB(skb)->frag0 =
+ page_address(skb_shinfo(skb)->frags[0].page) +
+ skb_shinfo(skb)->frags[0].page_offset;
+ NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
+ }
+}
+EXPORT_SYMBOL(skb_gro_reset_offset);
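
With frag0 recorded here, protocol gro_receive handlers can read headers straight out of the first page fragment and only fall back to pulling data into the linear area when the fast path does not cover the requested length. A sketch of the access pattern for an IPv4 handler (the skb_gro_header_* helpers are the real API; the surrounding code is illustrative):

    struct iphdr *iph;
    unsigned int off = skb_gro_offset(skb);
    unsigned int hlen = off + sizeof(*iph);

    iph = skb_gro_header_fast(skb, off);    /* points into frag0 when valid */
    if (skb_gro_header_hard(skb, hlen)) {   /* frag0 shorter than hlen? */
            iph = skb_gro_header_slow(skb, hlen, off);
            if (unlikely(!iph))
                    goto out;
    }
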
+
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_gro_reset_offset(skb);
@@ -2520,16 +2553,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_reuse_skb);
-struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
- struct napi_gro_fraginfo *info)
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct net_device *dev = napi->dev;
struct sk_buff *skb = napi->skb;
- struct ethhdr *eth;
- skb_frag_t *frag;
- int i;
-
- napi->skb = NULL;
if (!skb) {
skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
@@ -2537,47 +2564,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
goto out;
skb_reserve(skb, NET_IP_ALIGN);
- }
-
- BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
- frag = &info->frags[info->nr_frags - 1];
-
- for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
- skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
- frag->size);
- frag++;
- }
- skb_shinfo(skb)->nr_frags = info->nr_frags;
-
- skb->data_len = info->len;
- skb->len += info->len;
- skb->truesize += info->len;
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
-
- eth = skb_gro_header(skb, sizeof(*eth));
- if (!eth) {
- napi_reuse_skb(napi, skb);
- skb = NULL;
- goto out;
+ napi->skb = skb;
}
- skb_gro_pull(skb, sizeof(*eth));
-
- /*
- * This works because the only protocols we care about don't require
- * special handling. We'll fix it up properly at the end.
- */
- skb->protocol = eth->h_proto;
-
- skb->ip_summed = info->ip_summed;
- skb->csum = info->csum;
-
out:
return skb;
}
-EXPORT_SYMBOL(napi_fraginfo_skb);
+EXPORT_SYMBOL(napi_get_frags);
int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
{
@@ -2607,9 +2601,46 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
}
EXPORT_SYMBOL(napi_frags_finish);
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
- struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+ struct sk_buff *skb = napi->skb;
+ struct ethhdr *eth;
+ unsigned int hlen;
+ unsigned int off;
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*eth);
+ eth = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ eth = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!eth)) {
+ napi_reuse_skb(napi, skb);
+ skb = NULL;
+ goto out;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(*eth));
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling. We'll fix it up properly at the end.
+ */
+ skb->protocol = eth->h_proto;
+
+out:
+ return skb;
+}
+EXPORT_SYMBOL(napi_frags_skb);
+
+int napi_gro_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
return NET_RX_DROP;
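
Under the new API a driver obtains the skb first, attaches its page fragments, then hands it back, instead of filling in a napi_gro_fraginfo. A hedged sketch of such a receive path (page, off and len come from the device's rx ring and are illustrative):

    skb = napi_get_frags(napi);
    if (unlikely(!skb))
            goto drop;

    skb_fill_page_desc(skb, 0, page, off, len);
    skb->len += len;
    skb->data_len += len;
    skb->truesize += len;

    napi_gro_frags(napi);   /* consumes napi->skb */
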
@@ -2713,7 +2744,7 @@ void netif_napi_del(struct napi_struct *napi)
struct sk_buff *skb, *next;
list_del_init(&napi->dev_list);
- kfree_skb(napi->skb);
+ napi_free_frags(napi);
for (skb = napi->gro_list; skb; skb = next) {
next = skb->next;
@@ -2767,8 +2798,10 @@ static void net_rx_action(struct softirq_action *h)
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
@@ -3438,6 +3471,252 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len, unsigned char addr_type)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+ struct netdev_hw_addr *ha;
+
+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
+ kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len) &&
+ (ha->type == addr_type || !addr_type)) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ int err;
+ struct netdev_hw_addr *ha, *ha2;
+ unsigned char type;
+
+ list_for_each_entry(ha, from_list, list) {
+ type = addr_type ? addr_type : ha->type;
+ err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+ if (err)
+ goto unroll;
+ }
+ return 0;
+
+unroll:
+ list_for_each_entry(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ type = addr_type ? addr_type : ha2->type;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+ ignore_index);
+ }
+ return err;
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ unsigned char type;
+
+ list_for_each_entry(ha, from_list, list) {
+ type = addr_type ? addr_type : ha->type;
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, type,
+ ignore_index);
+ }
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+ addr_type, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ * @addr_type: address type - 0 means type will be used from from_dev
+ *
+ * Add the device addresses of one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, addr_type, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device supplying the addresses to be deleted
+ * @addr_type: address type - 0 means type will be used from from_dev
+ *
+ * Deletes the addresses in @to_dev that appear in @from_dev's address list.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev,
+ unsigned char addr_type)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, addr_type, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
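
A short sketch of the new address-list API from a caller's point of view - adding and later removing a secondary hardware address, all under the RTNL (secondary_mac is illustrative):

    rtnl_lock();
    err = dev_addr_add(dev, secondary_mac, NETDEV_HW_ADDR_T_LAN);
    if (!err) {
            /* ... use the address ... */
            err = dev_addr_del(dev, secondary_mac, NETDEV_HW_ADDR_T_LAN);
    }
    rtnl_unlock();
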
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4327,39 +4606,6 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
-/* Some devices need to (re-)set their netdev_ops inside
- * ->init() or similar. If that happens, we have to setup
- * the compat pointers again.
- */
-void netdev_resync_ops(struct net_device *dev)
-{
-#ifdef CONFIG_COMPAT_NET_DEV_OPS
- const struct net_device_ops *ops = dev->netdev_ops;
-
- dev->init = ops->ndo_init;
- dev->uninit = ops->ndo_uninit;
- dev->open = ops->ndo_open;
- dev->change_rx_flags = ops->ndo_change_rx_flags;
- dev->set_rx_mode = ops->ndo_set_rx_mode;
- dev->set_multicast_list = ops->ndo_set_multicast_list;
- dev->set_mac_address = ops->ndo_set_mac_address;
- dev->validate_addr = ops->ndo_validate_addr;
- dev->do_ioctl = ops->ndo_do_ioctl;
- dev->set_config = ops->ndo_set_config;
- dev->change_mtu = ops->ndo_change_mtu;
- dev->neigh_setup = ops->ndo_neigh_setup;
- dev->tx_timeout = ops->ndo_tx_timeout;
- dev->get_stats = ops->ndo_get_stats;
- dev->vlan_rx_register = ops->ndo_vlan_rx_register;
- dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
- dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
-#ifdef CONFIG_NET_POLL_CONTROLLER
- dev->poll_controller = ops->ndo_poll_controller;
-#endif
-#endif
-}
-EXPORT_SYMBOL(netdev_resync_ops);
-
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -4399,23 +4645,6 @@ int register_netdevice(struct net_device *dev)
dev->iflink = -1;
-#ifdef CONFIG_COMPAT_NET_DEV_OPS
- /* Netdevice_ops API compatiability support.
- * This is temporary until all network devices are converted.
- */
- if (dev->netdev_ops) {
- netdev_resync_ops(dev);
- } else {
- char drivername[64];
- pr_info("%s (%s): not using net_device_ops yet\n",
- dev->name, netdev_drivername(dev, drivername, 64));
-
- /* This works only because net_device_ops and the
- compatiablity structure are the same. */
- dev->netdev_ops = (void *) &(dev->init);
- }
-#endif
-
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
@@ -4701,13 +4930,30 @@ void netdev_run_todo(void)
* the internal statistics structure is used.
*/
const struct net_device_stats *dev_get_stats(struct net_device *dev)
- {
+{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_get_stats)
return ops->ndo_get_stats(dev);
- else
- return &dev->stats;
+ else {
+ unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+ struct net_device_stats *stats = &dev->stats;
+ unsigned int i;
+ struct netdev_queue *txq;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ txq = netdev_get_tx_queue(dev, i);
+ tx_bytes += txq->tx_bytes;
+ tx_packets += txq->tx_packets;
+ tx_dropped += txq->tx_dropped;
+ }
+ if (tx_bytes || tx_packets || tx_dropped) {
+ stats->tx_bytes = tx_bytes;
+ stats->tx_packets = tx_packets;
+ stats->tx_dropped = tx_dropped;
+ }
+ return stats;
+ }
}
EXPORT_SYMBOL(dev_get_stats);
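
With this change a multiqueue driver that lacks ndo_get_stats can account tx traffic per queue and let the core sum it. A hedged sketch of such an xmit routine (example_xmit is illustrative):

    static int example_xmit(struct sk_buff *skb, struct net_device *dev)
    {
            struct netdev_queue *txq =
                    netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

            /* ... hand the packet to hardware ... */

            txq->tx_packets++;
            txq->tx_bytes += skb->len;
            return NETDEV_TX_OK;
    }
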
@@ -4765,13 +5011,16 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
if (!tx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
"tx qdiscs.\n");
- kfree(p);
- return NULL;
+ goto free_p;
}
dev = (struct net_device *)
(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
dev->padded = (char *)dev - (char *)p;
+
+ if (dev_addr_init(dev))
+ goto free_tx;
+
dev_net_set(dev, &init_net);
dev->_tx = tx;
@@ -4783,9 +5032,17 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
netdev_init_queues(dev);
INIT_LIST_HEAD(&dev->napi_list);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
strcpy(dev->name, name);
return dev;
+
+free_tx:
+ kfree(tx);
+
+free_p:
+ kfree(p);
+ return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);
@@ -4805,6 +5062,9 @@ void free_netdev(struct net_device *dev)
kfree(dev->_tx);
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9fd0dc3cca99..defbbf093f08 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static DEFINE_SPINLOCK(trace_state_lock);
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,11 +50,18 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
.name = "NET_DM",
- .version = 1,
+ .version = 2,
.maxattr = NET_DM_CMD_MAX,
};
@@ -59,19 +69,24 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
size_t al;
struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
data->skb = genlmsg_new(al, GFP_KERNEL);
genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
0, NET_DM_CMD_ALERT);
- msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg));
+ nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg));
+ msg = nla_data(nla);
memset(msg, 0, al);
atomic_set(&data->dm_hit_count, dm_hit_limit);
}
@@ -111,10 +126,11 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
+ struct nlattr *nla;
int i;
struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
@@ -127,7 +143,8 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
}
nlh = (struct nlmsghdr *)data->skb->data;
- msg = genlmsg_data(nlmsg_data(nlh));
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
for (i = 0; i < msg->entries; i++) {
if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
msg->points[i].count++;
@@ -139,6 +156,7 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
* We need to create a new entry
*/
__nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
msg->points[msg->entries].count = 1;
msg->entries++;
@@ -152,24 +170,80 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ if (!rc)
+ trace_state = state;
+
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
return rc;
@@ -197,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ spin_lock(&trace_state_lock);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ break;
+ }
+ }
+ }
+ spin_unlock(&trace_state_lock);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
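
The genl family version bump to 2 above reflects the framing change in reset_per_cpu_data(): the net_dm_alert_msg is now wrapped in an NLA_UNSPEC attribute instead of being written header-less. A sketch of unwrapping an alert under the new framing, mirroring the kernel-side access in trace_drop_common() (handle_alert and process_point are illustrative):

    static void handle_alert(struct nlmsghdr *nlh)
    {
            struct nlattr *nla = genlmsg_data(nlmsg_data(nlh));
            struct net_dm_alert_msg *msg = nla_data(nla);
            int i;

            for (i = 0; i < msg->entries; i++)
                    /* points[i].pc / .count describe one drop location */
                    process_point(&msg->points[i]);
    }
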
static struct genl_ops dropmon_ops[] = {
{
@@ -213,6 +325,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -236,12 +352,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -252,6 +374,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98691e1466b8..bd309384f8b8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -299,7 +299,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
- err = ops->configure(rule, skb, nlh, frh, tb);
+ err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
@@ -500,7 +500,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->target)
NLA_PUT_U32(skb, FRA_GOTO, rule->target);
- if (ops->fill(rule, skb, nlh, frh) < 0)
+ if (ops->fill(rule, skb, frh) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9cc9f95b109e..78e5bfc454ae 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -66,9 +66,9 @@
NOTES.
- * The stored value for avbps is scaled by 2^5, so that maximal
- rate is ~1Gbit, avpps is scaled by 2^10.
-
+ * avbps is scaled by 2^5, avpps is scaled by 2^10.
+ * both values are reported as 32 bit unsigned values. bps can
overflow for fast links: max speed being 34360 Mbit/sec
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
for HZ=100 and HZ=1024 8)), maximal interval
is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
@@ -86,9 +86,9 @@ struct gen_estimator
spinlock_t *stats_lock;
int ewma_log;
u64 last_bytes;
+ u64 avbps;
u32 last_packets;
u32 avpps;
- u32 avbps;
struct rcu_head e_rcu;
struct rb_node node;
};
@@ -115,6 +115,7 @@ static void est_timer(unsigned long arg)
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
u64 nbytes;
+ u64 brate;
u32 npackets;
u32 rate;
@@ -125,14 +126,14 @@ static void est_timer(unsigned long arg)
nbytes = e->bstats->bytes;
npackets = e->bstats->packets;
- rate = (nbytes - e->last_bytes)<<(7 - idx);
+ brate = (nbytes - e->last_bytes)<<(7 - idx);
e->last_bytes = nbytes;
- e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
+ e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
rate = (npackets - e->last_packets)<<(12 - idx);
e->last_packets = npackets;
- e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
+ e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps+0x1FF)>>10;
skip:
read_unlock(&est_lock);
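
The rewritten update step deserves a note: the old code computed avg += ((long)rate - (long)avg) >> ewma_log, which truncates the byte rate to 32 bits and goes negative past ~2^31; the new form stays in unsigned 64-bit arithmetic. Up to rounding, both compute avg <- avg + (rate - avg) / 2^ewma_log. The step in isolation:

    static u64 ewma_step(u64 avg, u64 rate, unsigned int ewma_log)
    {
            /* avg + (rate - avg)/2^ewma_log, without signed overflow */
            return avg + (rate >> ewma_log) - (avg >> ewma_log);
    }
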
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 4c9c0121c9da..40a76ce19d9f 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -98,6 +98,31 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
}
/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+ int offset, int len)
+{
+ int copy;
+ for (; len > 0; ++iov) {
+ /* Skip over the finished iovecs */
+ if (unlikely(offset >= iov->iov_len)) {
+ offset -= iov->iov_len;
+ continue;
+ }
+ copy = min_t(unsigned int, iov->iov_len - offset, len);
+ if (copy_to_user(iov->iov_base + offset, kdata, copy))
+ return -EFAULT;
+ offset = 0;
+ kdata += copy;
+ len -= copy;
+ }
+
+ return 0;
+}
+
+/*
* Copy iovec to kernel. Returns -EFAULT on error.
*
* Note: this modifies the original iovec.
@@ -122,10 +147,11 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
}
/*
- * For use with ip_build_xmit
+ * Copy iovec to kernel, starting at an offset. Returns -EFAULT on error.
*/
-int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
- int len)
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+ int offset, int len)
{
/* Skip over the finished iovecs */
while (offset >= iov->iov_len) {
@@ -236,3 +262,4 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
EXPORT_SYMBOL(memcpy_fromiovec);
EXPORT_SYMBOL(memcpy_fromiovecend);
EXPORT_SYMBOL(memcpy_toiovec);
+EXPORT_SYMBOL(memcpy_toiovecend);
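
A sketch of the intended use of memcpy_toiovecend - writing a fixed header into a user iovec and placing payload after it, leaving the iovec untouched for the caller (hdr and len are illustrative):

    if (memcpy_toiovecend(iov, (unsigned char *)&hdr, 0, sizeof(hdr)))
            return -EFAULT;
    if (skb_copy_datagram_const_iovec(skb, 0, iov, sizeof(hdr), len))
            return -EFAULT;
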
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2da59a0ac4ac..3994680c08b9 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -78,7 +78,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
goto err;
if (!rtnl_trylock())
- return -ERESTARTSYS;
+ return restart_syscall();
if (dev_isalive(net)) {
if ((ret = (*set)(net, new)) == 0)
@@ -225,7 +225,8 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return restart_syscall();
ret = dev_set_alias(netdev, buf, count);
rtnl_unlock();
@@ -238,7 +239,8 @@ static ssize_t show_ifalias(struct device *dev,
const struct net_device *netdev = to_net_dev(dev);
ssize_t ret = 0;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return restart_syscall();
if (netdev->ifalias)
ret = sprintf(buf, "%s\n", netdev->ifalias);
rtnl_unlock();
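
All three handlers now follow the same pattern: a sysfs handler must not block on the RTNL, because a task holding the RTNL may itself be blocked removing this very sysfs entry; trying the lock and restarting the syscall breaks the deadlock. The pattern in isolation (example_store is illustrative):

    static ssize_t example_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t len)
    {
            if (!rtnl_trylock())
                    return restart_syscall();  /* syscall is retried transparently */

            /* ... modify the device under RTNL ... */

            rtnl_unlock();
            return len;
    }
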
@@ -497,7 +499,6 @@ int netdev_register_kobject(struct net_device *net)
dev->platform_data = net;
dev->groups = groups;
- BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
dev_set_name(dev, "%s", net->name);
#ifdef CONFIG_SYSFS
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb45665e4f..2785d7aa119e 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -19,11 +19,14 @@
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
-#include <trace/skb.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/napi.h>
-DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e3bebd36f053..b7292a2719dc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -115,41 +115,34 @@ static void net_free(struct net *net)
kmem_cache_free(net_cachep, net);
}
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+static struct net *net_create(void)
{
- struct net *new_net = NULL;
- int err;
-
- get_net(old_net);
-
- if (!(flags & CLONE_NEWNET))
- return old_net;
-
- err = -ENOMEM;
- new_net = net_alloc();
- if (!new_net)
- goto out_err;
+ struct net *net;
+ int rv;
+ net = net_alloc();
+ if (!net)
+ return ERR_PTR(-ENOMEM);
mutex_lock(&net_mutex);
- err = setup_net(new_net);
- if (!err) {
+ rv = setup_net(net);
+ if (rv == 0) {
rtnl_lock();
- list_add_tail(&new_net->list, &net_namespace_list);
+ list_add_tail(&net->list, &net_namespace_list);
rtnl_unlock();
}
mutex_unlock(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
- if (err)
- goto out_free;
-out:
- put_net(old_net);
- return new_net;
-
-out_free:
- net_free(new_net);
-out_err:
- new_net = ERR_PTR(err);
- goto out;
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+ return net_create();
}
static void cleanup_net(struct work_struct *work)
@@ -203,9 +196,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
static int __init net_ns_init(void)
{
struct net_generic *ng;
- int err;
- printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
@@ -224,15 +215,14 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
mutex_lock(&net_mutex);
- err = setup_net(&init_net);
+ if (setup_net(&init_net))
+ panic("Could not setup the initial network namespace");
rtnl_lock();
list_add_tail(&init_net.list, &net_namespace_list);
rtnl_unlock();
mutex_unlock(&net_mutex);
- if (err)
- panic("Could not setup the initial network namespace");
return 0;
}
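
After the refactor copy_net_ns() has a simple contract: without CLONE_NEWNET it returns a new reference on the old namespace; otherwise it returns a fresh namespace or an ERR_PTR, with no reference juggling left to the caller. A sketch of a call site under that contract:

    struct net *net;

    net = copy_net_ns(flags, current->nsproxy->net_ns);
    if (IS_ERR(net))
            return PTR_ERR(net);
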
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bdff612..7ab31a7576a1 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
@@ -175,9 +177,13 @@ static void service_arp_queue(struct netpoll_info *npi)
void netpoll_poll(struct netpoll *np)
{
struct net_device *dev = np->dev;
- const struct net_device_ops *ops = dev->netdev_ops;
+ const struct net_device_ops *ops;
+
+ if (!dev || !netif_running(dev))
+ return;
- if (!dev || !netif_running(dev) || !ops->ndo_poll_controller)
+ ops = dev->netdev_ops;
+ if (!ops->ndo_poll_controller)
return;
/* Process pending work on NIC */
@@ -296,8 +302,11 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
if (__netif_tx_trylock(txq)) {
- if (!netif_tx_queue_stopped(txq))
+ if (!netif_tx_queue_stopped(txq)) {
status = ops->ndo_start_xmit(skb, dev);
+ if (status == NETDEV_TX_OK)
+ txq_trans_update(txq);
+ }
__netif_tx_unlock(txq);
if (status == NETDEV_TX_OK)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3779c1438c11..b8ccd3c88d63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2447,7 +2447,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev)
if (pkt_dev->cflows) {
/* let go of the SAs if we have them */
int i = 0;
- for (; i < pkt_dev->nflows; i++){
+ for (; i < pkt_dev->cflows; i++) {
struct xfrm_state *x = pkt_dev->flows[i].x;
if (x) {
xfrm_state_put(x);
@@ -3438,6 +3438,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
retry_now:
ret = (*xmit)(pkt_dev->skb, odev);
if (likely(ret == NETDEV_TX_OK)) {
+ txq_trans_update(txq);
pkt_dev->last_ok = 1;
pkt_dev->sofar++;
pkt_dev->seq_num++;
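
The netpoll and pktgen hunks mirror the dev.c change: after a successful ndo_start_xmit() the caller stamps the queue so the tx watchdog can detect per-queue stalls. For reference, the helper these call sites assume is, roughly (a sketch of its contemporaneous definition, not part of this diff):

    static inline void txq_trans_update(struct netdev_queue *txq)
    {
            if (txq->xmit_lock_owner != -1)
                    txq->trans_start = jiffies;
    }
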
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce6356cd9f71..5cb51b2ffe87 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,7 +65,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
#include "kmap_skb.h"
@@ -502,7 +502,9 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
shinfo->gso_segs = 0;
shinfo->gso_type = 0;
shinfo->ip6_frag_id = 0;
+ shinfo->tx_flags.flags = 0;
shinfo->frag_list = NULL;
+ memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->data = skb->head + NET_SKB_PAD;
@@ -524,8 +526,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->sp = secpath_get(old->sp);
#endif
memcpy(new->cb, old->cb, sizeof(old->cb));
- new->csum_start = old->csum_start;
- new->csum_offset = old->csum_offset;
+ new->csum = old->csum;
new->local_df = old->local_df;
new->pkt_type = old->pkt_type;
new->ip_summed = old->ip_summed;
@@ -536,6 +537,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->protocol = old->protocol;
new->mark = old->mark;
+ new->iif = old->iif;
__nf_copy(new, old);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
@@ -548,10 +550,18 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
#endif
new->vlan_tci = old->vlan_tci;
+#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
+ new->do_not_encrypt = old->do_not_encrypt;
+ new->requeue = old->requeue;
+#endif
skb_copy_secmark(new, old);
}
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x
@@ -567,16 +577,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->cloned = 1;
n->nohdr = 0;
n->destructor = NULL;
- C(iif);
C(tail);
C(end);
C(head);
C(data);
C(truesize);
-#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
- C(do_not_encrypt);
- C(requeue);
-#endif
atomic_set(&n->users, 1);
atomic_inc(&(skb_shinfo(skb)->dataref));
@@ -1365,9 +1370,8 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
static inline struct page *linear_to_page(struct page *page, unsigned int *len,
unsigned int *offset,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct sock *sk)
{
- struct sock *sk = skb->sk;
struct page *p = sk->sk_sndmsg_page;
unsigned int off;
@@ -1405,13 +1409,14 @@ new_page:
*/
static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
unsigned int *len, unsigned int offset,
- struct sk_buff *skb, int linear)
+ struct sk_buff *skb, int linear,
+ struct sock *sk)
{
if (unlikely(spd->nr_pages == PIPE_BUFFERS))
return 1;
if (linear) {
- page = linear_to_page(page, len, &offset, skb);
+ page = linear_to_page(page, len, &offset, skb, sk);
if (!page)
return 1;
} else
@@ -1442,7 +1447,8 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
static inline int __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len, struct sk_buff *skb,
- struct splice_pipe_desc *spd, int linear)
+ struct splice_pipe_desc *spd, int linear,
+ struct sock *sk)
{
if (!*len)
return 1;
@@ -1465,7 +1471,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
/* the linear region may spread across several pages */
flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
- if (spd_fill_page(spd, page, &flen, poff, skb, linear))
+ if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
return 1;
__segment_seek(&page, &poff, &plen, flen);
@@ -1481,8 +1487,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
* pipe is full or if we already spliced the requested length.
*/
static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
- unsigned int *len,
- struct splice_pipe_desc *spd)
+ unsigned int *len, struct splice_pipe_desc *spd,
+ struct sock *sk)
{
int seg;
@@ -1492,7 +1498,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, skb, spd, 1))
+ offset, len, skb, spd, 1, sk))
return 1;
/*
@@ -1502,7 +1508,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
if (__splice_segment(f->page, f->page_offset, f->size,
- offset, len, skb, spd, 0))
+ offset, len, skb, spd, 0, sk))
return 1;
}
@@ -1528,12 +1534,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
.ops = &sock_pipe_buf_ops,
.spd_release = sock_spd_release,
};
+ struct sock *sk = skb->sk;
/*
* __skb_splice_bits() only fails if the output has no room left,
* so no point in going over the frag_list for the error case.
*/
- if (__skb_splice_bits(skb, &offset, &tlen, &spd))
+ if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
goto done;
else if (!tlen)
goto done;
@@ -1545,14 +1552,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct sk_buff *list = skb_shinfo(skb)->frag_list;
for (; list && tlen; list = list->next) {
- if (__skb_splice_bits(list, &offset, &tlen, &spd))
+ if (__skb_splice_bits(list, &offset, &tlen, &spd, sk))
break;
}
}
done:
if (spd.nr_pages) {
- struct sock *sk = skb->sk;
int ret;
/*
@@ -2285,7 +2291,7 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
next_skb:
block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
- if (abs_offset < block_limit) {
+ if (abs_offset < block_limit && !st->frag_data) {
*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
@@ -2658,30 +2664,40 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
struct sk_buff *p = *head;
struct sk_buff *nskb;
+ struct skb_shared_info *skbinfo = skb_shinfo(skb);
+ struct skb_shared_info *pinfo = skb_shinfo(p);
unsigned int headroom;
unsigned int len = skb_gro_len(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
if (p->len + len >= 65536)
return -E2BIG;
- if (skb_shinfo(p)->frag_list)
+ if (pinfo->frag_list)
goto merge;
- else if (skb_headlen(skb) <= skb_gro_offset(skb)) {
- if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags >
- MAX_SKB_FRAGS)
+ else if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ offset -= headlen;
+
+ if (nr_frags > MAX_SKB_FRAGS)
return -E2BIG;
- skb_shinfo(skb)->frags[0].page_offset +=
- skb_gro_offset(skb) - skb_headlen(skb);
- skb_shinfo(skb)->frags[0].size -=
- skb_gro_offset(skb) - skb_headlen(skb);
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
- memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
- skb_shinfo(skb)->frags,
- skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
- skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
- skb_shinfo(skb)->nr_frags = 0;
+ frag->page_offset += offset;
+ frag->size -= offset;
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
@@ -2712,7 +2728,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
skb_shinfo(nskb)->frag_list = p;
- skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size;
+ skb_shinfo(nskb)->gso_size = pinfo->gso_size;
skb_header_release(p);
nskb->prev = p;
@@ -2727,16 +2743,13 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
p = nskb;
merge:
- if (skb_gro_offset(skb) > skb_headlen(skb)) {
- skb_shinfo(skb)->frags[0].page_offset +=
- skb_gro_offset(skb) - skb_headlen(skb);
- skb_shinfo(skb)->frags[0].size -=
- skb_gro_offset(skb) - skb_headlen(skb);
- skb_gro_reset_offset(skb);
- skb_gro_pull(skb, skb_headlen(skb));
+ if (offset > headlen) {
+ skbinfo->frags[0].page_offset += offset - headlen;
+ skbinfo->frags[0].size -= offset - headlen;
+ offset = headlen;
}
- __skb_pull(skb, skb_gro_offset(skb));
+ __skb_pull(skb, offset);
p->prev->next = skb;
p->prev = skb;
diff --git a/net/core/stream.c b/net/core/stream.c
index 8727cead64ad..a37debfeb1b2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -33,7 +33,8 @@ void sk_stream_write_space(struct sock *sk)
clear_bit(SOCK_NOSPACE, &sock->flags);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_poll(sk->sk_sleep, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
}