summaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- net/core/dev.c 1104
1 files changed, 682 insertions, 422 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 0dd54a69dace..856b6ee9a1d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
#include "net-sysfs.h"
@@ -743,34 +744,32 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL(dev_get_by_index);
/**
- * dev_getbyhwaddr - find a device by its hardware address
+ * dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold the
- * rtnl semaphore. The returned device has not had its ref count increased
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
+ * The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
- * BUGS:
- * If the API was consistent this would be __dev_get_by_hwaddr
*/
-struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+ const char *ha)
{
struct net_device *dev;
- ASSERT_RTNL();
-
- for_each_netdev(net, dev)
+ for_each_netdev_rcu(net, dev)
if (dev->type == type &&
!memcmp(dev->dev_addr, ha, dev->addr_len))
return dev;
return NULL;
}
-EXPORT_SYMBOL(dev_getbyhwaddr);
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
@@ -1116,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change);
void dev_load(struct net *net, const char *name)
{
struct net_device *dev;
+ int no_module;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
rcu_read_unlock();
- if (!dev && capable(CAP_NET_ADMIN))
- request_module("%s", name);
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE)) {
+ if (!request_module("%s", name))
+ pr_err("Loading kernel module for a network device "
+"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
+"instead\n", name);
+ }
}
EXPORT_SYMBOL(dev_load);
@@ -1133,9 +1140,6 @@ static int __dev_open(struct net_device *dev)
ASSERT_RTNL();
- /*
- * Is it even present?
- */
if (!netif_device_present(dev))
return -ENODEV;
@@ -1144,9 +1148,6 @@ static int __dev_open(struct net_device *dev)
if (ret)
return ret;
- /*
- * Call device private open method
- */
set_bit(__LINK_STATE_START, &dev->state);
if (ops->ndo_validate_addr)
@@ -1155,31 +1156,12 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
- /*
- * If it went open OK then:
- */
-
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- /*
- * Set the flags.
- */
dev->flags |= IFF_UP;
-
- /*
- * Enable NET_DMA
- */
net_dmaengine_get();
-
- /*
- * Initialize multicasting status
- */
dev_set_rx_mode(dev);
-
- /*
- * Wakeup transmit queue engine
- */
dev_activate(dev);
}
@@ -1202,22 +1184,13 @@ int dev_open(struct net_device *dev)
{
int ret;
- /*
- * Is it already up?
- */
if (dev->flags & IFF_UP)
return 0;
- /*
- * Open device
- */
ret = __dev_open(dev);
if (ret < 0)
return ret;
- /*
- * ... and announce new interface.
- */
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
call_netdevice_notifiers(NETDEV_UP, dev);
@@ -1225,52 +1198,78 @@ int dev_open(struct net_device *dev)
}
EXPORT_SYMBOL(dev_open);
-static int __dev_close(struct net_device *dev)
+static int __dev_close_many(struct list_head *head)
{
- const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *dev;
ASSERT_RTNL();
might_sleep();
- /*
- * Tell people we are going down, so that they can
- * prepare to death, when device is still operating.
- */
- call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+ list_for_each_entry(dev, head, unreg_list) {
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
- clear_bit(__LINK_STATE_START, &dev->state);
+ clear_bit(__LINK_STATE_START, &dev->state);
- /* Synchronize to scheduled poll. We cannot touch poll list,
- * it can be even on different cpu. So just clear netif_running().
- *
- * dev->stop() will invoke napi_disable() on all of it's
- * napi_struct instances on this device.
- */
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ /* Synchronize to scheduled poll. We cannot touch poll list, it
+ * can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of its
+ * napi_struct instances on this device.
+ */
+ smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ }
- dev_deactivate(dev);
+ dev_deactivate_many(head);
- /*
- * Call the device specific close. This cannot fail.
- * Only if device is UP
- *
- * We allow it to be called even after a DETACH hot-plug
- * event.
- */
- if (ops->ndo_stop)
- ops->ndo_stop(dev);
+ list_for_each_entry(dev, head, unreg_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
- /*
- * Device is now down.
- */
+ /*
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
+ */
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ dev->flags &= ~IFF_UP;
+ net_dmaengine_put();
+ }
- /*
- * Shutdown NET_DMA
- */
- net_dmaengine_put();
+ return 0;
+}
+static int __dev_close(struct net_device *dev)
+{
+ int retval;
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+ return retval;
+}
+
+static int dev_close_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(tmp_list);
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list)
+ if (!(dev->flags & IFF_UP))
+ list_move(&dev->unreg_list, &tmp_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ }
+
+ /* rollback_registered_many needs the complete original list */
+ list_splice(&tmp_list, head);
return 0;
}
@@ -1285,17 +1284,11 @@ static int __dev_close(struct net_device *dev)
*/
int dev_close(struct net_device *dev)
{
- if (!(dev->flags & IFF_UP))
- return 0;
-
- __dev_close(dev);
-
- /*
- * Tell people we are down
- */
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
- call_netdevice_notifiers(NETDEV_DOWN, dev);
+ LIST_HEAD(single);
+ list_add(&dev->unreg_list, &single);
+ dev_close_many(&single);
+ list_del(&single);
return 0;
}
EXPORT_SYMBOL(dev_close);
@@ -1311,14 +1304,17 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
- dev->ethtool_ops->set_flags) {
- u32 flags = dev->ethtool_ops->get_flags(dev);
- if (flags & ETH_FLAG_LRO) {
- flags &= ~ETH_FLAG_LRO;
- dev->ethtool_ops->set_flags(dev, flags);
- }
- }
+ u32 flags;
+
+ if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
+ flags = dev->ethtool_ops->get_flags(dev);
+ else
+ flags = ethtool_op_get_flags(dev);
+
+ if (!(flags & ETH_FLAG_LRO))
+ return;
+
+ __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);
@@ -1326,11 +1322,6 @@ EXPORT_SYMBOL(dev_disable_lro);
static int dev_boot_phase = 1;
-/*
- * Device change register/unregister. These are not inline or static
- * as we export them to the world.
- */
-
/**
* register_netdevice_notifier - register a network notifier block
* @nb: notifier
@@ -1432,6 +1423,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
ASSERT_RTNL();
return raw_notifier_call_chain(&netdev_chain, val, dev);
}
+EXPORT_SYMBOL(call_netdevice_notifiers);
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
@@ -1462,6 +1454,27 @@ static inline void net_timestamp_check(struct sk_buff *skb)
__net_timestamp(skb);
}
+static inline bool is_skb_forwardable(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ unsigned int len;
+
+ if (!(dev->flags & IFF_UP))
+ return false;
+
+ len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+ if (skb->len <= len)
+ return true;
+
+ /* if TSO is enabled, we don't care about the length as the packet
+ * could be forwarded without being segmented before
+ */
+ if (skb_is_gso(skb))
+ return true;
+
+ return false;
+}
+
/**
* dev_forward_skb - loopback an skb to another netif
*
@@ -1485,8 +1498,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
skb_orphan(skb);
nf_reset(skb);
- if (unlikely(!(dev->flags & IFF_UP) ||
- (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
@@ -1499,6 +1511,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
+static inline int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ atomic_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
@@ -1507,13 +1527,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
-
-#ifdef CONFIG_NET_CLS_ACT
- if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
- net_timestamp_set(skb);
-#else
- net_timestamp_set(skb);
-#endif
+ struct sk_buff *skb2 = NULL;
+ struct packet_type *pt_prev = NULL;
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1523,10 +1538,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
if ((ptype->dev == dev || !ptype->dev) &&
(ptype->af_packet_priv == NULL ||
(struct sock *)ptype->af_packet_priv != skb->sk)) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
break;
+ net_timestamp_set(skb2);
+
/* skb->nh should be correctly
set by sender, so that the second statement is
just protection against buggy protocols.
@@ -1545,24 +1568,79 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb2->transport_header = skb2->network_header;
skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype, skb->dev);
+ pt_prev = ptype;
}
}
+ if (pt_prev)
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this, verify the tc mapping remains valid and, if
+ * not, NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case, if TC0
+ * is invalid, nothing can be done, so disable priority mappings. It is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues changed "
+ "invalidating tc mappings. Priority "
+ "traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues "
+ "changed. Priority %i to tc "
+ "mapping %i is no longer valid "
+ "setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
+ int rc;
+
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
- if (dev->reg_state == NETREG_REGISTERED) {
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
+ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+ txq);
+ if (rc)
+ return rc;
+
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
if (txq < dev->real_num_tx_queues)
qdisc_reset_all_tx_gt(dev, txq);
}
@@ -1683,33 +1761,6 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
-static bool can_checksum_protocol(unsigned long features, __be16 protocol)
-{
- return ((features & NETIF_F_NO_CSUM) ||
- ((features & NETIF_F_V4_CSUM) &&
- protocol == htons(ETH_P_IP)) ||
- ((features & NETIF_F_V6_CSUM) &&
- protocol == htons(ETH_P_IPV6)) ||
- ((features & NETIF_F_FCOE_CRC) &&
- protocol == htons(ETH_P_FCOE)));
-}
-
-static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
-{
- __be16 protocol = skb->protocol;
- int features = dev->features;
-
- if (vlan_tx_tag_present(skb)) {
- features &= dev->vlan_features;
- } else if (protocol == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- features &= dev->vlan_features;
- }
-
- return can_checksum_protocol(features, protocol);
-}
-
/**
* skb_dev_set -- assign a new device to a buffer
* @skb: buffer for the new device
@@ -1757,7 +1808,7 @@ int skb_checksum_help(struct sk_buff *skb)
goto out_set_summed;
}
- offset = skb->csum_start - skb_headroom(skb);
+ offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));
csum = skb_checksum(skb, offset, skb->len - offset, 0);
@@ -1789,21 +1840,23 @@ EXPORT_SYMBOL(skb_checksum_help);
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
__be16 type = skb->protocol;
+ int vlan_depth = ETH_HLEN;
int err;
- if (type == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veh;
+ while (type == htons(ETH_P_8021Q)) {
+ struct vlan_hdr *vh;
- if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+ if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
return ERR_PTR(-EINVAL);
- veh = (struct vlan_ethhdr *)skb->data;
- type = veh->h_vlan_encapsulated_proto;
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
}
skb_reset_mac_header(skb);
@@ -1817,8 +1870,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
dev->ethtool_ops->get_drvinfo(dev, &info);
- WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
- "ip_summed=%d",
+ WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
info.driver, dev ? dev->features : 0L,
skb->sk ? skb->sk->sk_route_caps : 0L,
skb->len, skb->data_len, skb->ip_summed);
@@ -1921,16 +1973,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
/**
* dev_gso_segment - Perform emulated hardware segmentation on skb.
* @skb: buffer to segment
+ * @features: device features as applicable to this skb
*
* This function segments the given skb and stores the list of segments
* in skb->next.
*/
-static int dev_gso_segment(struct sk_buff *skb)
+static int dev_gso_segment(struct sk_buff *skb, int features)
{
- struct net_device *dev = skb->dev;
struct sk_buff *segs;
- int features = dev->features & ~(illegal_highdma(dev, skb) ?
- NETIF_F_SG : 0);
segs = skb_gso_segment(skb, features);
@@ -1967,6 +2017,53 @@ static inline void skb_orphan_try(struct sk_buff *skb)
}
}
+static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+{
+ return ((features & NETIF_F_GEN_CSUM) ||
+ ((features & NETIF_F_V4_CSUM) &&
+ protocol == htons(ETH_P_IP)) ||
+ ((features & NETIF_F_V6_CSUM) &&
+ protocol == htons(ETH_P_IPV6)) ||
+ ((features & NETIF_F_FCOE_CRC) &&
+ protocol == htons(ETH_P_FCOE)));
+}
+
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
+{
+ if (!can_checksum_protocol(features, protocol)) {
+ features &= ~NETIF_F_ALL_CSUM;
+ features &= ~NETIF_F_SG;
+ } else if (illegal_highdma(skb->dev, skb)) {
+ features &= ~NETIF_F_SG;
+ }
+
+ return features;
+}
+
+u32 netif_skb_features(struct sk_buff *skb)
+{
+ __be16 protocol = skb->protocol;
+ u32 features = skb->dev->features;
+
+ if (protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+ } else if (!vlan_tx_tag_present(skb)) {
+ return harmonize_features(skb, protocol, features);
+ }
+
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
+
+ if (protocol != htons(ETH_P_8021Q)) {
+ return harmonize_features(skb, protocol, features);
+ } else {
+ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
+ return harmonize_features(skb, protocol, features);
+ }
+}
+EXPORT_SYMBOL(netif_skb_features);
+
/*
* Returns true if either:
* 1. skb has frag_list and the device doesn't support FRAGLIST, or
@@ -1975,17 +2072,13 @@ static inline void skb_orphan_try(struct sk_buff *skb)
* support DMA from it.
*/
static inline int skb_needs_linearize(struct sk_buff *skb,
- struct net_device *dev)
+ int features)
{
- int features = dev->features;
-
- if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
- features &= dev->vlan_features;
-
return skb_is_nonlinear(skb) &&
- ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
- (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
- illegal_highdma(dev, skb))));
+ ((skb_has_frag_list(skb) &&
+ !(features & NETIF_F_FRAGLIST)) ||
+ (skb_shinfo(skb)->nr_frags &&
+ !(features & NETIF_F_SG)));
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
@@ -1995,20 +2088,24 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
int rc = NETDEV_TX_OK;
if (likely(!skb->next)) {
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(skb, dev);
+ u32 features;
/*
- * If device doesnt need skb->dst, release it right now while
+ * If device doesn't need skb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
skb_orphan_try(skb);
+ features = netif_skb_features(skb);
+
if (vlan_tx_tag_present(skb) &&
- !(dev->features & NETIF_F_HW_VLAN_TX)) {
+ !(features & NETIF_F_HW_VLAN_TX)) {
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
@@ -2016,13 +2113,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
skb->vlan_tci = 0;
}
- if (netif_needs_gso(dev, skb)) {
- if (unlikely(dev_gso_segment(skb)))
+ if (netif_needs_gso(skb, features)) {
+ if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb->next)
goto gso;
} else {
- if (skb_needs_linearize(skb, dev) &&
+ if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
@@ -2031,9 +2128,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- skb_set_transport_header(skb, skb->csum_start -
- skb_headroom(skb));
- if (!dev_can_checksum(dev, skb) &&
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
@@ -2054,7 +2151,7 @@ gso:
nskb->next = NULL;
/*
- * If device doesnt need nskb->dst, release it right now while
+ * If device doesn't need nskb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
@@ -2085,26 +2182,39 @@ out:
static u32 hashrnd __read_mostly;
-u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
+/*
+ * Returns a Tx hash based on the given packet descriptor and a number of Tx
+ * queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+ unsigned int num_tx_queues)
{
u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= dev->real_num_tx_queues))
- hash -= dev->real_num_tx_queues;
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
return hash;
}
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
hash = (__force u16) skb->protocol ^ skb->rxhash;
hash = jhash_1word(hash, hashrnd);
- return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+ return (u16) (((u64) hash * qcount) >> 32) + qoffset;
}
-EXPORT_SYMBOL(skb_tx_hash);
+EXPORT_SYMBOL(__skb_tx_hash);
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
@@ -2119,26 +2229,70 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
return queue_index;
}
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[raw_smp_processor_id()]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else {
+ u32 hash;
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol ^
+ skb->rxhash;
+ hash = jhash_1word(hash, hashrnd);
+ queue_index = map->queues[
+ ((u64)hash * map->len) >> 32];
+ }
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
{
int queue_index;
const struct net_device_ops *ops = dev->netdev_ops;
- if (ops->ndo_select_queue) {
+ if (dev->real_num_tx_queues == 1)
+ queue_index = 0;
+ else if (ops->ndo_select_queue) {
queue_index = ops->ndo_select_queue(dev, skb);
queue_index = dev_cap_txqueue(dev, queue_index);
} else {
struct sock *sk = skb->sk;
queue_index = sk_tx_queue_get(sk);
- if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) {
- queue_index = 0;
- if (dev->real_num_tx_queues > 1)
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int old_index = queue_index;
+
+ queue_index = get_xps_queue(dev, skb);
+ if (queue_index < 0)
queue_index = skb_tx_hash(dev, skb);
- if (sk) {
- struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
+ if (queue_index != old_index && sk) {
+ struct dst_entry *dst =
+ rcu_dereference_check(sk->sk_dst_cache, 1);
if (dst && skb_dst(skb) == dst)
sk_tx_queue_set(sk, queue_index);
@@ -2155,15 +2309,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
- bool contended = qdisc_is_running(q);
+ bool contended;
int rc;
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
+ contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -2180,7 +2337,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
- __qdisc_update_bstats(q, skb->len);
+
+ qdisc_bstats_update(q, skb);
+
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2193,7 +2352,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
skb_dst_force(skb);
- rc = qdisc_enqueue_root(skb, q);
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2412,6 +2571,54 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ u16 tcpu;
+
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb->rxhash & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->cpu = next_cpu;
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, tcpu).input_queue_head;
+ }
+
+ return rflow;
+}
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
@@ -2442,7 +2649,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- if (map->len == 1) {
+ if (map->len == 1 &&
+ !rcu_dereference_raw(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
@@ -2482,12 +2690,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (unlikely(tcpu != next_cpu) &&
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
- rflow->last_qtail)) >= 0)) {
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU)
- rflow->last_qtail = per_cpu(softnet_data,
- tcpu).input_queue_head;
- }
+ rflow->last_qtail)) >= 0))
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
@@ -2508,6 +2713,46 @@ done:
return cpu;
}
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
@@ -2712,14 +2957,6 @@ static void net_tx_action(struct softirq_action *h)
}
}
-static inline int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
-{
- atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-}
-
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
@@ -2733,8 +2970,8 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
* when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
* a compare and 2 stores extra right now if we dont have it on
* but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
+ * NOTE: This doesn't stop any functionality; if you don't have
+ * the ingress scheduler, you just can't add policies on ingress.
*
*/
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
@@ -2803,6 +3040,8 @@ out:
* on a failure.
*
* The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
@@ -2837,64 +3076,32 @@ void netdev_rx_handler_unregister(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
- struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
{
- if (skb->pkt_type == PACKET_HOST) {
- u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
- memcpy(dest, master->dev_addr, ETH_ALEN);
- }
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
-{
- struct net_device *dev = skb->dev;
-
- if (master->priv_flags & IFF_MASTER_ARPMON)
- dev->last_rx = jiffies;
-
- if ((master->priv_flags & IFF_MASTER_ALB) &&
- (master->priv_flags & IFF_BRIDGE_PORT)) {
- /* Do address unmangle. The local destination address
- * will be always the one master has. Provides the right
- * functionality in a bridge.
- */
- skb_bond_set_mac_by_master(skb, master);
- }
-
- if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
- if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
- skb->protocol == __cpu_to_be16(ETH_P_ARP))
- return 0;
-
- if (master->priv_flags & IFF_MASTER_ALB) {
- if (skb->pkt_type != PACKET_BROADCAST &&
- skb->pkt_type != PACKET_MULTICAST)
- return 0;
- }
- if (master->priv_flags & IFF_MASTER_8023AD &&
- skb->protocol == __cpu_to_be16(ETH_P_SLOW))
- return 0;
+ /*
+ * Make sure ARP frames received on VLAN interfaces stacked on
+ * bonding interfaces still make their way to any base bonding
+ * device that may have registered for a specific ptype.
+ */
+ if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+ vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+ skb->protocol == htons(ETH_P_ARP)) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
- return 1;
+ if (!skb2)
+ return;
+ skb2->dev = vlan_dev_real_dev(skb->dev);
+ netif_rx(skb2);
}
- return 0;
}
-EXPORT_SYMBOL(__skb_bond_should_drop);
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
- struct net_device *master;
- struct net_device *null_or_orig;
- struct net_device *orig_or_bond;
+ struct net_device *null_or_dev;
+ bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
@@ -2909,28 +3116,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
-
- /*
- * bonding note: skbs received on inactive slaves should only
- * be delivered to pkt handlers that are exact matches. Also
- * the deliver_no_wcard flag will be set. If packet handlers
- * are sensitive to duplicate packets these skbs will need to
- * be dropped at the handler.
- */
- null_or_orig = NULL;
orig_dev = skb->dev;
- master = ACCESS_ONCE(orig_dev->master);
- if (skb->deliver_no_wcard)
- null_or_orig = orig_dev;
- else if (master) {
- if (skb_bond_should_drop(skb, master)) {
- skb->deliver_no_wcard = 1;
- null_or_orig = orig_dev; /* deliver only exact match */
- } else
- skb->dev = master;
- }
- __this_cpu_inc(softnet_data.processed);
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
@@ -2939,6 +3126,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
rcu_read_lock();
+another_round:
+
+ __this_cpu_inc(softnet_data.processed);
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2947,8 +3138,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
- ptype->dev == orig_dev) {
+ if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -2962,16 +3152,24 @@ static int __netif_receive_skb(struct sk_buff *skb)
ncls:
#endif
- /* Handle special case of bridge or macvlan */
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
- skb = rx_handler(skb);
- if (!skb)
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
goto out;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
}
if (vlan_tx_tag_present(skb)) {
@@ -2986,24 +3184,17 @@ ncls:
goto out;
}
- /*
- * Make sure frames received on VLAN interfaces stacked on
- * bonding interfaces still make their way to any base bonding
- * device that may have registered for a specific ptype. The
- * handler may have to adjust skb->dev and orig_dev.
- */
- orig_or_bond = orig_dev;
- if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
- (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
- orig_or_bond = vlan_dev_real_dev(skb->dev);
- }
+ vlan_on_bond_hook(skb);
+
+ /* deliver only exact match when indicated */
+ null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type && (ptype->dev == null_or_orig ||
- ptype->dev == skb->dev || ptype->dev == orig_dev ||
- ptype->dev == orig_or_bond)) {
+ if (ptype->type == type &&
+ (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -3311,6 +3502,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
__skb_pull(skb, skb_headlen(skb));
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
napi->skb = skb;
}
@@ -3607,7 +3800,7 @@ static void net_rx_action(struct softirq_action *h)
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
- * accidently calling ->poll() when NAPI is not scheduled.
+ * accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
@@ -3798,12 +3991,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct net_device *dev = (v == SEQ_START_TOKEN) ?
- first_net_device(seq_file_net(seq)) :
- next_net_device((struct net_device *)v);
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ dev = first_net_device_rcu(seq_file_net(seq));
+ else
+ dev = next_net_device_rcu(dev);
++*pos;
- return rcu_dereference(dev);
+ return dev;
}
void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4087,15 +4283,14 @@ static int __init dev_proc_init(void)
/**
- * netdev_set_master - set up master/slave pair
+ * netdev_set_master - set up master pointer
* @slave: slave device
* @master: new master device
*
* Changes the master device of the slave. Pass %NULL to break the
* bonding. The caller must hold the RTNL semaphore. On a failure
* a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
+ * are adjusted and the function returns zero.
*/
int netdev_set_master(struct net_device *slave, struct net_device *master)
{
@@ -4115,6 +4310,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
synchronize_net();
dev_put(old);
}
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_master);
+
+/**
+ * netdev_set_bond_master - set up bonding master/slave pair
+ * @slave: slave device
+ * @master: new master device
+ *
+ * Changes the master device of the slave. Pass %NULL to break the
+ * bonding. The caller must hold the RTNL semaphore. On a failure
+ * a negative errno code is returned. On success %RTM_NEWLINK is sent
+ * to the routing socket and the function returns zero.
+ */
+int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = netdev_set_master(slave, master);
+ if (err)
+ return err;
if (master)
slave->flags |= IFF_SLAVE;
else
@@ -4123,7 +4341,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
return 0;
}
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_set_bond_master);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
@@ -4460,6 +4678,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
EXPORT_SYMBOL(dev_set_mtu);
/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
@@ -4544,7 +4773,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
* is never reached
*/
WARN_ON(1);
- err = -EINVAL;
+ err = -ENOTTY;
break;
}
@@ -4812,7 +5041,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
/* Set the per device memory buffer space.
* Not applicable in our case */
case SIOCSIFLINK:
- return -EINVAL;
+ return -ENOTTY;
/*
* Unknown or private ioctl.
@@ -4833,7 +5062,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
/* Take care of Wireless Extensions */
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
return wext_handle_ioctl(net, &ifr, cmd, arg);
- return -EINVAL;
+ return -ENOTTY;
}
}
@@ -4887,10 +5116,12 @@ static void rollback_registered_many(struct list_head *head)
}
BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
- /* If device is running, close it first. */
- dev_close(dev);
+ /* If device is running, close it first. */
+ dev_close_many(head);
+ list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
@@ -4945,40 +5176,62 @@ static void rollback_registered(struct net_device *dev)
list_add(&dev->unreg_list, &single);
rollback_registered_many(&single);
+ list_del(&single);
}
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+u32 netdev_fix_features(struct net_device *dev, u32 features)
{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ if ((features & NETIF_F_NO_CSUM) &&
+ (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed no checksumming and other settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ }
+
/* Fix illegal SG+CSUM combinations. */
if ((features & NETIF_F_SG) &&
!(features & NETIF_F_ALL_CSUM)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
- "checksum feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_SG since no checksum feature.\n");
features &= ~NETIF_F_SG;
}
/* TSO requires that SG is present as well. */
- if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
- "SG feature.\n", name);
- features &= ~NETIF_F_TSO;
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_info(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
+
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
+
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
}
+ /* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
- if (!(features & NETIF_F_GEN_CSUM)) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no NETIF_F_HW_CSUM feature.\n",
- name);
+ /* maybe split UFO into V4 and V6? */
+ if (!((features & NETIF_F_GEN_CSUM) ||
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
+ == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
}
if (!(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no NETIF_F_SG feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
features &= ~NETIF_F_UFO;
}
}
@@ -4987,6 +5240,37 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
+void netdev_update_features(struct net_device *dev)
+{
+ u32 features;
+ int err = 0;
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return;
+
+ netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
+ dev->features, features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (!err)
+ dev->features = features;
+ else if (err < 0)
+ netdev_err(dev,
+ "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
+ err, features, dev->features);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
/**
* netif_stacked_transfer_operstate - transfer operstate
* @rootdev: the root or lower level device to transfer state from
@@ -5014,9 +5298,9 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+#ifdef CONFIG_RPS
static int netif_alloc_rx_queues(struct net_device *dev)
{
-#ifdef CONFIG_RPS
unsigned int i, count = dev->num_rx_queues;
struct netdev_rx_queue *rx;
@@ -5029,15 +5313,22 @@ static int netif_alloc_rx_queues(struct net_device *dev)
}
dev->_rx = rx;
- /*
- * Set a pointer to first element in the array which holds the
- * reference count.
- */
for (i = 0; i < count; i++)
- rx[i].first = rx;
-#endif
+ rx[i].dev = dev;
return 0;
}
+#endif
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue, void *_unused)
+{
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+ queue->dev = dev;
+}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
@@ -5053,25 +5344,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
return -ENOMEM;
}
dev->_tx = tx;
- return 0;
-}
-
-static void netdev_init_one_queue(struct net_device *dev,
- struct netdev_queue *queue,
- void *_unused)
-{
- queue->dev = dev;
-
- /* Initialize queue lock */
- spin_lock_init(&queue->_xmit_lock);
- netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
- queue->xmit_lock_owner = -1;
-}
-static void netdev_init_queues(struct net_device *dev)
-{
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
spin_lock_init(&dev->tx_global_lock);
+
+ return 0;
}
/**
@@ -5110,16 +5387,6 @@ int register_netdevice(struct net_device *dev)
dev->iflink = -1;
- ret = netif_alloc_rx_queues(dev);
- if (ret)
- goto out;
-
- ret = netif_alloc_netdev_queues(dev);
- if (ret)
- goto out;
-
- netdev_init_queues(dev);
-
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
@@ -5138,27 +5405,19 @@ int register_netdevice(struct net_device *dev)
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
- /* Fix illegal checksum combinations */
- if ((dev->features & NETIF_F_HW_CSUM) &&
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = dev->features & dev->hw_features;
- if ((dev->features & NETIF_F_NO_CSUM) &&
- (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ /* Avoid warning from netdev_fix_features() for GSO without SG */
+ if (!(dev->wanted_features & NETIF_F_SG)) {
+ dev->wanted_features &= ~NETIF_F_GSO;
+ dev->features &= ~NETIF_F_GSO;
}
- dev->features = netdev_fix_features(dev->features, dev->name);
-
- /* Enable software GSO if SG is supported. */
- if (dev->features & NETIF_F_SG)
- dev->features |= NETIF_F_GSO;
-
/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
* vlan_dev_init() will do the dev->features check, so these features
* are enabled only if supported by underlying device.
@@ -5175,6 +5434,8 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
+ netdev_update_features(dev);
+
/*
* Default initial state at registry is that the
* device is present.
@@ -5423,34 +5684,6 @@ void netdev_run_todo(void)
}
}
-/**
- * dev_txq_stats_fold - fold tx_queues stats
- * @dev: device to get statistics from
- * @stats: struct rtnl_link_stats64 to hold results
- */
-void dev_txq_stats_fold(const struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
- unsigned int i;
- struct netdev_queue *txq;
-
- for (i = 0; i < dev->num_tx_queues; i++) {
- txq = netdev_get_tx_queue(dev, i);
- spin_lock_bh(&txq->_xmit_lock);
- tx_bytes += txq->tx_bytes;
- tx_packets += txq->tx_packets;
- tx_dropped += txq->tx_dropped;
- spin_unlock_bh(&txq->_xmit_lock);
- }
- if (tx_bytes || tx_packets || tx_dropped) {
- stats->tx_bytes = tx_bytes;
- stats->tx_packets = tx_packets;
- stats->tx_dropped = tx_dropped;
- }
-}
-EXPORT_SYMBOL(dev_txq_stats_fold);
-
/* Convert net_device_stats to rtnl_link_stats64. They have the same
* fields in the same order, with only the type differing.
*/
@@ -5494,7 +5727,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else {
netdev_stats_to_stats64(storage, &dev->stats);
- dev_txq_stats_fold(dev, storage);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
return storage;
@@ -5520,18 +5752,20 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
}
/**
- * alloc_netdev_mq - allocate network device
+ * alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
- * @queue_count: the number of subqueues to allocate
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
- * for each queue on the device at the end of the netdevice.
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *), unsigned int queue_count)
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
@@ -5539,12 +5773,20 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
BUG_ON(strlen(name) >= sizeof(dev->name));
- if (queue_count < 1) {
+ if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device "
"with zero queues.\n");
return NULL;
}
+#ifdef CONFIG_RPS
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device "
+ "with zero RX queues.\n");
+ return NULL;
+ }
+#endif
+
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
@@ -5575,14 +5817,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
- dev->num_tx_queues = queue_count;
- dev->real_num_tx_queues = queue_count;
-
-#ifdef CONFIG_RPS
- dev->num_rx_queues = queue_count;
- dev->real_num_rx_queues = queue_count;
-#endif
-
dev->gso_max_size = GSO_MAX_SIZE;
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
@@ -5592,16 +5826,39 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->link_watch_list);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+#ifdef CONFIG_RPS
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+#endif
+
strcpy(dev->name, name);
+ dev->group = INIT_NETDEV_GROUP;
return dev;
+free_all:
+ free_netdev(dev);
+ return NULL;
+
free_pcpu:
free_percpu(dev->pcpu_refcnt);
+ kfree(dev->_tx);
+#ifdef CONFIG_RPS
+ kfree(dev->_rx);
+#endif
+
free_p:
kfree(p);
return NULL;
}
-EXPORT_SYMBOL(alloc_netdev_mq);
+EXPORT_SYMBOL(alloc_netdev_mqs);
/**
* free_netdev - free network device
@@ -5618,6 +5875,9 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
kfree(dev->_tx);
+#ifdef CONFIG_RPS
+ kfree(dev->_rx);
+#endif
kfree(rcu_dereference_raw(dev->ingress_queue));
@@ -5895,8 +6155,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
- unsigned long mask)
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
/* If device needs checksumming, downgrade to it. */
if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
@@ -6081,7 +6340,7 @@ static void __net_exit default_device_exit(struct net *net)
if (dev->rtnl_link_ops)
continue;
- /* Push remaing network devices to init_net */
+ /* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
@@ -6096,7 +6355,7 @@ static void __net_exit default_device_exit(struct net *net)
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices must be removed from a network
- * namespace. Do this in the reverse order of registeration.
+ * namespace. Do this in the reverse order of registration.
* Do this across as many network namespaces as possible to
* improve batching efficiency.
*/
@@ -6114,6 +6373,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
}
unregister_netdevice_many(&dev_kill_list);
+ list_del(&dev_kill_list);
rtnl_unlock();
}