diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 1104 |
1 files changed, 682 insertions, 422 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 0dd54a69dace..856b6ee9a1d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include <trace/events/skb.h> #include <linux/pci.h> #include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> #include "net-sysfs.h" @@ -743,34 +744,32 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** - * dev_getbyhwaddr - find a device by its hardware address + * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. + * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; - ASSERT_RTNL(); - - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } -EXPORT_SYMBOL(dev_getbyhwaddr); +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { @@ -1116,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change); void dev_load(struct net *net, const char *name) { struct net_device *dev; + int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); - if (!dev && capable(CAP_NET_ADMIN)) - request_module("%s", name); + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } } EXPORT_SYMBOL(dev_load); @@ -1133,9 +1140,6 @@ static int __dev_open(struct net_device *dev) ASSERT_RTNL(); - /* - * Is it even present? - */ if (!netif_device_present(dev)) return -ENODEV; @@ -1144,9 +1148,6 @@ static int __dev_open(struct net_device *dev) if (ret) return ret; - /* - * Call device private open method - */ set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) @@ -1155,31 +1156,12 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - /* - * If it went open OK then: - */ - if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - /* - * Set the flags. - */ dev->flags |= IFF_UP; - - /* - * Enable NET_DMA - */ net_dmaengine_get(); - - /* - * Initialize multicasting status - */ dev_set_rx_mode(dev); - - /* - * Wakeup transmit queue engine - */ dev_activate(dev); } @@ -1202,22 +1184,13 @@ int dev_open(struct net_device *dev) { int ret; - /* - * Is it already up? - */ if (dev->flags & IFF_UP) return 0; - /* - * Open device - */ ret = __dev_open(dev); if (ret < 0) return ret; - /* - * ... and announce new interface. - */ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_UP, dev); @@ -1225,52 +1198,78 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close(struct net_device *dev) +static int __dev_close_many(struct list_head *head) { - const struct net_device_ops *ops = dev->netdev_ops; + struct net_device *dev; ASSERT_RTNL(); might_sleep(); - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ - call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + list_for_each_entry(dev, head, unreg_list) { + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); - clear_bit(__LINK_STATE_START, &dev->state); + clear_bit(__LINK_STATE_START, &dev->state); - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(). - * - * dev->stop() will invoke napi_disable() on all of it's - * napi_struct instances on this device. - */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + } - dev_deactivate(dev); + dev_deactivate_many(head); - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (ops->ndo_stop) - ops->ndo_stop(dev); + list_for_each_entry(dev, head, unreg_list) { + const struct net_device_ops *ops = dev->netdev_ops; - /* - * Device is now down. - */ + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; + dev->flags &= ~IFF_UP; + net_dmaengine_put(); + } - /* - * Shutdown NET_DMA - */ - net_dmaengine_put(); + return 0; +} +static int __dev_close(struct net_device *dev) +{ + int retval; + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + retval = __dev_close_many(&single); + list_del(&single); + return retval; +} + +static int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) + if (!(dev->flags & IFF_UP)) + list_move(&dev->unreg_list, &tmp_list); + + __dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + /* rollback_registered_many needs the complete original list */ + list_splice(&tmp_list, head); return 0; } @@ -1285,17 +1284,11 @@ static int __dev_close(struct net_device *dev) */ int dev_close(struct net_device *dev) { - if (!(dev->flags & IFF_UP)) - return 0; - - __dev_close(dev); - - /* - * Tell people we are down - */ - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); - call_netdevice_notifiers(NETDEV_DOWN, dev); + LIST_HEAD(single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); return 0; } EXPORT_SYMBOL(dev_close); @@ -1311,14 +1304,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } + u32 flags; + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); WARN_ON(dev->features & NETIF_F_LRO); } EXPORT_SYMBOL(dev_disable_lro); @@ -1326,11 +1322,6 @@ EXPORT_SYMBOL(dev_disable_lro); static int dev_boot_phase = 1; -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - /** * register_netdevice_notifier - register a network notifier block * @nb: notifier @@ -1432,6 +1423,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } +EXPORT_SYMBOL(call_netdevice_notifiers); /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); @@ -1462,6 +1454,27 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1485,8 +1498,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (unlikely(!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { + if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; @@ -1499,6 +1511,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1507,13 +1527,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb); static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - -#ifdef CONFIG_NET_CLS_ACT - if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp_set(skb); -#else - net_timestamp_set(skb); -#endif + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1523,10 +1538,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) if ((ptype->dev == dev || !ptype->dev) && (ptype->af_packet_priv == NULL || (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) break; + net_timestamp_set(skb2); + /* skb->nh should be correctly set by sender, so that the second statement is just protection against buggy protocols. @@ -1545,24 +1568,79 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype, skb->dev); + pt_prev = ptype; } } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues changed " + "invalidating tc mappings. Priority " + "traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues " + "changed. Priority %i to tc " + "mapping %i is no longer valid " + "setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + int rc; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; - if (dev->reg_state == NETREG_REGISTERED) { + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -1683,33 +1761,6 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ - return ((features & NETIF_F_NO_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -{ - __be16 protocol = skb->protocol; - int features = dev->features; - - if (vlan_tx_tag_present(skb)) { - features &= dev->vlan_features; - } else if (protocol == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - features &= dev->vlan_features; - } - - return can_checksum_protocol(features, protocol); -} - /** * skb_dev_set -- assign a new device to a buffer * @skb: buffer for the new device @@ -1757,7 +1808,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out_set_summed; } - offset = skb->csum_start - skb_headroom(skb); + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1789,21 +1840,23 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; int err; - if (type == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh; + while (type == htons(ETH_P_8021Q)) { + struct vlan_hdr *vh; - if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) return ERR_PTR(-EINVAL); - veh = (struct vlan_ethhdr *)skb->data; - type = veh->h_vlan_encapsulated_proto; + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; } skb_reset_mac_header(skb); @@ -1817,8 +1870,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " - "ip_summed=%d", + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", info.driver, dev ? dev->features : 0L, skb->sk ? skb->sk->sk_route_caps : 0L, skb->len, skb->data_len, skb->ip_summed); @@ -1921,16 +1973,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) /** * dev_gso_segment - Perform emulated hardware segmentation on skb. * @skb: buffer to segment + * @features: device features as applicable to this skb * * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb) +static int dev_gso_segment(struct sk_buff *skb, int features) { - struct net_device *dev = skb->dev; struct sk_buff *segs; - int features = dev->features & ~(illegal_highdma(dev, skb) ? - NETIF_F_SG : 0); segs = skb_gso_segment(skb, features); @@ -1967,6 +2017,53 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} + +static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +{ + if (!can_checksum_protocol(features, protocol)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +u32 netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + u32 features = skb->dev->features; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } + + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; + return harmonize_features(skb, protocol, features); + } +} +EXPORT_SYMBOL(netif_skb_features); + /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or @@ -1975,17 +2072,13 @@ static inline void skb_orphan_try(struct sk_buff *skb) * support DMA from it. */ static inline int skb_needs_linearize(struct sk_buff *skb, - struct net_device *dev) + int features) { - int features = dev->features; - - if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb)) - features &= dev->vlan_features; - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) || - illegal_highdma(dev, skb)))); + ((skb_has_frag_list(skb) && + !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && + !(features & NETIF_F_SG))); } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, @@ -1995,20 +2088,24 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int rc = NETDEV_TX_OK; if (likely(!skb->next)) { - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); + u32 features; /* - * If device doesnt need skb->dst, release it right now while + * If device doesn't need skb->dst, release it right now while * its hot in this cpu cache */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + skb_orphan_try(skb); + features = netif_skb_features(skb); + if (vlan_tx_tag_present(skb) && - !(dev->features & NETIF_F_HW_VLAN_TX)) { + !(features & NETIF_F_HW_VLAN_TX)) { skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; @@ -2016,13 +2113,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) goto out_kfree_skb; if (skb->next) goto gso; } else { - if (skb_needs_linearize(skb, dev) && + if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; @@ -2031,9 +2128,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, skb->csum_start - - skb_headroom(skb)); - if (!dev_can_checksum(dev, skb) && + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -2054,7 +2151,7 @@ gso: nskb->next = NULL; /* - * If device doesnt need nskb->dst, release it right now while + * If device doesn't need nskb->dst, release it right now while * its hot in this cpu cache */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) @@ -2085,26 +2182,39 @@ out: static u32 hashrnd __read_mostly; -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) { u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely(hash >= dev->real_num_tx_queues)) - hash -= dev->real_num_tx_queues; + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; return hash; } + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); - return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); + return (u16) (((u64) hash * qcount) >> 32) + qoffset; } -EXPORT_SYMBOL(skb_tx_hash); +EXPORT_SYMBOL(__skb_tx_hash); static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { @@ -2119,26 +2229,70 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) return queue_index; } +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { int queue_index; const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) { + if (dev->real_num_tx_queues == 1) + queue_index = 0; + else if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb); queue_index = dev_cap_txqueue(dev, queue_index); } else { struct sock *sk = skb->sk; queue_index = sk_tx_queue_get(sk); - if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) { - queue_index = 0; - if (dev->real_num_tx_queues > 1) + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int old_index = queue_index; + + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) queue_index = skb_tx_hash(dev, skb); - if (sk) { - struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); + if (queue_index != old_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); if (dst && skb_dst(skb) == dst) sk_tx_queue_set(sk, queue_index); @@ -2155,15 +2309,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); - bool contended = qdisc_is_running(q); + bool contended; int rc; + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits __QDISC_STATE_RUNNING owner to get the lock more often * and dequeue packets faster. */ + contended = qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); @@ -2180,7 +2337,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, */ if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) skb_dst_force(skb); - __qdisc_update_bstats(q, skb->len); + + qdisc_bstats_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -2193,7 +2352,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { skb_dst_force(skb); - rc = qdisc_enqueue_root(skb, q); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -2412,6 +2571,54 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + u16 tcpu; + + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb->rxhash & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->cpu = next_cpu; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, tcpu).input_queue_head; + } + + return rflow; +} + /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. @@ -2442,7 +2649,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, map = rcu_dereference(rxqueue->rps_map); if (map) { - if (map->len == 1) { + if (map->len == 1 && + !rcu_dereference_raw(rxqueue->rps_flow_table)) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; @@ -2482,12 +2690,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) { - tcpu = rflow->cpu = next_cpu; - if (tcpu != RPS_NO_CPU) - rflow->last_qtail = per_cpu(softnet_data, - tcpu).input_queue_head; - } + rflow->last_qtail)) >= 0)) + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; @@ -2508,6 +2713,46 @@ done: return cpu; } +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { @@ -2712,14 +2957,6 @@ static void net_tx_action(struct softirq_action *h) } } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) -{ - atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) /* This hook is defined here for ATM LANE */ @@ -2733,8 +2970,8 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions * a compare and 2 stores extra right now if we dont have it on * but have CONFIG_NET_CLS_ACT - * NOTE: This doesnt stop any functionality; if you dont have - * the ingress scheduler, you just cant add policies on ingress. + * NOTE: This doesn't stop any functionality; if you dont have + * the ingress scheduler, you just can't add policies on ingress. * */ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) @@ -2803,6 +3040,8 @@ out: * on a failure. * * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, @@ -2837,64 +3076,32 @@ void netdev_rx_handler_unregister(struct net_device *dev) } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); -static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, - struct net_device *master) +static void vlan_on_bond_hook(struct sk_buff *skb) { - if (skb->pkt_type == PACKET_HOST) { - u16 *dest = (u16 *) eth_hdr(skb)->h_dest; - - memcpy(dest, master->dev_addr, ETH_ALEN); - } -} - -/* On bonding slaves other than the currently active slave, suppress - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and - * ARP on active-backup slaves with arp_validate enabled. - */ -int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) -{ - struct net_device *dev = skb->dev; - - if (master->priv_flags & IFF_MASTER_ARPMON) - dev->last_rx = jiffies; - - if ((master->priv_flags & IFF_MASTER_ALB) && - (master->priv_flags & IFF_BRIDGE_PORT)) { - /* Do address unmangle. The local destination address - * will be always the one master has. Provides the right - * functionality in a bridge. - */ - skb_bond_set_mac_by_master(skb, master); - } - - if (dev->priv_flags & IFF_SLAVE_INACTIVE) { - if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && - skb->protocol == __cpu_to_be16(ETH_P_ARP)) - return 0; - - if (master->priv_flags & IFF_MASTER_ALB) { - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) - return 0; - } - if (master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __cpu_to_be16(ETH_P_SLOW)) - return 0; + /* + * Make sure ARP frames received on VLAN interfaces stacked on + * bonding interfaces still make their way to any base bonding + * device that may have registered for a specific ptype. + */ + if (skb->dev->priv_flags & IFF_802_1Q_VLAN && + vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING && + skb->protocol == htons(ETH_P_ARP)) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - return 1; + if (!skb2) + return; + skb2->dev = vlan_dev_real_dev(skb->dev); + netif_rx(skb2); } - return 0; } -EXPORT_SYMBOL(__skb_bond_should_drop); static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *master; - struct net_device *null_or_orig; - struct net_device *orig_or_bond; + struct net_device *null_or_dev; + bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -2909,28 +3116,8 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex; - - /* - * bonding note: skbs received on inactive slaves should only - * be delivered to pkt handlers that are exact matches. Also - * the deliver_no_wcard flag will be set. If packet handlers - * are sensitive to duplicate packets these skbs will need to - * be dropped at the handler. - */ - null_or_orig = NULL; orig_dev = skb->dev; - master = ACCESS_ONCE(orig_dev->master); - if (skb->deliver_no_wcard) - null_or_orig = orig_dev; - else if (master) { - if (skb_bond_should_drop(skb, master)) { - skb->deliver_no_wcard = 1; - null_or_orig = orig_dev; /* deliver only exact match */ - } else - skb->dev = master; - } - __this_cpu_inc(softnet_data.processed); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; @@ -2939,6 +3126,10 @@ static int __netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); +another_round: + + __this_cpu_inc(softnet_data.processed); + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -2947,8 +3138,7 @@ static int __netif_receive_skb(struct sk_buff *skb) #endif list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev) { + if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -2962,16 +3152,24 @@ static int __netif_receive_skb(struct sk_buff *skb) ncls: #endif - /* Handle special case of bridge or macvlan */ rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - skb = rx_handler(skb); - if (!skb) + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: goto out; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } } if (vlan_tx_tag_present(skb)) { @@ -2986,24 +3184,17 @@ ncls: goto out; } - /* - * Make sure frames received on VLAN interfaces stacked on - * bonding interfaces still make their way to any base bonding - * device that may have registered for a specific ptype. The - * handler may have to adjust skb->dev and orig_dev. - */ - orig_or_bond = orig_dev; - if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && - (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { - orig_or_bond = vlan_dev_real_dev(skb->dev); - } + vlan_on_bond_hook(skb); + + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && (ptype->dev == null_or_orig || - ptype->dev == skb->dev || ptype->dev == orig_dev || - ptype->dev == orig_or_bond)) { + if (ptype->type == type && + (ptype->dev == null_or_dev || ptype->dev == skb->dev || + ptype->dev == orig_dev)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -3311,6 +3502,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; + skb->dev = napi->dev; + skb->skb_iif = 0; napi->skb = skb; } @@ -3607,7 +3800,7 @@ static void net_rx_action(struct softirq_action *h) * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid - * accidently calling ->poll() when NAPI is not scheduled. + * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { @@ -3798,12 +3991,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = (v == SEQ_START_TOKEN) ? - first_net_device(seq_file_net(seq)) : - next_net_device((struct net_device *)v); + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + dev = first_net_device_rcu(seq_file_net(seq)); + else + dev = next_net_device_rcu(dev); ++*pos; - return rcu_dereference(dev); + return dev; } void dev_seq_stop(struct seq_file *seq, void *v) @@ -4087,15 +4283,14 @@ static int __init dev_proc_init(void) /** - * netdev_set_master - set up master/slave pair + * netdev_set_master - set up master pointer * @slave: slave device * @master: new master device * * Changes the master device of the slave. Pass %NULL to break the * bonding. The caller must hold the RTNL semaphore. On a failure * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. + * are adjusted and the function returns zero. */ int netdev_set_master(struct net_device *slave, struct net_device *master) { @@ -4115,6 +4310,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) synchronize_net(); dev_put(old); } + return 0; +} +EXPORT_SYMBOL(netdev_set_master); + +/** + * netdev_set_bond_master - set up bonding master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success %RTM_NEWLINK is sent + * to the routing socket and the function returns zero. + */ +int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +{ + int err; + + ASSERT_RTNL(); + + err = netdev_set_master(slave, master); + if (err) + return err; if (master) slave->flags |= IFF_SLAVE; else @@ -4123,7 +4341,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); return 0; } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_set_bond_master); static void dev_change_rx_flags(struct net_device *dev, int flags) { @@ -4460,6 +4678,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(dev_set_mtu); /** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address @@ -4544,7 +4773,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm * is never reached */ WARN_ON(1); - err = -EINVAL; + err = -ENOTTY; break; } @@ -4812,7 +5041,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: - return -EINVAL; + return -ENOTTY; /* * Unknown or private ioctl. @@ -4833,7 +5062,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Take care of Wireless Extensions */ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) return wext_handle_ioctl(net, &ifr, cmd, arg); - return -EINVAL; + return -ENOTTY; } } @@ -4887,10 +5116,12 @@ static void rollback_registered_many(struct list_head *head) } BUG_ON(dev->reg_state != NETREG_REGISTERED); + } - /* If device is running, close it first. */ - dev_close(dev); + /* If device is running, close it first. */ + dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); @@ -4945,40 +5176,62 @@ static void rollback_registered(struct net_device *dev) list_add(&dev->unreg_list, &single); rollback_registered_many(&single); + list_del(&single); } -unsigned long netdev_fix_features(unsigned long features, const char *name) +u32 netdev_fix_features(struct net_device *dev, u32 features) { + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_info(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + if ((features & NETIF_F_NO_CSUM) && + (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_info(dev, "mixed no checksumming and other settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + } + /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " - "checksum feature.\n", name); + netdev_info(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); features &= ~NETIF_F_SG; } /* TSO requires that SG is present as well. */ - if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " - "SG feature.\n", name); - features &= ~NETIF_F_TSO; + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_info(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } + + /* TSO ECN requires that TSO is present as well. */ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; + + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; } + /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { - if (!(features & NETIF_F_GEN_CSUM)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_HW_CSUM feature.\n", - name); + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_info(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; } if (!(features & NETIF_F_SG)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_SG feature.\n", name); + netdev_info(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); features &= ~NETIF_F_UFO; } } @@ -4987,6 +5240,37 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +void netdev_update_features(struct net_device *dev) +{ + u32 features; + int err = 0; + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return; + + netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n", + dev->features, features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (!err) + dev->features = features; + else if (err < 0) + netdev_err(dev, + "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", + err, features, dev->features); +} +EXPORT_SYMBOL(netdev_update_features); + /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from @@ -5014,9 +5298,9 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +#ifdef CONFIG_RPS static int netif_alloc_rx_queues(struct net_device *dev) { -#ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; @@ -5029,15 +5313,22 @@ static int netif_alloc_rx_queues(struct net_device *dev) } dev->_rx = rx; - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ for (i = 0; i < count; i++) - rx[i].first = rx; -#endif + rx[i].dev = dev; return 0; } +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +} static int netif_alloc_netdev_queues(struct net_device *dev) { @@ -5053,25 +5344,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return -ENOMEM; } dev->_tx = tx; - return 0; -} - -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; - - /* Initialize queue lock */ - spin_lock_init(&queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); - queue->xmit_lock_owner = -1; -} -static void netdev_init_queues(struct net_device *dev) -{ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); + + return 0; } /** @@ -5110,16 +5387,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = netif_alloc_rx_queues(dev); - if (ret) - goto out; - - ret = netif_alloc_netdev_queues(dev); - if (ret) - goto out; - - netdev_init_queues(dev); - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5138,27 +5405,19 @@ int register_netdevice(struct net_device *dev) if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Fix illegal checksum combinations */ - if ((dev->features & NETIF_F_HW_CSUM) && - (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); - } + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = dev->features & dev->hw_features; - if ((dev->features & NETIF_F_NO_CSUM) && - (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + /* Avoid warning from netdev_fix_features() for GSO without SG */ + if (!(dev->wanted_features & NETIF_F_SG)) { + dev->wanted_features &= ~NETIF_F_GSO; + dev->features &= ~NETIF_F_GSO; } - dev->features = netdev_fix_features(dev->features, dev->name); - - /* Enable software GSO if SG is supported. */ - if (dev->features & NETIF_F_SG) - dev->features |= NETIF_F_GSO; - /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features * are enabled only if supported by underlying device. @@ -5175,6 +5434,8 @@ int register_netdevice(struct net_device *dev) goto err_uninit; dev->reg_state = NETREG_REGISTERED; + netdev_update_features(dev); + /* * Default initial state at registry is that the * device is present. @@ -5423,34 +5684,6 @@ void netdev_run_todo(void) } } -/** - * dev_txq_stats_fold - fold tx_queues stats - * @dev: device to get statistics from - * @stats: struct rtnl_link_stats64 to hold results - */ -void dev_txq_stats_fold(const struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - spin_lock_bh(&txq->_xmit_lock); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - spin_unlock_bh(&txq->_xmit_lock); - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } -} -EXPORT_SYMBOL(dev_txq_stats_fold); - /* Convert net_device_stats to rtnl_link_stats64. They have the same * fields in the same order, with only the type differing. */ @@ -5494,7 +5727,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else { netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; @@ -5520,18 +5752,20 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) } /** - * alloc_netdev_mq - allocate network device + * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device - * @queue_count: the number of subqueues to allocate + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subquue structs - * for each queue on the device at the end of the netdevice. + * for each queue on the device. */ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, - void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t alloc_size; @@ -5539,12 +5773,20 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); - if (queue_count < 1) { + if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device " "with zero queues.\n"); return NULL; } +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero RX queues.\n"); + return NULL; + } +#endif + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ @@ -5575,14 +5817,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->num_tx_queues = queue_count; - dev->real_num_tx_queues = queue_count; - -#ifdef CONFIG_RPS - dev->num_rx_queues = queue_count; - dev->real_num_rx_queues = queue_count; -#endif - dev->gso_max_size = GSO_MAX_SIZE; INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); @@ -5592,16 +5826,39 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + +#ifdef CONFIG_RPS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; +#endif + strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; return dev; +free_all: + free_netdev(dev); + return NULL; + free_pcpu: free_percpu(dev->pcpu_refcnt); + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + free_p: kfree(p); return NULL; } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device @@ -5618,6 +5875,9 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif kfree(rcu_dereference_raw(dev->ingress_queue)); @@ -5895,8 +6155,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -unsigned long netdev_increment_features(unsigned long all, unsigned long one, - unsigned long mask) +u32 netdev_increment_features(u32 all, u32 one, u32 mask) { /* If device needs checksumming, downgrade to it. */ if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) @@ -6081,7 +6340,7 @@ static void __net_exit default_device_exit(struct net *net) if (dev->rtnl_link_ops) continue; - /* Push remaing network devices to init_net */ + /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { @@ -6096,7 +6355,7 @@ static void __net_exit default_device_exit(struct net *net) static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network - * namespace. Do this in the reverse order of registeration. + * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ @@ -6114,6 +6373,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); + list_del(&dev_kill_list); rtnl_unlock(); } |