diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 596 |
1 files changed, 417 insertions, 179 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 5e37e9abe8c5..130d64220229 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> +#include <linux/errqueue.h> #include "net-sysfs.h" @@ -148,6 +149,9 @@ struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -1082,6 +1086,7 @@ static int dev_get_valid_name(struct net *net, */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1109,10 +1114,17 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } @@ -1141,6 +1153,8 @@ rollback: write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1207,7 +1221,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } @@ -1245,7 +1263,7 @@ static int __dev_open(struct net_device *dev) * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - netpoll_rx_disable(dev); + netpoll_poll_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1260,7 +1278,7 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - netpoll_rx_enable(dev); + netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); @@ -1312,6 +1330,9 @@ static int __dev_close_many(struct list_head *head) might_sleep(); list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1322,7 +1343,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); @@ -1341,6 +1362,7 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; + netpoll_poll_enable(dev); } return 0; @@ -1351,14 +1373,10 @@ static int __dev_close(struct net_device *dev) int retval; LIST_HEAD(single); - /* Temporarily disable netpoll until the interface is down */ - netpoll_rx_disable(dev); - list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); - netpoll_rx_enable(dev); return retval; } @@ -1396,14 +1414,9 @@ int dev_close(struct net_device *dev) if (dev->flags & IFF_UP) { LIST_HEAD(single); - /* Block netpoll rx while the interface is going down */ - netpoll_rx_disable(dev); - list_add(&dev->close_list, &single); dev_close_many(&single); list_del(&single); - - netpoll_rx_enable(dev); } return 0; } @@ -1643,8 +1656,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { unsigned int len; @@ -1663,6 +1675,30 @@ static inline bool is_skb_forwardable(struct net_device *dev, return false; } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1684,24 +1720,7 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - - skb_scrub_packet(skb, true); - skb->protocol = eth_type_trans(skb, dev); - - return netif_rx_internal(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -2284,10 +2303,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { + unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { @@ -2300,17 +2319,34 @@ __be16 skb_network_protocol(struct sk_buff *skb) type = eth->h_proto; } - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; + /* if skb->protocol is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (WARN_ON(vlan_depth < VLAN_HLEN)) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } + *depth = vlan_depth; + return type; } @@ -2324,12 +2360,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); - __skb_pull(skb, skb->mac_len); + __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { @@ -2386,8 +2423,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } @@ -2418,7 +2455,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2492,47 +2529,79 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + features &= skb->dev->mpls_features; + + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif + static netdev_features_t harmonize_features(struct sk_buff *skb, - const struct net_device *dev, - netdev_features_t features) + netdev_features_t features) { + int tmp; + __be16 type; + + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); + if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb))) { + !can_checksum_protocol(features, type)) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(dev, skb)) { + } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - netdev_features_t features = dev->features; + netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } - features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + features = netdev_intersect_features(features, + skb->dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; + features = netdev_intersect_features(features, + NETIF_F_SG | + NETIF_F_HIGHDMA | + NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } -EXPORT_SYMBOL(netif_skb_dev_features); +EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -2691,8 +2760,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) @@ -2812,6 +2881,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2878,6 +2950,7 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return rc; out: @@ -2950,7 +3023,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -2984,6 +3057,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u16 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); @@ -3012,7 +3086,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } skb_reset_network_header(skb); - if (!skb_get_hash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -3021,11 +3096,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, u16 next_cpu; struct rps_dev_flow *rflow; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; /* * If the desired CPU (where last recvmsg was done) is @@ -3054,7 +3128,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + tcpu = map->cpus[((u64) hash * map->len) >> 32]; if (cpu_online(tcpu)) { cpu = tcpu; @@ -3229,10 +3303,6 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; - net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); @@ -3343,7 +3413,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3353,7 +3423,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -3439,7 +3509,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * @@ -3493,11 +3563,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; @@ -3518,10 +3588,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) trace_netif_receive_skb(skb); - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3540,7 +3606,7 @@ another_round: if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto unlock; } @@ -3648,7 +3714,6 @@ drop: unlock: rcu_read_unlock(); -out: return ret; } @@ -3838,10 +3903,10 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs |= p->vlan_tci ^ skb->vlan_tci; if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), + skb_mac_header(skb), maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } @@ -3864,6 +3929,27 @@ static void skb_gro_reset_offset(struct sk_buff *skb) } } +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + pinfo->frags[0].page_offset += grow; + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -3872,14 +3958,14 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff struct list_head *head = &offload_base; int same_flow; enum gro_result ret; + int grow; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + if (!(skb->dev->features & NETIF_F_GRO)) goto normal; if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; - skb_gro_reset_offset(skb); gro_list_prepare(napi, skb); NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ @@ -3937,33 +4023,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->tail += grow; - skb->data_len -= grow; - - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); - } - } - + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); ok: return ret; @@ -4031,6 +4100,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { trace_napi_gro_receive_entry(skb); + skb_gro_reset_offset(skb); + return napi_skb_finish(dev_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -4043,6 +4114,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; } @@ -4059,12 +4133,16 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -4073,7 +4151,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * napi_reuse_skb(napi, skb); break; - case GRO_HELD: case GRO_MERGED: break; } @@ -4081,17 +4158,41 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * return ret; } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); napi->skb = NULL; - if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) { - napi_reuse_skb(napi, skb); - return NULL; + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header_fast(skb, 0); + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; } - skb->protocol = eth_type_trans(skb, skb->dev); + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; return skb; } @@ -4128,8 +4229,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); + smp_call_function_single_async(remsd->cpu, + &remsd->csd); remsd = next; } } else @@ -4153,9 +4254,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -4169,24 +4269,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); @@ -4216,7 +4316,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); @@ -4507,6 +4607,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list) EXPORT_SYMBOL(netdev_adjacent_get_private); /** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +/** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position @@ -4553,8 +4679,7 @@ void *netdev_lower_get_next_private(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = lower->list.next; + *iter = lower->list.next; return lower->private; } @@ -4582,14 +4707,39 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = &lower->list; + *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant @@ -4649,9 +4799,14 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev, sysfs_remove_link(&(dev->dev.kobj), linkname); } -#define netdev_adjacent_is_neigh_list(dev, dev_list) \ - (dev_list == &dev->adj_list.upper || \ - dev_list == &dev->adj_list.lower) +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, @@ -4681,7 +4836,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; @@ -4702,7 +4857,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; remove_symlinks: - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: kfree(adj); @@ -4735,7 +4890,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); @@ -5005,11 +5160,65 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; + struct net *net = dev_net(dev); + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -5017,6 +5226,8 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) } list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -5039,6 +5250,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_dev_get_private); + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5276,13 +5511,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; @@ -5508,7 +5739,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); -static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { @@ -5565,10 +5796,6 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); - /* * Flush the unicast and multicast chains */ @@ -5578,6 +5805,10 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -5661,6 +5892,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; + return features; } @@ -5796,10 +6034,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -6236,6 +6471,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, &dev->stats); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += atomic_long_read(&dev->tx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -6272,25 +6508,24 @@ void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -6369,6 +6604,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; @@ -6380,11 +6616,6 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kfree(dev->_rx); -#endif - free_dev: netdev_freemem(dev); return NULL; @@ -6481,6 +6712,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -6490,6 +6724,7 @@ void unregister_netdevice_many(struct list_head *head) rollback_registered_many(head); list_for_each_entry(dev, head, unreg_list) net_set_todo(dev); + list_del(head); } } EXPORT_SYMBOL(unregister_netdevice_many); @@ -6595,6 +6830,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -6609,6 +6845,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); @@ -6781,12 +7018,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev, if (dev && dev->dev.parent) { r = dev_printk_emit(level[1] - '0', dev->dev.parent, - "%s %s %s: %pV", + "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), - netdev_name(dev), vaf); + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + r = printk("%s%s%s: %pV", level, netdev_name(dev), + netdev_reg_state(dev), vaf); } else { r = printk("%s(NULL net_device): %pV", level, vaf); } @@ -6938,14 +7177,13 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } |