diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 2 | ||||
-rw-r--r-- | net/core/dev.c | 1255 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 18 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 14 | ||||
-rw-r--r-- | net/core/dst.c | 93 | ||||
-rw-r--r-- | net/core/ethtool.c | 825 | ||||
-rw-r--r-- | net/core/fib_rules.c | 31 | ||||
-rw-r--r-- | net/core/filter.c | 482 | ||||
-rw-r--r-- | net/core/flow.c | 14 | ||||
-rw-r--r-- | net/core/gen_estimator.c | 9 | ||||
-rw-r--r-- | net/core/iovec.c | 20 | ||||
-rw-r--r-- | net/core/link_watch.c | 2 | ||||
-rw-r--r-- | net/core/neighbour.c | 14 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 487 | ||||
-rw-r--r-- | net/core/net-sysfs.h | 4 | ||||
-rw-r--r-- | net/core/net_namespace.c | 26 | ||||
-rw-r--r-- | net/core/netpoll.c | 47 | ||||
-rw-r--r-- | net/core/pktgen.c | 473 | ||||
-rw-r--r-- | net/core/request_sock.c | 5 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 292 | ||||
-rw-r--r-- | net/core/scm.c | 12 | ||||
-rw-r--r-- | net/core/skbuff.c | 61 | ||||
-rw-r--r-- | net/core/sock.c | 92 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 12 | ||||
-rw-r--r-- | net/core/timestamping.c | 10 | ||||
-rw-r--r-- | net/core/utils.c | 24 |
26 files changed, 2796 insertions, 1528 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index cd1e039c8755..18ac112ea7ae 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -177,7 +177,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, * interrupt level will suddenly eat the receive_queue. * * Look at current nfs client by the way... - * However, this function was corrent in any case. 8) + * However, this function was correct in any case. 8) */ unsigned long cpu_flags; diff --git a/net/core/dev.c b/net/core/dev.c index 78b5a89b0f40..d94537914a71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include <trace/events/skb.h> #include <linux/pci.h> #include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> #include "net-sysfs.h" @@ -743,34 +744,32 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** - * dev_getbyhwaddr - find a device by its hardware address + * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. + * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; - ASSERT_RTNL(); - - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } -EXPORT_SYMBOL(dev_getbyhwaddr); +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { @@ -949,7 +948,7 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) +static int dev_get_valid_name(struct net_device *dev, const char *name) { struct net *net; @@ -959,7 +958,7 @@ static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt if (!dev_valid_name(name)) return -EINVAL; - if (fmt && strchr(name, '%')) + if (strchr(name, '%')) return dev_alloc_name(dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; @@ -996,7 +995,7 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(dev, newname, 1); + err = dev_get_valid_name(dev, newname); if (err < 0) return err; @@ -1008,7 +1007,7 @@ rollback: } write_lock_bh(&dev_base_lock); - hlist_del(&dev->name_hlist); + hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); synchronize_rcu(); @@ -1116,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change); void dev_load(struct net *net, const char *name) { struct net_device *dev; + int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); - if (!dev && capable(CAP_NET_ADMIN)) - request_module("%s", name); + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } } EXPORT_SYMBOL(dev_load); @@ -1133,9 +1140,6 @@ static int __dev_open(struct net_device *dev) ASSERT_RTNL(); - /* - * Is it even present? - */ if (!netif_device_present(dev)) return -ENODEV; @@ -1144,9 +1148,6 @@ static int __dev_open(struct net_device *dev) if (ret) return ret; - /* - * Call device private open method - */ set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) @@ -1155,31 +1156,12 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - /* - * If it went open OK then: - */ - if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - /* - * Set the flags. - */ dev->flags |= IFF_UP; - - /* - * Enable NET_DMA - */ net_dmaengine_get(); - - /* - * Initialize multicasting status - */ dev_set_rx_mode(dev); - - /* - * Wakeup transmit queue engine - */ dev_activate(dev); } @@ -1202,22 +1184,13 @@ int dev_open(struct net_device *dev) { int ret; - /* - * Is it already up? - */ if (dev->flags & IFF_UP) return 0; - /* - * Open device - */ ret = __dev_open(dev); if (ret < 0) return ret; - /* - * ... and announce new interface. - */ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_UP, dev); @@ -1225,52 +1198,78 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close(struct net_device *dev) +static int __dev_close_many(struct list_head *head) { - const struct net_device_ops *ops = dev->netdev_ops; + struct net_device *dev; ASSERT_RTNL(); might_sleep(); - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ - call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + list_for_each_entry(dev, head, unreg_list) { + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); - clear_bit(__LINK_STATE_START, &dev->state); + clear_bit(__LINK_STATE_START, &dev->state); - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(). - * - * dev->stop() will invoke napi_disable() on all of it's - * napi_struct instances on this device. - */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + } - dev_deactivate(dev); + dev_deactivate_many(head); - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (ops->ndo_stop) - ops->ndo_stop(dev); + list_for_each_entry(dev, head, unreg_list) { + const struct net_device_ops *ops = dev->netdev_ops; - /* - * Device is now down. - */ + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; + dev->flags &= ~IFF_UP; + net_dmaengine_put(); + } - /* - * Shutdown NET_DMA - */ - net_dmaengine_put(); + return 0; +} +static int __dev_close(struct net_device *dev) +{ + int retval; + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + retval = __dev_close_many(&single); + list_del(&single); + return retval; +} + +static int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) + if (!(dev->flags & IFF_UP)) + list_move(&dev->unreg_list, &tmp_list); + + __dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + /* rollback_registered_many needs the complete original list */ + list_splice(&tmp_list, head); return 0; } @@ -1285,17 +1284,13 @@ static int __dev_close(struct net_device *dev) */ int dev_close(struct net_device *dev) { - if (!(dev->flags & IFF_UP)) - return 0; - - __dev_close(dev); - - /* - * Tell people we are down - */ - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); - call_netdevice_notifiers(NETDEV_DOWN, dev); + if (dev->flags & IFF_UP) { + LIST_HEAD(single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } return 0; } EXPORT_SYMBOL(dev_close); @@ -1311,26 +1306,25 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } - WARN_ON(dev->features & NETIF_F_LRO); + u32 flags; + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); } EXPORT_SYMBOL(dev_disable_lro); static int dev_boot_phase = 1; -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - /** * register_netdevice_notifier - register a network notifier block * @nb: notifier @@ -1432,6 +1426,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } +EXPORT_SYMBOL(call_netdevice_notifiers); /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); @@ -1462,6 +1457,27 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1485,8 +1501,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (unlikely(!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { + if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; @@ -1499,6 +1514,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1507,13 +1530,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb); static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - -#ifdef CONFIG_NET_CLS_ACT - if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp_set(skb); -#else - net_timestamp_set(skb); -#endif + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1523,10 +1541,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) if ((ptype->dev == dev || !ptype->dev) && (ptype->af_packet_priv == NULL || (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) break; + net_timestamp_set(skb2); + /* skb->nh should be correctly set by sender, so that the second statement is just protection against buggy protocols. @@ -1545,24 +1571,79 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype, skb->dev); + pt_prev = ptype; } } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues changed " + "invalidating tc mappings. Priority " + "traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues " + "changed. Priority %i to tc " + "mapping %i is no longer valid " + "setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + int rc; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; - if (dev->reg_state == NETREG_REGISTERED) { + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -1683,37 +1764,6 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -{ - int features = dev->features; - - if (vlan_tx_tag_present(skb)) - features &= dev->vlan_features; - - if (can_checksum_protocol(features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; - } - - return false; -} - /** * skb_dev_set -- assign a new device to a buffer * @skb: buffer for the new device @@ -1761,7 +1811,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out_set_summed; } - offset = skb->csum_start - skb_headroom(skb); + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1793,21 +1843,23 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; int err; - if (type == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh; + while (type == htons(ETH_P_8021Q)) { + struct vlan_hdr *vh; - if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) return ERR_PTR(-EINVAL); - veh = (struct vlan_ethhdr *)skb->data; - type = veh->h_vlan_encapsulated_proto; + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; } skb_reset_mac_header(skb); @@ -1821,8 +1873,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " - "ip_summed=%d", + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", info.driver, dev ? dev->features : 0L, skb->sk ? skb->sk->sk_route_caps : 0L, skb->len, skb->data_len, skb->ip_summed); @@ -1925,16 +1976,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) /** * dev_gso_segment - Perform emulated hardware segmentation on skb. * @skb: buffer to segment + * @features: device features as applicable to this skb * * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb) +static int dev_gso_segment(struct sk_buff *skb, int features) { - struct net_device *dev = skb->dev; struct sk_buff *segs; - int features = dev->features & ~(illegal_highdma(dev, skb) ? - NETIF_F_SG : 0); segs = skb_gso_segment(skb, features); @@ -1971,6 +2020,53 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} + +static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +{ + if (!can_checksum_protocol(features, protocol)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +u32 netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + u32 features = skb->dev->features; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } + + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; + return harmonize_features(skb, protocol, features); + } +} +EXPORT_SYMBOL(netif_skb_features); + /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or @@ -1979,17 +2075,13 @@ static inline void skb_orphan_try(struct sk_buff *skb) * support DMA from it. */ static inline int skb_needs_linearize(struct sk_buff *skb, - struct net_device *dev) + int features) { - int features = dev->features; - - if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb)) - features &= dev->vlan_features; - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) || - illegal_highdma(dev, skb)))); + ((skb_has_frag_list(skb) && + !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && + !(features & NETIF_F_SG))); } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, @@ -1999,20 +2091,24 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int rc = NETDEV_TX_OK; if (likely(!skb->next)) { - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); + u32 features; /* - * If device doesnt need skb->dst, release it right now while + * If device doesn't need skb->dst, release it right now while * its hot in this cpu cache */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + skb_orphan_try(skb); + features = netif_skb_features(skb); + if (vlan_tx_tag_present(skb) && - !(dev->features & NETIF_F_HW_VLAN_TX)) { + !(features & NETIF_F_HW_VLAN_TX)) { skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; @@ -2020,13 +2116,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) goto out_kfree_skb; if (skb->next) goto gso; } else { - if (skb_needs_linearize(skb, dev) && + if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; @@ -2035,9 +2131,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, skb->csum_start - - skb_headroom(skb)); - if (!dev_can_checksum(dev, skb) && + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -2058,7 +2154,7 @@ gso: nskb->next = NULL; /* - * If device doesnt need nskb->dst, release it right now while + * If device doesn't need nskb->dst, release it right now while * its hot in this cpu cache */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) @@ -2089,26 +2185,39 @@ out: static u32 hashrnd __read_mostly; -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) { u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely(hash >= dev->real_num_tx_queues)) - hash -= dev->real_num_tx_queues; + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; return hash; } + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); - return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); + return (u16) (((u64) hash * qcount) >> 32) + qoffset; } -EXPORT_SYMBOL(skb_tx_hash); +EXPORT_SYMBOL(__skb_tx_hash); static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { @@ -2123,26 +2232,70 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) return queue_index; } +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { int queue_index; const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) { + if (dev->real_num_tx_queues == 1) + queue_index = 0; + else if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb); queue_index = dev_cap_txqueue(dev, queue_index); } else { struct sock *sk = skb->sk; queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { - queue_index = 0; - if (dev->real_num_tx_queues > 1) + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int old_index = queue_index; + + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) queue_index = skb_tx_hash(dev, skb); - if (sk) { - struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); + if (queue_index != old_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); if (dst && skb_dst(skb) == dst) sk_tx_queue_set(sk, queue_index); @@ -2159,15 +2312,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); - bool contended = qdisc_is_running(q); + bool contended; int rc; + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits __QDISC_STATE_RUNNING owner to get the lock more often * and dequeue packets faster. */ + contended = qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); @@ -2184,7 +2340,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, */ if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) skb_dst_force(skb); - __qdisc_update_bstats(q, skb->len); + + qdisc_bstats_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -2197,7 +2355,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { skb_dst_force(skb); - rc = qdisc_enqueue_root(skb, q); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -2213,7 +2371,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 3 +#define RECURSION_LIMIT 10 /** * dev_queue_xmit - transmit a buffer @@ -2347,8 +2505,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, __u32 __skb_get_rxhash(struct sk_buff *skb) { int nhoff, hash = 0, poff; - struct ipv6hdr *ip6; - struct iphdr *ip; + const struct ipv6hdr *ip6; + const struct iphdr *ip; u8 ip_proto; u32 addr1, addr2, ihl; union { @@ -2363,7 +2521,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) (skb->data + nhoff); + ip = (const struct iphdr *) (skb->data + nhoff); if (ip->frag_off & htons(IP_MF | IP_OFFSET)) ip_proto = 0; else @@ -2376,7 +2534,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) (skb->data + nhoff); + ip6 = (const struct ipv6hdr *) (skb->data + nhoff); ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; @@ -2413,9 +2571,57 @@ EXPORT_SYMBOL(__skb_get_rxhash); #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + u16 tcpu; + + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb->rxhash & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->cpu = next_cpu; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, tcpu).input_queue_head; + } + + return rflow; +} + /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. @@ -2425,7 +2631,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map = NULL; + struct rps_map *map; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2444,15 +2650,16 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (rxqueue->rps_map) { - map = rcu_dereference(rxqueue->rps_map); - if (map && map->len == 1) { + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1 && + !rcu_dereference_raw(rxqueue->rps_flow_table)) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rxqueue->rps_flow_table) { + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { goto done; } @@ -2486,12 +2693,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) { - tcpu = rflow->cpu = next_cpu; - if (tcpu != RPS_NO_CPU) - rflow->last_qtail = per_cpu(softnet_data, - tcpu).input_queue_head; - } + rflow->last_qtail)) >= 0)) + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; @@ -2512,6 +2716,46 @@ done: return cpu; } +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { @@ -2716,14 +2960,6 @@ static void net_tx_action(struct softirq_action *h) } } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) -{ - atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) /* This hook is defined here for ATM LANE */ @@ -2737,8 +2973,8 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions * a compare and 2 stores extra right now if we dont have it on * but have CONFIG_NET_CLS_ACT - * NOTE: This doesnt stop any functionality; if you dont have - * the ingress scheduler, you just cant add policies on ingress. + * NOTE: This doesn't stop any functionality; if you dont have + * the ingress scheduler, you just can't add policies on ingress. * */ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) @@ -2807,6 +3043,8 @@ out: * on a failure. * * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, @@ -2841,64 +3079,13 @@ void netdev_rx_handler_unregister(struct net_device *dev) } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); -static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, - struct net_device *master) -{ - if (skb->pkt_type == PACKET_HOST) { - u16 *dest = (u16 *) eth_hdr(skb)->h_dest; - - memcpy(dest, master->dev_addr, ETH_ALEN); - } -} - -/* On bonding slaves other than the currently active slave, suppress - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and - * ARP on active-backup slaves with arp_validate enabled. - */ -int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) -{ - struct net_device *dev = skb->dev; - - if (master->priv_flags & IFF_MASTER_ARPMON) - dev->last_rx = jiffies; - - if ((master->priv_flags & IFF_MASTER_ALB) && - (master->priv_flags & IFF_BRIDGE_PORT)) { - /* Do address unmangle. The local destination address - * will be always the one master has. Provides the right - * functionality in a bridge. - */ - skb_bond_set_mac_by_master(skb, master); - } - - if (dev->priv_flags & IFF_SLAVE_INACTIVE) { - if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && - skb->protocol == __cpu_to_be16(ETH_P_ARP)) - return 0; - - if (master->priv_flags & IFF_MASTER_ALB) { - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) - return 0; - } - if (master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __cpu_to_be16(ETH_P_SLOW)) - return 0; - - return 1; - } - return 0; -} -EXPORT_SYMBOL(__skb_bond_should_drop); - static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *master; - struct net_device *null_or_orig; - struct net_device *orig_or_bond; + struct net_device *null_or_dev; + bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -2913,28 +3100,8 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex; - - /* - * bonding note: skbs received on inactive slaves should only - * be delivered to pkt handlers that are exact matches. Also - * the deliver_no_wcard flag will be set. If packet handlers - * are sensitive to duplicate packets these skbs will need to - * be dropped at the handler. - */ - null_or_orig = NULL; orig_dev = skb->dev; - master = ACCESS_ONCE(orig_dev->master); - if (skb->deliver_no_wcard) - null_or_orig = orig_dev; - else if (master) { - if (skb_bond_should_drop(skb, master)) { - skb->deliver_no_wcard = 1; - null_or_orig = orig_dev; /* deliver only exact match */ - } else - skb->dev = master; - } - __this_cpu_inc(softnet_data.processed); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; @@ -2943,6 +3110,16 @@ static int __netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); +another_round: + + __this_cpu_inc(softnet_data.processed); + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -2951,8 +3128,7 @@ static int __netif_receive_skb(struct sk_buff *skb) #endif list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev) { + if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -2966,16 +3142,24 @@ static int __netif_receive_skb(struct sk_buff *skb) ncls: #endif - /* Handle special case of bridge or macvlan */ rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - skb = rx_handler(skb); - if (!skb) + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: goto out; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } } if (vlan_tx_tag_present(skb)) { @@ -2983,31 +3167,22 @@ ncls: ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_hwaccel_do_receive(&skb)) { + if (vlan_do_receive(&skb)) { ret = __netif_receive_skb(skb); goto out; } else if (unlikely(!skb)) goto out; } - /* - * Make sure frames received on VLAN interfaces stacked on - * bonding interfaces still make their way to any base bonding - * device that may have registered for a specific ptype. The - * handler may have to adjust skb->dev and orig_dev. - */ - orig_or_bond = orig_dev; - if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && - (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { - orig_or_bond = vlan_dev_real_dev(skb->dev); - } + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && (ptype->dev == null_or_orig || - ptype->dev == skb->dev || ptype->dev == orig_dev || - ptype->dev == orig_or_bond)) { + if (ptype->type == type && + (ptype->dev == null_or_dev || ptype->dev == skb->dev || + ptype->dev == orig_dev)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -3315,6 +3490,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; + skb->dev = napi->dev; + skb->skb_iif = 0; napi->skb = skb; } @@ -3611,7 +3788,7 @@ static void net_rx_action(struct softirq_action *h) * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid - * accidently calling ->poll() when NAPI is not scheduled. + * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { @@ -3802,12 +3979,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = (v == SEQ_START_TOKEN) ? - first_net_device(seq_file_net(seq)) : - next_net_device((struct net_device *)v); + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + dev = first_net_device_rcu(seq_file_net(seq)); + else + dev = next_net_device_rcu(dev); ++*pos; - return rcu_dereference(dev); + return dev; } void dev_seq_stop(struct seq_file *seq, void *v) @@ -4091,15 +4271,14 @@ static int __init dev_proc_init(void) /** - * netdev_set_master - set up master/slave pair + * netdev_set_master - set up master pointer * @slave: slave device * @master: new master device * * Changes the master device of the slave. Pass %NULL to break the * bonding. The caller must hold the RTNL semaphore. On a failure * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. + * are adjusted and the function returns zero. */ int netdev_set_master(struct net_device *slave, struct net_device *master) { @@ -4119,6 +4298,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) synchronize_net(); dev_put(old); } + return 0; +} +EXPORT_SYMBOL(netdev_set_master); + +/** + * netdev_set_bond_master - set up bonding master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success %RTM_NEWLINK is sent + * to the routing socket and the function returns zero. + */ +int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +{ + int err; + + ASSERT_RTNL(); + + err = netdev_set_master(slave, master); + if (err) + return err; if (master) slave->flags |= IFF_SLAVE; else @@ -4127,7 +4329,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); return 0; } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_set_bond_master); static void dev_change_rx_flags(struct net_device *dev, int flags) { @@ -4296,6 +4498,30 @@ void dev_set_rx_mode(struct net_device *dev) } /** + * dev_ethtool_get_settings - call device's ethtool_ops::get_settings() + * @dev: device + * @cmd: memory area for ethtool_ops::get_settings() result + * + * The cmd arg is initialized properly (cleared and + * ethtool_cmd::cmd field set to ETHTOOL_GSET). + * + * Return device's ethtool_ops::get_settings() result value or + * -EOPNOTSUPP when device doesn't expose + * ethtool_ops::get_settings() operation. + */ +int dev_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(cmd, 0, sizeof(struct ethtool_cmd)); + cmd->cmd = ETHTOOL_GSET; + return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(dev_ethtool_get_settings); + +/** * dev_get_flags - get flags reported to userspace * @dev: device * @@ -4464,6 +4690,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(dev_set_mtu); /** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address @@ -4548,7 +4785,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm * is never reached */ WARN_ON(1); - err = -EINVAL; + err = -ENOTTY; break; } @@ -4816,7 +5053,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: - return -EINVAL; + return -ENOTTY; /* * Unknown or private ioctl. @@ -4837,7 +5074,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Take care of Wireless Extensions */ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) return wext_handle_ioctl(net, &ifr, cmd, arg); - return -EINVAL; + return -ENOTTY; } } @@ -4889,12 +5126,14 @@ static void rollback_registered_many(struct list_head *head) list_del(&dev->unreg_list); continue; } - + dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); + } - /* If device is running, close it first. */ - dev_close(dev); + /* If device is running, close it first. */ + dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); @@ -4949,40 +5188,62 @@ static void rollback_registered(struct net_device *dev) list_add(&dev->unreg_list, &single); rollback_registered_many(&single); + list_del(&single); } -unsigned long netdev_fix_features(unsigned long features, const char *name) +u32 netdev_fix_features(struct net_device *dev, u32 features) { + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + if ((features & NETIF_F_NO_CSUM) && + (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed no checksumming and other settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + } + /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " - "checksum feature.\n", name); + netdev_dbg(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); features &= ~NETIF_F_SG; } /* TSO requires that SG is present as well. */ - if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " - "SG feature.\n", name); - features &= ~NETIF_F_TSO; + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } + + /* TSO ECN requires that TSO is present as well. */ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; + + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; } + /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { - if (!(features & NETIF_F_GEN_CSUM)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_HW_CSUM feature.\n", - name); + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; } if (!(features & NETIF_F_SG)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_SG feature.\n", name); + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); features &= ~NETIF_F_UFO; } } @@ -4991,6 +5252,75 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +int __netdev_update_features(struct net_device *dev) +{ + u32 features; + int err = 0; + + ASSERT_RTNL(); + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return 0; + + netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", + dev->features, features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (unlikely(err < 0)) { + netdev_err(dev, + "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", + err, features, dev->features); + return -1; + } + + if (!err) + dev->features = features; + + return 1; +} + +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); + +/** + * netdev_change_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. + */ +void netdev_change_features(struct net_device *dev) +{ + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from @@ -5018,9 +5348,9 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +#ifdef CONFIG_RPS static int netif_alloc_rx_queues(struct net_device *dev) { -#ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; @@ -5033,15 +5363,22 @@ static int netif_alloc_rx_queues(struct net_device *dev) } dev->_rx = rx; - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ for (i = 0; i < count; i++) - rx[i].first = rx; -#endif + rx[i].dev = dev; return 0; } +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +} static int netif_alloc_netdev_queues(struct net_device *dev) { @@ -5057,25 +5394,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return -ENOMEM; } dev->_tx = tx; - return 0; -} -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; - - /* Initialize queue lock */ - spin_lock_init(&queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); - queue->xmit_lock_owner = -1; -} - -static void netdev_init_queues(struct net_device *dev) -{ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); + + return 0; } /** @@ -5114,16 +5437,10 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = netif_alloc_rx_queues(dev); - if (ret) - goto out; - - ret = netif_alloc_netdev_queues(dev); - if (ret) + ret = dev_get_valid_name(dev, dev->name); + if (ret < 0) goto out; - netdev_init_queues(dev); - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5134,35 +5451,25 @@ int register_netdevice(struct net_device *dev) } } - ret = dev_get_valid_name(dev, dev->name, 0); - if (ret) - goto err_uninit; - dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Fix illegal checksum combinations */ - if ((dev->features & NETIF_F_HW_CSUM) && - (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); - } + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = dev->features & dev->hw_features; - if ((dev->features & NETIF_F_NO_CSUM) && - (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + /* Turn on no cache copy if HW is doing checksum */ + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if ((dev->features & NETIF_F_ALL_CSUM) && + !(dev->features & NETIF_F_NO_CSUM)) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; } - dev->features = netdev_fix_features(dev->features, dev->name); - - /* Enable software GSO if SG is supported. */ - if (dev->features & NETIF_F_SG) - dev->features |= NETIF_F_GSO; - /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features * are enabled only if supported by underlying device. @@ -5179,6 +5486,8 @@ int register_netdevice(struct net_device *dev) goto err_uninit; dev->reg_state = NETREG_REGISTERED; + __netdev_update_features(dev); + /* * Default initial state at registry is that the * device is present. @@ -5274,19 +5583,7 @@ int register_netdev(struct net_device *dev) int err; rtnl_lock(); - - /* - * If the name is a format string the caller wants us to do a - * name allocation. - */ - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto out; - } - err = register_netdevice(dev); -out: rtnl_unlock(); return err; } @@ -5416,7 +5713,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(dev->ip6_ptr); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) @@ -5427,34 +5724,6 @@ void netdev_run_todo(void) } } -/** - * dev_txq_stats_fold - fold tx_queues stats - * @dev: device to get statistics from - * @stats: struct rtnl_link_stats64 to hold results - */ -void dev_txq_stats_fold(const struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - spin_lock_bh(&txq->_xmit_lock); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - spin_unlock_bh(&txq->_xmit_lock); - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } -} -EXPORT_SYMBOL(dev_txq_stats_fold); - /* Convert net_device_stats to rtnl_link_stats64. They have the same * fields in the same order, with only the type differing. */ @@ -5498,7 +5767,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else { netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; @@ -5524,18 +5792,20 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) } /** - * alloc_netdev_mq - allocate network device + * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device - * @queue_count: the number of subqueues to allocate + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subquue structs - * for each queue on the device at the end of the netdevice. + * for each queue on the device. */ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, - void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t alloc_size; @@ -5543,12 +5813,20 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); - if (queue_count < 1) { + if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device " "with zero queues.\n"); return NULL; } +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero RX queues.\n"); + return NULL; + } +#endif + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ @@ -5579,14 +5857,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->num_tx_queues = queue_count; - dev->real_num_tx_queues = queue_count; - -#ifdef CONFIG_RPS - dev->num_rx_queues = queue_count; - dev->real_num_rx_queues = queue_count; -#endif - dev->gso_max_size = GSO_MAX_SIZE; INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); @@ -5596,16 +5866,39 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + +#ifdef CONFIG_RPS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; +#endif + strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; return dev; +free_all: + free_netdev(dev); + return NULL; + free_pcpu: free_percpu(dev->pcpu_refcnt); + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + free_p: kfree(p); return NULL; } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device @@ -5622,6 +5915,9 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif kfree(rcu_dereference_raw(dev->ingress_queue)); @@ -5769,7 +6065,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(dev, pat, 1)) + if (dev_get_valid_name(dev, pat) < 0) goto out; } @@ -5899,32 +6195,22 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -unsigned long netdev_increment_features(unsigned long all, unsigned long one, - unsigned long mask) +u32 netdev_increment_features(u32 all, u32 one, u32 mask) { - /* If device needs checksumming, downgrade to it. */ - if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) - all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); - else if (mask & NETIF_F_ALL_CSUM) { - /* If one device supports v4/v6 checksumming, set for all. */ - if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && - !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - } + if (mask & NETIF_F_GEN_CSUM) + mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_VLAN_CHALLENGED; - /* If one device supports hw checksumming, set for all. */ - if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= NETIF_F_HW_CSUM; - } - } + all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; - one |= NETIF_F_ALL_CSUM; + /* If device needs checksumming, downgrade to it. */ + if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) + all &= ~NETIF_F_NO_CSUM; - one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; - all |= one & mask & NETIF_F_ONE_FOR_ALL; + /* If one device supports hw checksumming, set for all. */ + if (all & NETIF_F_GEN_CSUM) + all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); return all; } @@ -6085,7 +6371,7 @@ static void __net_exit default_device_exit(struct net *net) if (dev->rtnl_link_ops) continue; - /* Push remaing network devices to init_net */ + /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { @@ -6100,7 +6386,7 @@ static void __net_exit default_device_exit(struct net *net) static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network - * namespace. Do this in the reverse order of registeration. + * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ @@ -6118,6 +6404,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); + list_del(&dev_kill_list); rtnl_unlock(); } diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 508f9c18992f..e2e66939ed00 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -68,14 +68,6 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); } -static void ha_rcu_free(struct rcu_head *head) -{ - struct netdev_hw_addr *ha; - - ha = container_of(head, struct netdev_hw_addr, rcu_head); - kfree(ha); -} - static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, unsigned char *addr, int addr_len, unsigned char addr_type, bool global) @@ -94,7 +86,7 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); + kfree_rcu(ha, rcu_head); list->count--; return 0; } @@ -144,7 +136,7 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, list_for_each_entry(ha, &from_list->list, list) { type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, addr_type); + __hw_addr_del(to_list, ha->addr, addr_len, type); } } EXPORT_SYMBOL(__hw_addr_del_multiple); @@ -197,7 +189,7 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); + kfree_rcu(ha, rcu_head); } list->count = 0; } @@ -357,8 +349,8 @@ EXPORT_SYMBOL(dev_addr_add_multiple); /** * dev_addr_del_multiple - Delete device addresses by another device * @to_dev: device where the addresses will be deleted - * @from_dev: device by which addresses the addresses will be deleted - * @addr_type: address type - 0 means type will used from from_dev + * @from_dev: device supplying the addresses to be deleted + * @addr_type: address type - 0 means type will be used from from_dev * * Deletes addresses in to device by the list of addresses in from device. * diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c78ce9..7f36b38e060f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -207,14 +207,6 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) rcu_read_unlock(); } - -static void free_dm_hw_stat(struct rcu_head *head) -{ - struct dm_hw_stat_delta *n; - n = container_of(head, struct dm_hw_stat_delta, rcu); - kfree(n); -} - static int set_all_monitor_traces(int state) { int rc = 0; @@ -245,7 +237,7 @@ static int set_all_monitor_traces(int state) list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { if (new_stat->dev == NULL) { list_del_rcu(&new_stat->list); - call_rcu(&new_stat->rcu, free_dm_hw_stat); + kfree_rcu(new_stat, rcu); } } break; @@ -314,7 +306,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, new_stat->dev = NULL; if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); - call_rcu(&new_stat->rcu, free_dm_hw_stat); + kfree_rcu(new_stat, rcu); break; } } @@ -350,7 +342,7 @@ static int __init init_net_drop_monitor(void) struct per_cpu_dm_data *data; int cpu, rc; - printk(KERN_INFO "Initalizing network drop monitor service\n"); + printk(KERN_INFO "Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); diff --git a/net/core/dst.c b/net/core/dst.c index 8abe628b79f1..81a4fa1c95ed 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -19,6 +19,7 @@ #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> +#include <linux/prefetch.h> #include <net/dst.h> @@ -33,9 +34,6 @@ * 3) This list is guarded by a mutex, * so that the gc_task and dst_dev_event() can be synchronized. */ -#if RT_CACHE_DEBUG >= 2 -static atomic_t dst_total = ATOMIC_INIT(0); -#endif /* * We want to keep lock & list close together @@ -69,10 +67,6 @@ static void dst_gc_task(struct work_struct *work) unsigned long expires = ~0L; struct dst_entry *dst, *next, head; struct dst_entry *last = &head; -#if RT_CACHE_DEBUG >= 2 - ktime_t time_start = ktime_get(); - struct timespec elapsed; -#endif mutex_lock(&dst_gc_mutex); next = dst_busy_list; @@ -146,15 +140,6 @@ loop: spin_unlock_bh(&dst_garbage.lock); mutex_unlock(&dst_gc_mutex); -#if RT_CACHE_DEBUG >= 2 - elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start)); - printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d" - " expires: %lu elapsed: %lu us\n", - atomic_read(&dst_total), delayed, work_performed, - expires, - elapsed.tv_sec * USEC_PER_SEC + - elapsed.tv_nsec / NSEC_PER_USEC); -#endif } int dst_discard(struct sk_buff *skb) @@ -164,7 +149,10 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -void *dst_alloc(struct dst_ops *ops) +const u32 dst_default_metrics[RTAX_MAX]; + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, int flags) { struct dst_entry *dst; @@ -172,17 +160,36 @@ void *dst_alloc(struct dst_ops *ops) if (ops->gc(ops)) return NULL; } - dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; - atomic_set(&dst->__refcnt, 0); + dst->child = NULL; + dst->dev = dev; + if (dev) + dev_hold(dev); dst->ops = ops; - dst->lastuse = jiffies; + dst_init_metrics(dst, dst_default_metrics, true); + dst->expires = 0UL; dst->path = dst; - dst->input = dst->output = dst_discard; -#if RT_CACHE_DEBUG >= 2 - atomic_inc(&dst_total); + dst->neighbour = NULL; + dst->hh = NULL; +#ifdef CONFIG_XFRM + dst->xfrm = NULL; #endif + dst->input = dst_discard; + dst->output = dst_discard; + dst->error = 0; + dst->obsolete = initial_obsolete; + dst->header_len = 0; + dst->trailer_len = 0; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->tclassid = 0; +#endif + atomic_set(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; + dst->next = NULL; dst_entries_add(ops, 1); return dst; } @@ -242,9 +249,6 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); -#if RT_CACHE_DEBUG >= 2 - atomic_dec(&dst_total); -#endif kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; @@ -282,6 +286,42 @@ void dst_release(struct dst_entry *dst) } EXPORT_SYMBOL(dst_release); +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + + if (p) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + kfree(p); + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} +EXPORT_SYMBOL(dst_cow_metrics_generic); + +/* Caller asserts that dst_metrics_read_only(dst) is false. */ +void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + unsigned long prev, new; + + new = (unsigned long) dst_default_metrics; + prev = cmpxchg(&dst->_metrics, old, new); + if (prev == old) + kfree(__DST_METRICS_PTR(old)); +} +EXPORT_SYMBOL(__dst_destroy_metrics_generic); + /** * skb_dst_set_noref - sets skb dst, without a reference * @skb: buffer @@ -370,6 +410,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, static struct notifier_block dst_dev_notifier = { .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ }; void __init dst_init(void) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 956a9f4971cb..84e7304532e6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -21,6 +21,8 @@ #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/rtnetlink.h> +#include <linux/sched.h> /* * Some useful ethtool_ops methods that're device independent. @@ -34,12 +36,6 @@ u32 ethtool_op_get_link(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_rx_csum(struct net_device *dev) -{ - return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_rx_csum); - u32 ethtool_op_get_tx_csum(struct net_device *dev) { return (dev->features & NETIF_F_ALL_CSUM) != 0; @@ -55,6 +51,7 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tx_csum); int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) { @@ -146,9 +143,24 @@ u32 ethtool_op_get_flags(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_flags); +/* Check if device can enable (or disable) particular feature coded in "data" + * argument. Flags "supported" describe features that can be toggled by device. + * If feature can not be toggled, it state (enabled or disabled) must match + * hardcoded device features state, otherwise flags are marked as invalid. + */ +bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) +{ + u32 features = dev->features & flags_dup_features; + /* "data" can contain only flags_dup_features bits, + * see __ethtool_set_flags */ + + return (features & ~supported) != (data & ~supported); +} +EXPORT_SYMBOL(ethtool_invalid_flags); + int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) { - if (data & ~supported) + if (ethtool_invalid_flags(dev, data, supported)) return -EINVAL; dev->features = ((dev->features & ~flags_dup_features) | @@ -171,6 +183,381 @@ EXPORT_SYMBOL(ethtool_ntuple_flush); /* Handlers for each ethtool command */ +#define ETHTOOL_DEV_FEATURE_WORDS 1 + +static void ethtool_get_features_compat(struct net_device *dev, + struct ethtool_get_features_block *features) +{ + if (!dev->ethtool_ops) + return; + + /* getting RX checksum */ + if (dev->ethtool_ops->get_rx_csum) + if (dev->ethtool_ops->get_rx_csum(dev)) + features[0].active |= NETIF_F_RXCSUM; + + /* mark legacy-changeable features */ + if (dev->ethtool_ops->set_sg) + features[0].available |= NETIF_F_SG; + if (dev->ethtool_ops->set_tx_csum) + features[0].available |= NETIF_F_ALL_CSUM; + if (dev->ethtool_ops->set_tso) + features[0].available |= NETIF_F_ALL_TSO; + if (dev->ethtool_ops->set_rx_csum) + features[0].available |= NETIF_F_RXCSUM; + if (dev->ethtool_ops->set_flags) + features[0].available |= flags_dup_features; +} + +static int ethtool_set_feature_compat(struct net_device *dev, + int (*legacy_set)(struct net_device *, u32), + struct ethtool_set_features_block *features, u32 mask) +{ + u32 do_set; + + if (!legacy_set) + return 0; + + if (!(features[0].valid & mask)) + return 0; + + features[0].valid &= ~mask; + + do_set = !!(features[0].requested & mask); + + if (legacy_set(dev, do_set) < 0) + netdev_info(dev, + "Legacy feature change (%s) failed for 0x%08x\n", + do_set ? "set" : "clear", mask); + + return 1; +} + +static int ethtool_set_features_compat(struct net_device *dev, + struct ethtool_set_features_block *features) +{ + int compat; + + if (!dev->ethtool_ops) + return 0; + + compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg, + features, NETIF_F_SG); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum, + features, NETIF_F_ALL_CSUM); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso, + features, NETIF_F_ALL_TSO); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum, + features, NETIF_F_RXCSUM); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_flags, + features, flags_dup_features); + + return compat; +} + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gfeatures cmd = { + .cmd = ETHTOOL_GFEATURES, + .size = ETHTOOL_DEV_FEATURE_WORDS, + }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { + { + .available = dev->hw_features, + .requested = dev->wanted_features, + .active = dev->features, + .never_changed = NETIF_F_NEVER_CHANGE, + }, + }; + u32 __user *sizeaddr; + u32 copy_size; + + ethtool_get_features_compat(dev, features); + + sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); + if (get_user(copy_size, sizeaddr)) + return -EFAULT; + + if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) + copy_size = ETHTOOL_DEV_FEATURE_WORDS; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_sfeatures cmd; + struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + int ret = 0; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + + if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) + return -EINVAL; + + if (copy_from_user(features, useraddr, sizeof(features))) + return -EFAULT; + + if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; + + if (ethtool_set_features_compat(dev, features)) + ret |= ETHTOOL_F_COMPAT; + + if (features[0].valid & ~dev->hw_features) { + features[0].valid &= dev->hw_features; + ret |= ETHTOOL_F_UNSUPPORTED; + } + + dev->wanted_features &= ~features[0].valid; + dev->wanted_features |= features[0].valid & features[0].requested; + __netdev_update_features(dev); + + if ((dev->wanted_features ^ dev->features) & features[0].valid) + ret |= ETHTOOL_F_WISH; + + return ret; +} + +static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { + /* NETIF_F_SG */ "tx-scatter-gather", + /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4", + /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded", + /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic", + /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6", + /* NETIF_F_HIGHDMA */ "highdma", + /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist", + /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert", + + /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse", + /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter", + /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", + /* NETIF_F_GSO */ "tx-generic-segmentation", + /* NETIF_F_LLTX */ "tx-lockless", + /* NETIF_F_NETNS_LOCAL */ "netns-local", + /* NETIF_F_GRO */ "rx-gro", + /* NETIF_F_LRO */ "rx-lro", + + /* NETIF_F_TSO */ "tx-tcp-segmentation", + /* NETIF_F_UFO */ "tx-udp-fragmentation", + /* NETIF_F_GSO_ROBUST */ "tx-gso-robust", + /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation", + /* NETIF_F_TSO6 */ "tx-tcp6-segmentation", + /* NETIF_F_FSO */ "tx-fcoe-segmentation", + "", + "", + + /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc", + /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp", + /* NETIF_F_FCOE_MTU */ "fcoe-mtu", + /* NETIF_F_NTUPLE */ "rx-ntuple-filter", + /* NETIF_F_RXHASH */ "rx-hashing", + /* NETIF_F_RXCSUM */ "rx-checksum", + /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy", + /* NETIF_F_LOOPBACK */ "loopback", +}; + +static int __ethtool_get_sset_count(struct net_device *dev, int sset) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (sset == ETH_SS_FEATURES) + return ARRAY_SIZE(netdev_features_strings); + + if (ops && ops->get_sset_count && ops->get_strings) + return ops->get_sset_count(dev, sset); + else + return -EOPNOTSUPP; +} + +static void __ethtool_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (stringset == ETH_SS_FEATURES) + memcpy(data, netdev_features_strings, + sizeof(netdev_features_strings)); + else + /* ops->get_strings is valid because checked earlier */ + ops->get_strings(dev, stringset, data); +} + +static u32 ethtool_get_feature_mask(u32 eth_cmd) +{ + /* feature masks of legacy discrete ethtool ops */ + + switch (eth_cmd) { + case ETHTOOL_GTXCSUM: + case ETHTOOL_STXCSUM: + return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + case ETHTOOL_GRXCSUM: + case ETHTOOL_SRXCSUM: + return NETIF_F_RXCSUM; + case ETHTOOL_GSG: + case ETHTOOL_SSG: + return NETIF_F_SG; + case ETHTOOL_GTSO: + case ETHTOOL_STSO: + return NETIF_F_ALL_TSO; + case ETHTOOL_GUFO: + case ETHTOOL_SUFO: + return NETIF_F_UFO; + case ETHTOOL_GGSO: + case ETHTOOL_SGSO: + return NETIF_F_GSO; + case ETHTOOL_GGRO: + case ETHTOOL_SGRO: + return NETIF_F_GRO; + default: + BUG(); + } +} + +static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return NULL; + + switch (ethcmd) { + case ETHTOOL_GTXCSUM: + return ops->get_tx_csum; + case ETHTOOL_GRXCSUM: + return ops->get_rx_csum; + case ETHTOOL_SSG: + return ops->get_sg; + case ETHTOOL_STSO: + return ops->get_tso; + case ETHTOOL_SUFO: + return ops->get_ufo; + default: + return NULL; + } +} + +static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) +{ + return !!(dev->features & NETIF_F_ALL_CSUM); +} + +static int ethtool_get_one_feature(struct net_device *dev, + char __user *useraddr, u32 ethcmd) +{ + u32 mask = ethtool_get_feature_mask(ethcmd); + struct ethtool_value edata = { + .cmd = ethcmd, + .data = !!(dev->features & mask), + }; + + /* compatibility with discrete get_ ops */ + if (!(dev->hw_features & mask)) { + u32 (*actor)(struct net_device *); + + actor = __ethtool_get_one_feature_actor(dev, ethcmd); + + /* bug compatibility with old get_rx_csum */ + if (ethcmd == ETHTOOL_GRXCSUM && !actor) + actor = __ethtool_get_rx_csum_oldbug; + + if (actor) + edata.data = actor(dev); + } + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_sg(struct net_device *dev, u32 data); +static int __ethtool_set_tso(struct net_device *dev, u32 data); +static int __ethtool_set_ufo(struct net_device *dev, u32 data); + +static int ethtool_set_one_feature(struct net_device *dev, + void __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata; + u32 mask; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + mask = ethtool_get_feature_mask(ethcmd); + mask &= dev->hw_features; + if (mask) { + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; + + __netdev_update_features(dev); + return 0; + } + + /* Driver is not converted to ndo_fix_features or does not + * support changing this offload. In the latter case it won't + * have corresponding ethtool_ops field set. + * + * Following part is to be removed after all drivers advertise + * their changeable features in netdev->hw_features and stop + * using discrete offload setting ops. + */ + + switch (ethcmd) { + case ETHTOOL_STXCSUM: + return __ethtool_set_tx_csum(dev, edata.data); + case ETHTOOL_SRXCSUM: + return __ethtool_set_rx_csum(dev, edata.data); + case ETHTOOL_SSG: + return __ethtool_set_sg(dev, edata.data); + case ETHTOOL_STSO: + return __ethtool_set_tso(dev, edata.data); + case ETHTOOL_SUFO: + return __ethtool_set_ufo(dev, edata.data); + default: + return -EOPNOTSUPP; + } +} + +int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + u32 changed; + + if (data & ~flags_dup_features) + return -EINVAL; + + /* legacy set_flags() op */ + if (dev->ethtool_ops->set_flags) { + if (unlikely(dev->hw_features & flags_dup_features)) + netdev_warn(dev, + "driver BUG: mixed hw_features and set_flags()\n"); + return dev->ethtool_ops->set_flags(dev, data); + } + + /* allow changing only bits set in hw_features */ + changed = (data ^ dev->features) & flags_dup_features; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (data & dev->hw_features); + + __netdev_update_features(dev); + + return 0; +} + static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; @@ -251,14 +638,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; - const struct ethtool_ops *ops = dev->ethtool_ops; u64 sset_mask; int i, idx = 0, n_bits = 0, ret, rc; u32 *info_buf = NULL; - if (!ops->get_sset_count) - return -EOPNOTSUPP; - if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; @@ -285,7 +668,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, if (!(sset_mask & (1ULL << i))) continue; - rc = ops->get_sset_count(dev, i); + rc = __ethtool_get_sset_count(dev, i); if (rc >= 0) { info.sset_mask |= (1ULL << i); info_buf[idx++] = rc; @@ -527,6 +910,9 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; int ret; + if (!ops->set_rx_ntuple) + return -EOPNOTSUPP; + if (!(dev->features & NETIF_F_NTUPLE)) return -EINVAL; @@ -817,7 +1203,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = vmalloc(reglen); + regbuf = vzalloc(reglen); if (!regbuf) return -ENOMEM; @@ -891,6 +1277,20 @@ static int ethtool_nway_reset(struct net_device *dev) return dev->ethtool_ops->nway_reset(dev); } +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; @@ -1046,6 +1446,35 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_ringparam(dev, &ringparam); } +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + if (copy_to_user(useraddr, &channels, sizeof(channels))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels; + + if (!dev->ethtool_ops->set_channels) + return -EOPNOTSUPP; + + if (copy_from_user(&channels, useraddr, sizeof(channels))) + return -EFAULT; + + return dev->ethtool_ops->set_channels(dev, &channels); +} + static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; @@ -1077,6 +1506,12 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) { int err; + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + + if (data && !(dev->features & NETIF_F_ALL_CSUM)) + return -EINVAL; + if (!data && dev->ethtool_ops->set_tso) { err = dev->ethtool_ops->set_tso(dev, 0); if (err) @@ -1091,143 +1526,55 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) return dev->ethtool_ops->set_sg(dev, data); } -static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) { - struct ethtool_value edata; int err; if (!dev->ethtool_ops->set_tx_csum) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (!edata.data && dev->ethtool_ops->set_sg) { + if (!data && dev->ethtool_ops->set_sg) { err = __ethtool_set_sg(dev, 0); if (err) return err; } - return dev->ethtool_ops->set_tx_csum(dev, edata.data); + return dev->ethtool_ops->set_tx_csum(dev, data); } -EXPORT_SYMBOL(ethtool_op_set_tx_csum); -static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_rx_csum) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (!edata.data && dev->ethtool_ops->set_sg) + if (!data) dev->features &= ~NETIF_F_GRO; - return dev->ethtool_ops->set_rx_csum(dev, edata.data); -} - -static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (!dev->ethtool_ops->set_sg) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data && - !(dev->features & NETIF_F_ALL_CSUM)) - return -EINVAL; - - return __ethtool_set_sg(dev, edata.data); + return dev->ethtool_ops->set_rx_csum(dev, data); } -static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_tso(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_tso) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data && !(dev->features & NETIF_F_SG)) + if (data && !(dev->features & NETIF_F_SG)) return -EINVAL; - return dev->ethtool_ops->set_tso(dev, edata.data); + return dev->ethtool_ops->set_tso(dev, data); } -static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_ufo(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_ufo) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - if (edata.data && !(dev->features & NETIF_F_SG)) + if (data && !(dev->features & NETIF_F_SG)) return -EINVAL; - if (edata.data && !(dev->features & NETIF_F_HW_CSUM)) + if (data && !((dev->features & NETIF_F_GEN_CSUM) || + (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) return -EINVAL; - return dev->ethtool_ops->set_ufo(dev, edata.data); -} - -static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GGSO }; - - edata.data = dev->features & NETIF_F_GSO; - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - if (edata.data) - dev->features |= NETIF_F_GSO; - else - dev->features &= ~NETIF_F_GSO; - return 0; -} - -static int ethtool_get_gro(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GGRO }; - - edata.data = dev->features & NETIF_F_GRO; - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_gro(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data) { - u32 rxcsum = dev->ethtool_ops->get_rx_csum ? - dev->ethtool_ops->get_rx_csum(dev) : - ethtool_op_get_rx_csum(dev); - - if (!rxcsum) - return -EINVAL; - dev->features |= NETIF_F_GRO; - } else - dev->features &= ~NETIF_F_GRO; - - return 0; + return dev->ethtool_ops->set_ufo(dev, data); } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) @@ -1271,17 +1618,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) { struct ethtool_gstrings gstrings; - const struct ethtool_ops *ops = dev->ethtool_ops; u8 *data; int ret; - if (!ops->get_strings || !ops->get_sset_count) - return -EOPNOTSUPP; - if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; - ret = ops->get_sset_count(dev, gstrings.string_set); + ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; @@ -1291,7 +1634,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) if (!data) return -ENOMEM; - ops->get_strings(dev, gstrings.string_set, data); + __ethtool_get_strings(dev, gstrings.string_set, data); ret = -EFAULT; if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) @@ -1301,7 +1644,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) goto out; ret = 0; - out: +out: kfree(data); return ret; } @@ -1309,14 +1652,60 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; + static bool busy; + int rc; - if (!dev->ethtool_ops->phys_id) + if (!dev->ethtool_ops->set_phys_id) return -EOPNOTSUPP; + if (busy) + return -EBUSY; + if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; - return dev->ethtool_ops->phys_id(dev, id.data); + rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + if (rc < 0) + return rc; + + /* Drop the RTNL lock while waiting, but prevent reentry or + * removal of the device. + */ + busy = true; + dev_hold(dev); + rtnl_unlock(); + + if (rc == 0) { + /* Driver will handle this itself */ + schedule_timeout_interruptible( + id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); + } else { + /* Driver expects to be called at twice the frequency in rc */ + int n = rc * 2, i, interval = HZ / n; + + /* Count down seconds */ + do { + /* Count down iterations per second */ + i = n; + do { + rtnl_lock(); + rc = dev->ethtool_ops->set_phys_id(dev, + (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(interval); + } while (!signal_pending(current) && --i != 0); + } while (!signal_pending(current) && + (id.data == 0 || --id.data != 0)); + } + + rtnl_lock(); + dev_put(dev); + busy = false; + + (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) @@ -1434,6 +1823,87 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, return dev->ethtool_ops->flash_device(dev, &efl); } +static int ethtool_set_dump(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_dump dump; + + if (!dev->ethtool_ops->set_dump) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + return dev->ethtool_ops->set_dump(dev, &dump); +} + +static int ethtool_get_dump_flag(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_dump dump; + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + ret = ops->get_dump_flag(dev, &dump); + if (ret) + return ret; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) + return -EFAULT; + return 0; +} + +static int ethtool_get_dump_data(struct net_device *dev, + void __user *useraddr) +{ + int ret; + __u32 len; + struct ethtool_dump dump, tmp; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data = NULL; + + if (!dev->ethtool_ops->get_dump_data || + !dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + memset(&tmp, 0, sizeof(tmp)); + tmp.cmd = ETHTOOL_GET_DUMP_FLAG; + ret = ops->get_dump_flag(dev, &tmp); + if (ret) + return ret; + + len = (tmp.len > dump.len) ? dump.len : tmp.len; + if (!len) + return -EFAULT; + + data = vzalloc(tmp.len); + if (!data) + return -ENOMEM; + ret = ops->get_dump_data(dev, &dump, data); + if (ret) + goto out; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) { + ret = -EFAULT; + goto out; + } + useraddr += offsetof(struct ethtool_dump, data); + if (copy_to_user(useraddr, data, len)) + ret = -EFAULT; +out: + vfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1442,7 +1912,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; - unsigned long old_features; + u32 old_features; if (!dev || !netif_device_present(dev)) return -ENODEV; @@ -1484,6 +1954,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GFEATURES: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1528,8 +1999,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_link); + rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); @@ -1555,42 +2025,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; - case ETHTOOL_GRXCSUM: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_rx_csum ? - dev->ethtool_ops->get_rx_csum : - ethtool_op_get_rx_csum)); - break; - case ETHTOOL_SRXCSUM: - rc = ethtool_set_rx_csum(dev, useraddr); - break; - case ETHTOOL_GTXCSUM: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_tx_csum ? - dev->ethtool_ops->get_tx_csum : - ethtool_op_get_tx_csum)); - break; - case ETHTOOL_STXCSUM: - rc = ethtool_set_tx_csum(dev, useraddr); - break; - case ETHTOOL_GSG: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_sg ? - dev->ethtool_ops->get_sg : - ethtool_op_get_sg)); - break; - case ETHTOOL_SSG: - rc = ethtool_set_sg(dev, useraddr); - break; - case ETHTOOL_GTSO: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_tso ? - dev->ethtool_ops->get_tso : - ethtool_op_get_tso)); - break; - case ETHTOOL_STSO: - rc = ethtool_set_tso(dev, useraddr); - break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; @@ -1606,21 +2040,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; - case ETHTOOL_GUFO: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_ufo ? - dev->ethtool_ops->get_ufo : - ethtool_op_get_ufo)); - break; - case ETHTOOL_SUFO: - rc = ethtool_set_ufo(dev, useraddr); - break; - case ETHTOOL_GGSO: - rc = ethtool_get_gso(dev, useraddr); - break; - case ETHTOOL_SGSO: - rc = ethtool_set_gso(dev, useraddr); - break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, (dev->ethtool_ops->get_flags ? @@ -1628,8 +2047,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) ethtool_op_get_flags)); break; case ETHTOOL_SFLAGS: - rc = ethtool_set_value(dev, useraddr, - dev->ethtool_ops->set_flags); + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); break; case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, @@ -1651,12 +2069,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; - case ETHTOOL_GGRO: - rc = ethtool_get_gro(dev, useraddr); - break; - case ETHTOOL_SGRO: - rc = ethtool_set_gro(dev, useraddr); - break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, useraddr); break; @@ -1678,6 +2090,45 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GFEATURES: + rc = ethtool_get_features(dev, useraddr); + break; + case ETHTOOL_SFEATURES: + rc = ethtool_set_features(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GTSO: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + rc = ethtool_get_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_STXCSUM: + case ETHTOOL_SRXCSUM: + case ETHTOOL_SSG: + case ETHTOOL_STSO: + case ETHTOOL_SUFO: + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + rc = ethtool_set_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + case ETHTOOL_SCHANNELS: + rc = ethtool_set_channels(dev, useraddr); + break; + case ETHTOOL_SET_DUMP: + rc = ethtool_set_dump(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_FLAG: + rc = ethtool_get_dump_flag(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_DATA: + rc = ethtool_get_dump_data(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bc3f253ba6c..3911586e12e4 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -181,14 +181,13 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->iifindex && (rule->iifindex != fl->iif) && - !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF)) + if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; - if (rule->oifindex && (rule->oifindex != fl->oif)) + if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; - if ((rule->mark ^ fl->mark) & rule->mark_mask) + if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; ret = ops->match(rule, fl, flags); @@ -351,12 +350,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { - rule->ctarget = r; + RCU_INIT_POINTER(rule->ctarget, r); break; } } - if (rule->ctarget == NULL) + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; } else if (rule->action == FR_ACT_GOTO) goto errout_free; @@ -373,6 +372,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) fib_rule_get(rule); + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if @@ -381,7 +385,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref) { - BUG_ON(r->ctarget != NULL); + BUG_ON(rtnl_dereference(r->ctarget) != NULL); rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; @@ -395,11 +399,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (unresolved) ops->unresolved_rules++; - if (last) - list_add_rcu(&rule->list, &last->list); - else - list_add_rcu(&rule->list, &ops->rules_list); - notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); flush_route_cache(ops); rules_ops_put(ops); @@ -487,7 +486,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) */ if (ops->nr_goto_rules > 0) { list_for_each_entry(tmp, &ops->rules_list, list) { - if (tmp->ctarget == rule) { + if (rtnl_dereference(tmp->ctarget) == rule) { rcu_assign_pointer(tmp->ctarget, NULL); ops->unresolved_rules++; } @@ -545,7 +544,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; - if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) + if (rule->action == FR_ACT_GOTO && + rcu_dereference_raw(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { @@ -590,7 +590,8 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, int idx = 0; struct fib_rule *rule; - list_for_each_entry(rule, &ops->rules_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; diff --git a/net/core/filter.c b/net/core/filter.c index 7adf50352918..0eb8c4466eaa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -37,9 +37,10 @@ #include <asm/uaccess.h> #include <asm/unaligned.h> #include <linux/filter.h> +#include <linux/reciprocal_div.h> /* No hurry in this branch */ -static void *__load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; @@ -48,21 +49,17 @@ static void *__load_pointer(struct sk_buff *skb, int k) else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr < skb_tail_pointer(skb)) + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } -static inline void *load_pointer(struct sk_buff *skb, int k, +static inline void *load_pointer(const struct sk_buff *skb, int k, unsigned int size, void *buffer) { if (k >= 0) return skb_header_pointer(skb, k, size, buffer); - else { - if (k >= SKF_AD_OFF) - return NULL; - return __load_pointer(skb, k); - } + return __load_pointer(skb, k, size); } /** @@ -86,14 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) if (err) return err; - rcu_read_lock_bh(); - filter = rcu_dereference_bh(sk->sk_filter); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } - rcu_read_unlock_bh(); + rcu_read_unlock(); return err; } @@ -102,49 +99,53 @@ EXPORT_SYMBOL(sk_filter); /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on - * @filter: filter to apply - * @flen: length of filter + * @fentry: filter to apply * * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. skb is the data we are - * filtering, filter is the array of filter instructions, and - * len is the number of filter blocks in the array. + * Return length to keep, 0 for none. @skb is the data we are + * filtering, @filter is the array of filter instructions. + * Because all jumps are guaranteed to be before last instruction, + * and last instruction guaranteed to be a RET, we dont need to check + * flen. (We used to pass to this function the length of filter) */ -unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +unsigned int sk_run_filter(const struct sk_buff *skb, + const struct sock_filter *fentry) { - struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ u32 tmp; int k; - int pc; /* * Process array of filter instructions. */ - for (pc = 0; pc < flen; pc++) { - fentry = &filter[pc]; + for (;; fentry++) { +#if defined(CONFIG_X86_32) +#define K (fentry->k) +#else + const u32 K = fentry->k; +#endif switch (fentry->code) { case BPF_S_ALU_ADD_X: A += X; continue; case BPF_S_ALU_ADD_K: - A += fentry->k; + A += K; continue; case BPF_S_ALU_SUB_X: A -= X; continue; case BPF_S_ALU_SUB_K: - A -= fentry->k; + A -= K; continue; case BPF_S_ALU_MUL_X: A *= X; continue; case BPF_S_ALU_MUL_K: - A *= fentry->k; + A *= K; continue; case BPF_S_ALU_DIV_X: if (X == 0) @@ -152,89 +153,89 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int A /= X; continue; case BPF_S_ALU_DIV_K: - A /= fentry->k; + A = reciprocal_divide(A, K); continue; case BPF_S_ALU_AND_X: A &= X; continue; case BPF_S_ALU_AND_K: - A &= fentry->k; + A &= K; continue; case BPF_S_ALU_OR_X: A |= X; continue; case BPF_S_ALU_OR_K: - A |= fentry->k; + A |= K; continue; case BPF_S_ALU_LSH_X: A <<= X; continue; case BPF_S_ALU_LSH_K: - A <<= fentry->k; + A <<= K; continue; case BPF_S_ALU_RSH_X: A >>= X; continue; case BPF_S_ALU_RSH_K: - A >>= fentry->k; + A >>= K; continue; case BPF_S_ALU_NEG: A = -A; continue; case BPF_S_JMP_JA: - pc += fentry->k; + fentry += K; continue; case BPF_S_JMP_JGT_K: - pc += (A > fentry->k) ? fentry->jt : fentry->jf; + fentry += (A > K) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGE_K: - pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + fentry += (A >= K) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JEQ_K: - pc += (A == fentry->k) ? fentry->jt : fentry->jf; + fentry += (A == K) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JSET_K: - pc += (A & fentry->k) ? fentry->jt : fentry->jf; + fentry += (A & K) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGT_X: - pc += (A > X) ? fentry->jt : fentry->jf; + fentry += (A > X) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGE_X: - pc += (A >= X) ? fentry->jt : fentry->jf; + fentry += (A >= X) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JEQ_X: - pc += (A == X) ? fentry->jt : fentry->jf; + fentry += (A == X) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JSET_X: - pc += (A & X) ? fentry->jt : fentry->jf; + fentry += (A & X) ? fentry->jt : fentry->jf; continue; case BPF_S_LD_W_ABS: - k = fentry->k; + k = K; load_w: ptr = load_pointer(skb, k, 4, &tmp); if (ptr != NULL) { A = get_unaligned_be32(ptr); continue; } - break; + return 0; case BPF_S_LD_H_ABS: - k = fentry->k; + k = K; load_h: ptr = load_pointer(skb, k, 2, &tmp); if (ptr != NULL) { A = get_unaligned_be16(ptr); continue; } - break; + return 0; case BPF_S_LD_B_ABS: - k = fentry->k; + k = K; load_b: ptr = load_pointer(skb, k, 1, &tmp); if (ptr != NULL) { A = *(u8 *)ptr; continue; } - break; + return 0; case BPF_S_LD_W_LEN: A = skb->len; continue; @@ -242,32 +243,32 @@ load_b: X = skb->len; continue; case BPF_S_LD_W_IND: - k = X + fentry->k; + k = X + K; goto load_w; case BPF_S_LD_H_IND: - k = X + fentry->k; + k = X + K; goto load_h; case BPF_S_LD_B_IND: - k = X + fentry->k; + k = X + K; goto load_b; case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, fentry->k, 1, &tmp); + ptr = load_pointer(skb, K, 1, &tmp); if (ptr != NULL) { X = (*(u8 *)ptr & 0xf) << 2; continue; } return 0; case BPF_S_LD_IMM: - A = fentry->k; + A = K; continue; case BPF_S_LDX_IMM: - X = fentry->k; + X = K; continue; case BPF_S_LD_MEM: - A = mem[fentry->k]; + A = mem[K]; continue; case BPF_S_LDX_MEM: - X = mem[fentry->k]; + X = mem[K]; continue; case BPF_S_MISC_TAX: X = A; @@ -276,48 +277,44 @@ load_b: A = X; continue; case BPF_S_RET_K: - return fentry->k; + return K; case BPF_S_RET_A: return A; case BPF_S_ST: - mem[fentry->k] = A; + mem[K] = A; continue; case BPF_S_STX: - mem[fentry->k] = X; + mem[K] = X; continue; - default: - WARN_ON(1); - return 0; - } - - /* - * Handle ancillary data, which are impossible - * (or very difficult) to get parsing packet contents. - */ - switch (k-SKF_AD_OFF) { - case SKF_AD_PROTOCOL: + case BPF_S_ANC_PROTOCOL: A = ntohs(skb->protocol); continue; - case SKF_AD_PKTTYPE: + case BPF_S_ANC_PKTTYPE: A = skb->pkt_type; continue; - case SKF_AD_IFINDEX: + case BPF_S_ANC_IFINDEX: if (!skb->dev) return 0; A = skb->dev->ifindex; continue; - case SKF_AD_MARK: + case BPF_S_ANC_MARK: A = skb->mark; continue; - case SKF_AD_QUEUE: + case BPF_S_ANC_QUEUE: A = skb->queue_mapping; continue; - case SKF_AD_HATYPE: + case BPF_S_ANC_HATYPE: if (!skb->dev) return 0; A = skb->dev->type; continue; - case SKF_AD_NLATTR: { + case BPF_S_ANC_RXHASH: + A = skb->rxhash; + continue; + case BPF_S_ANC_CPU: + A = raw_smp_processor_id(); + continue; + case BPF_S_ANC_NLATTR: { struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -333,7 +330,7 @@ load_b: A = 0; continue; } - case SKF_AD_NLATTR_NEST: { + case BPF_S_ANC_NLATTR_NEST: { struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -353,6 +350,7 @@ load_b: continue; } default: + WARN_ON(1); return 0; } } @@ -361,6 +359,66 @@ load_b: } EXPORT_SYMBOL(sk_run_filter); +/* + * Security : + * A BPF program is able to use 16 cells of memory to store intermediate + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * As we dont want to clear mem[] array for each packet going through + * sk_run_filter(), we check that filter loaded by user never try to read + * a cell if not previously written, and we check all branches to be sure + * a malicious user doesn't try to abuse us. + */ +static int check_load_and_stores(struct sock_filter *filter, int flen) +{ + u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + int pc, ret = 0; + + BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + if (!masks) + return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); + + for (pc = 0; pc < flen; pc++) { + memvalid &= masks[pc]; + + switch (filter[pc].code) { + case BPF_S_ST: + case BPF_S_STX: + memvalid |= (1 << filter[pc].k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + if (!(memvalid & (1 << filter[pc].k))) { + ret = -EINVAL; + goto error; + } + break; + case BPF_S_JMP_JA: + /* a jump must set masks on target */ + masks[pc + 1 + filter[pc].k] &= memvalid; + memvalid = ~0; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* a jump must set masks on targets */ + masks[pc + 1 + filter[pc].jt] &= memvalid; + masks[pc + 1 + filter[pc].jf] &= memvalid; + memvalid = ~0; + break; + } + } +error: + kfree(masks); + return ret; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -377,7 +435,57 @@ EXPORT_SYMBOL(sk_run_filter); */ int sk_chk_filter(struct sock_filter *filter, int flen) { - struct sock_filter *ftest; + /* + * Valid instructions are initialized to non-0. + * Invalid instructions are initialized to 0. + */ + static const u8 codes[] = { + [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, + [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, + [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, + [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, + [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, + [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, + [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, + [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, + [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, + [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, + [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, + [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, + [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, + [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, + [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, + [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, + [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, + [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, + [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, + [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, + [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, + [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, + [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, + [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, + [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, + [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, + [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, + [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, + [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, + [BPF_RET|BPF_K] = BPF_S_RET_K, + [BPF_RET|BPF_A] = BPF_S_RET_A, + [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, + [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, + [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, + [BPF_ST] = BPF_S_ST, + [BPF_STX] = BPF_S_STX, + [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, + [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, + [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, + [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, + [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, + [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, + [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, + [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, + [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, + }; int pc; if (flen == 0 || flen > BPF_MAXINSNS) @@ -385,136 +493,31 @@ int sk_chk_filter(struct sock_filter *filter, int flen) /* check the filter code now */ for (pc = 0; pc < flen; pc++) { - ftest = &filter[pc]; - - /* Only allow valid instructions */ - switch (ftest->code) { - case BPF_ALU|BPF_ADD|BPF_K: - ftest->code = BPF_S_ALU_ADD_K; - break; - case BPF_ALU|BPF_ADD|BPF_X: - ftest->code = BPF_S_ALU_ADD_X; - break; - case BPF_ALU|BPF_SUB|BPF_K: - ftest->code = BPF_S_ALU_SUB_K; - break; - case BPF_ALU|BPF_SUB|BPF_X: - ftest->code = BPF_S_ALU_SUB_X; - break; - case BPF_ALU|BPF_MUL|BPF_K: - ftest->code = BPF_S_ALU_MUL_K; - break; - case BPF_ALU|BPF_MUL|BPF_X: - ftest->code = BPF_S_ALU_MUL_X; - break; - case BPF_ALU|BPF_DIV|BPF_X: - ftest->code = BPF_S_ALU_DIV_X; - break; - case BPF_ALU|BPF_AND|BPF_K: - ftest->code = BPF_S_ALU_AND_K; - break; - case BPF_ALU|BPF_AND|BPF_X: - ftest->code = BPF_S_ALU_AND_X; - break; - case BPF_ALU|BPF_OR|BPF_K: - ftest->code = BPF_S_ALU_OR_K; - break; - case BPF_ALU|BPF_OR|BPF_X: - ftest->code = BPF_S_ALU_OR_X; - break; - case BPF_ALU|BPF_LSH|BPF_K: - ftest->code = BPF_S_ALU_LSH_K; - break; - case BPF_ALU|BPF_LSH|BPF_X: - ftest->code = BPF_S_ALU_LSH_X; - break; - case BPF_ALU|BPF_RSH|BPF_K: - ftest->code = BPF_S_ALU_RSH_K; - break; - case BPF_ALU|BPF_RSH|BPF_X: - ftest->code = BPF_S_ALU_RSH_X; - break; - case BPF_ALU|BPF_NEG: - ftest->code = BPF_S_ALU_NEG; - break; - case BPF_LD|BPF_W|BPF_ABS: - ftest->code = BPF_S_LD_W_ABS; - break; - case BPF_LD|BPF_H|BPF_ABS: - ftest->code = BPF_S_LD_H_ABS; - break; - case BPF_LD|BPF_B|BPF_ABS: - ftest->code = BPF_S_LD_B_ABS; - break; - case BPF_LD|BPF_W|BPF_LEN: - ftest->code = BPF_S_LD_W_LEN; - break; - case BPF_LD|BPF_W|BPF_IND: - ftest->code = BPF_S_LD_W_IND; - break; - case BPF_LD|BPF_H|BPF_IND: - ftest->code = BPF_S_LD_H_IND; - break; - case BPF_LD|BPF_B|BPF_IND: - ftest->code = BPF_S_LD_B_IND; - break; - case BPF_LD|BPF_IMM: - ftest->code = BPF_S_LD_IMM; - break; - case BPF_LDX|BPF_W|BPF_LEN: - ftest->code = BPF_S_LDX_W_LEN; - break; - case BPF_LDX|BPF_B|BPF_MSH: - ftest->code = BPF_S_LDX_B_MSH; - break; - case BPF_LDX|BPF_IMM: - ftest->code = BPF_S_LDX_IMM; - break; - case BPF_MISC|BPF_TAX: - ftest->code = BPF_S_MISC_TAX; - break; - case BPF_MISC|BPF_TXA: - ftest->code = BPF_S_MISC_TXA; - break; - case BPF_RET|BPF_K: - ftest->code = BPF_S_RET_K; - break; - case BPF_RET|BPF_A: - ftest->code = BPF_S_RET_A; - break; + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + if (code >= ARRAY_SIZE(codes)) + return -EINVAL; + code = codes[code]; + if (!code) + return -EINVAL; /* Some instructions need special checks */ - + switch (code) { + case BPF_S_ALU_DIV_K: /* check for division by zero */ - case BPF_ALU|BPF_DIV|BPF_K: if (ftest->k == 0) return -EINVAL; - ftest->code = BPF_S_ALU_DIV_K; + ftest->k = reciprocal_value(ftest->k); break; - - /* check for invalid memory addresses */ - case BPF_LD|BPF_MEM: - if (ftest->k >= BPF_MEMWORDS) - return -EINVAL; - ftest->code = BPF_S_LD_MEM; - break; - case BPF_LDX|BPF_MEM: - if (ftest->k >= BPF_MEMWORDS) - return -EINVAL; - ftest->code = BPF_S_LDX_MEM; - break; - case BPF_ST: - if (ftest->k >= BPF_MEMWORDS) - return -EINVAL; - ftest->code = BPF_S_ST; - break; - case BPF_STX: + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: + /* check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; - ftest->code = BPF_S_STX; break; - - case BPF_JMP|BPF_JA: + case BPF_S_JMP_JA: /* * Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, @@ -522,40 +525,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) */ if (ftest->k >= (unsigned)(flen-pc-1)) return -EINVAL; - ftest->code = BPF_S_JMP_JA; - break; - - case BPF_JMP|BPF_JEQ|BPF_K: - ftest->code = BPF_S_JMP_JEQ_K; - break; - case BPF_JMP|BPF_JEQ|BPF_X: - ftest->code = BPF_S_JMP_JEQ_X; - break; - case BPF_JMP|BPF_JGE|BPF_K: - ftest->code = BPF_S_JMP_JGE_K; - break; - case BPF_JMP|BPF_JGE|BPF_X: - ftest->code = BPF_S_JMP_JGE_X; - break; - case BPF_JMP|BPF_JGT|BPF_K: - ftest->code = BPF_S_JMP_JGT_K; break; - case BPF_JMP|BPF_JGT|BPF_X: - ftest->code = BPF_S_JMP_JGT_X; - break; - case BPF_JMP|BPF_JSET|BPF_K: - ftest->code = BPF_S_JMP_JSET_K; - break; - case BPF_JMP|BPF_JSET|BPF_X: - ftest->code = BPF_S_JMP_JSET_X; - break; - - default: - return -EINVAL; - } - - /* for conditionals both must be safe */ - switch (ftest->code) { case BPF_S_JMP_JEQ_K: case BPF_S_JMP_JEQ_X: case BPF_S_JMP_JGE_K: @@ -564,42 +534,55 @@ int sk_chk_filter(struct sock_filter *filter, int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_X: case BPF_S_JMP_JSET_K: + /* for conditionals both must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; + break; + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: +#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + code = BPF_S_ANC_##CODE; \ + break + switch (ftest->k) { + ANCILLARY(PROTOCOL); + ANCILLARY(PKTTYPE); + ANCILLARY(IFINDEX); + ANCILLARY(NLATTR); + ANCILLARY(NLATTR_NEST); + ANCILLARY(MARK); + ANCILLARY(QUEUE); + ANCILLARY(HATYPE); + ANCILLARY(RXHASH); + ANCILLARY(CPU); + } } + ftest->code = code; } /* last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_S_RET_K: case BPF_S_RET_A: - return 0; - break; - default: - return -EINVAL; - } + return check_load_and_stores(filter, flen); + } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release: Release a socket filter by rcu_head + * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_filter_release(fp); -} - -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) -{ - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); - call_rcu_bh(&fp->rcu, sk_filter_rcu_release); + bpf_jit_free(fp); + kfree(fp); } +EXPORT_SYMBOL(sk_filter_release_rcu); /** * sk_attach_filter - attach a socket filter @@ -631,6 +614,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) atomic_set(&fp->refcnt, 1); fp->len = fprog->len; + fp->bpf_func = sk_run_filter; err = sk_chk_filter(fp->insns, fp->len); if (err) { @@ -638,12 +622,14 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return err; } + bpf_jit_compile(fp); + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) - sk_filter_delayed_uncharge(sk, old_fp); + sk_filter_uncharge(sk, old_fp); return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -657,7 +643,7 @@ int sk_detach_filter(struct sock *sk) sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_delayed_uncharge(sk, filter); + sk_filter_uncharge(sk, filter); ret = 0; } return ret; diff --git a/net/core/flow.c b/net/core/flow.c index 127c8a7ffd61..990703b8863b 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -172,9 +172,9 @@ static void flow_new_hash_rnd(struct flow_cache *fc, static u32 flow_hash_code(struct flow_cache *fc, struct flow_cache_percpu *fcp, - struct flowi *key) + const struct flowi *key) { - u32 *k = (u32 *) key; + const u32 *k = (const u32 *) key; return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) & (flow_cache_hash_size(fc) - 1); @@ -186,17 +186,17 @@ typedef unsigned long flow_compare_t; * important assumptions that we can here, such as alignment and * constant size. */ -static int flow_key_compare(struct flowi *key1, struct flowi *key2) +static int flow_key_compare(const struct flowi *key1, const struct flowi *key2) { - flow_compare_t *k1, *k1_lim, *k2; + const flow_compare_t *k1, *k1_lim, *k2; const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t)); - k1 = (flow_compare_t *) key1; + k1 = (const flow_compare_t *) key1; k1_lim = k1 + n_elem; - k2 = (flow_compare_t *) key2; + k2 = (const flow_compare_t *) key2; do { if (*k1++ != *k2++) @@ -207,7 +207,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) } struct flow_cache_object * -flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, +flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver, void *ctx) { struct flow_cache *fc = &flow_cache_global; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 7c2373321b74..43b03dd71e85 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -249,13 +249,6 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, } EXPORT_SYMBOL(gen_new_estimator); -static void __gen_kill_estimator(struct rcu_head *head) -{ - struct gen_estimator *e = container_of(head, - struct gen_estimator, e_rcu); - kfree(e); -} - /** * gen_kill_estimator - remove a rate estimator * @bstats: basic statistics @@ -279,7 +272,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, write_unlock(&est_lock); list_del_rcu(&e->list); - call_rcu(&e->e_rcu, __gen_kill_estimator); + kfree_rcu(e, e_rcu); } spin_unlock_bh(&est_tree_lock); } diff --git a/net/core/iovec.c b/net/core/iovec.c index 72aceb1fe4fa..c40f27e7d208 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,10 +35,9 @@ * in any case. */ -long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, ct; - long err; + int size, ct, err; if (m->msg_namelen) { if (mode == VERIFY_READ) { @@ -62,14 +61,13 @@ long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, err = 0; for (ct = 0; ct < m->msg_iovlen; ct++) { - err += iov[ct].iov_len; - /* - * Goal is not to verify user data, but to prevent returning - * negative value, which is interpreted as errno. - * Overflow is still possible, but it is harmless. - */ - if (err < 0) - return -EMSGSIZE; + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; } return err; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 01a1101b5936..a7b342131869 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -129,7 +129,7 @@ static void linkwatch_schedule_work(int urgent) if (!cancel_delayed_work(&linkwatch_work)) return; - /* Otherwise we reschedule it again for immediate exection. */ + /* Otherwise we reschedule it again for immediate execution. */ schedule_delayed_work(&linkwatch_work, 0); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8cc8f9a79db9..799f06e03a22 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -41,7 +41,6 @@ #define NEIGH_PRINTK(x...) printk(x) #define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK0 NEIGH_PRINTK #define NEIGH_PRINTK1 NEIGH_NOPRINTK #define NEIGH_PRINTK2 NEIGH_NOPRINTK @@ -317,7 +316,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { size_t size = entries * sizeof(struct neighbour *); struct neigh_hash_table *ret; - struct neighbour **buckets; + struct neighbour __rcu **buckets; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) @@ -325,14 +324,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) if (size <= PAGE_SIZE) buckets = kzalloc(size, GFP_ATOMIC); else - buckets = (struct neighbour **) + buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); if (!buckets) { kfree(ret); return NULL; } - rcu_assign_pointer(ret->hash_buckets, buckets); + ret->hash_buckets = buckets; ret->hash_mask = entries - 1; get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; @@ -344,7 +343,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); - struct neighbour **buckets = nht->hash_buckets; + struct neighbour __rcu **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) kfree(buckets); @@ -1541,7 +1540,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->nht = neigh_hash_alloc(8); + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); @@ -1603,7 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b143173e3eb2..11b98bc2aa8f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -28,6 +28,7 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; +static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -99,7 +100,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec); NETDEVICE_SHOW(addr_len, fmt_dec); NETDEVICE_SHOW(iflink, fmt_dec); NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(features, fmt_long_hex); +NETDEVICE_SHOW(features, fmt_hex); NETDEVICE_SHOW(type, fmt_dec); NETDEVICE_SHOW(link_mode, fmt_dec); @@ -145,13 +146,10 @@ static ssize_t show_speed(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && - netdev->ethtool_ops && - netdev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; - - if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -166,13 +164,11 @@ static ssize_t show_duplex(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && - netdev->ethtool_ops && - netdev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; - - if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) - ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", + cmd.duplex ? "full" : "half"); } rtnl_unlock(); return ret; @@ -295,6 +291,20 @@ static ssize_t show_ifalias(struct device *dev, return ret; } +NETDEVICE_SHOW(group, fmt_dec); + +static int change_group(struct net_device *net, unsigned long new_group) +{ + dev_set_group(net, (int) new_group); + return 0; +} + +static ssize_t store_group(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_group); +} + static struct device_attribute net_class_attributes[] = { __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), @@ -316,6 +326,7 @@ static struct device_attribute net_class_attributes[] = { __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, store_tx_queue_len), + __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), {} }; @@ -550,13 +561,6 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, return len; } -static void rps_map_release(struct rcu_head *rcu) -{ - struct rps_map *map = container_of(rcu, struct rps_map, rcu); - - kfree(map); -} - static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, const char *buf, size_t len) @@ -598,12 +602,13 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } spin_lock(&rps_map_lock); - old_map = queue->rps_map; + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); rcu_assign_pointer(queue->rps_map, map); spin_unlock(&rps_map_lock); if (old_map) - call_rcu(&old_map->rcu, rps_map_release); + kfree_rcu(old_map, rcu); free_cpumask_var(mask); return len; @@ -677,7 +682,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, table = NULL; spin_lock(&rps_dev_flow_lock); - old_table = queue->rps_flow_table; + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); @@ -704,17 +710,24 @@ static struct attribute *rx_queue_default_attrs[] = { static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); - struct netdev_rx_queue *first = queue->first; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; - if (queue->rps_map) - call_rcu(&queue->rps_map->rcu, rps_map_release); - if (queue->rps_flow_table) - call_rcu(&queue->rps_flow_table->rcu, - rps_dev_flow_table_release); + map = rcu_dereference_raw(queue->rps_map); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + kfree_rcu(map, rcu); + } + + flow_table = rcu_dereference_raw(queue->rps_flow_table); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } - if (atomic_dec_and_test(&first->count)) - kfree(first); + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); } static struct kobj_type rx_queue_ktype = { @@ -726,7 +739,6 @@ static struct kobj_type rx_queue_ktype = { static int rx_queue_add_kobject(struct net_device *net, int index) { struct netdev_rx_queue *queue = net->_rx + index; - struct netdev_rx_queue *first = queue->first; struct kobject *kobj = &queue->kobj; int error = 0; @@ -739,14 +751,16 @@ static int rx_queue_add_kobject(struct net_device *net, int index) } kobject_uevent(kobj, KOBJ_ADD); - atomic_inc(&first->count); + dev_hold(queue->dev); return error; } +#endif /* CONFIG_RPS */ int net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { +#ifdef CONFIG_RPS int i; int error = 0; @@ -762,23 +776,408 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) kobject_put(&net->_rx[i].kobj); return error; +#else + return 0; +#endif +} + +#ifdef CONFIG_XPS +/* + * netdev_queue sysfs structures and functions. + */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) container_of(_attr, \ + struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + + +static ssize_t show_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + cpumask_var_t mask; + unsigned long index; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + for_each_possible_cpu(i) { + struct xps_map *map = + rcu_dereference(dev_maps->cpu_map[i]); + if (map) { + int j; + for (j = 0; j < map->len; j++) { + if (map->queues[j] == index) { + cpumask_set_cpu(i, mask); + break; + } + } + } + } + } + rcu_read_unlock(); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + free_cpumask_var(mask); + return -EINVAL; + } + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static ssize_t store_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + cpumask_var_t mask; + int err, i, cpu, pos, map_len, alloc_len, need_set; + unsigned long index; + struct xps_map *map, *new_map; + struct xps_dev_maps *dev_maps, *new_dev_maps; + int nonempty = 0; + int numa_node = -2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + new_dev_maps = kzalloc(max_t(unsigned, + XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); + if (!new_dev_maps) { + free_cpumask_var(mask); + return -ENOMEM; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + new_map = map; + if (map) { + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + map_len = map->len; + alloc_len = map->alloc_len; + } else + pos = map_len = alloc_len = 0; + + need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); +#ifdef CONFIG_NUMA + if (need_set) { + if (numa_node == -2) + numa_node = cpu_to_node(cpu); + else if (numa_node != cpu_to_node(cpu)) + numa_node = -1; + } +#endif + if (need_set && pos >= map_len) { + /* Need to add queue to this CPU's map */ + if (map_len >= alloc_len) { + alloc_len = alloc_len ? + 2 * alloc_len : XPS_MIN_MAP_ALLOC; + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + goto error; + new_map->alloc_len = alloc_len; + for (i = 0; i < map_len; i++) + new_map->queues[i] = map->queues[i]; + new_map->len = map_len; + } + new_map->queues[new_map->len++] = index; + } else if (!need_set && pos < map_len) { + /* Need to remove queue from this CPU's map */ + if (map_len > 1) + new_map->queues[pos] = + new_map->queues[--new_map->len]; + else + new_map = NULL; + } + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); + } + + /* Cleanup old maps */ + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) + kfree_rcu(map, rcu); + if (new_dev_maps->cpu_map[cpu]) + nonempty = 1; + } + + if (nonempty) + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + else { + kfree(new_dev_maps); + rcu_assign_pointer(dev->xps_maps, NULL); + } + + if (dev_maps) + kfree_rcu(dev_maps, rcu); + + netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : + NUMA_NO_NODE); + + mutex_unlock(&xps_map_mutex); + + free_cpumask_var(mask); + return len; + +error: + mutex_unlock(&xps_map_mutex); + + if (new_dev_maps) + for_each_possible_cpu(i) + kfree(rcu_dereference_protected( + new_dev_maps->cpu_map[i], + 1)); + kfree(new_dev_maps); + free_cpumask_var(mask); + return -ENOMEM; +} + +static struct netdev_queue_attribute xps_cpus_attribute = + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); + +static struct attribute *netdev_queue_default_attrs[] = { + &xps_cpus_attribute.attr, + NULL +}; + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + kfree_rcu(map, rcu); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + } + + mutex_unlock(&xps_map_mutex); + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type netdev_queue_ktype = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_attrs = netdev_queue_default_attrs, +}; + +static int netdev_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_queue *queue = net->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; } +#endif /* CONFIG_XPS */ -static int rx_queue_register_kobjects(struct net_device *net) +int +netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { +#ifdef CONFIG_XPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_tx[i].kobj); + + return error; +#else + return 0; +#endif +} + +static int register_queue_kobjects(struct net_device *net) +{ + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; + +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; - return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); +#endif + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + error = net_rx_queue_update_kobjects(net, 0, real_rx); + if (error) + goto error; + rxq = real_rx; + + error = netdev_queue_update_kobjects(net, 0, real_tx); + if (error) + goto error; + txq = real_tx; + + return 0; + +error: + netdev_queue_update_kobjects(net, txq, 0); + net_rx_queue_update_kobjects(net, rxq, 0); + return error; } -static void rx_queue_remove_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *net) { - net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0); + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + net_rx_queue_update_kobjects(net, real_rx, 0); + netdev_queue_update_kobjects(net, real_tx, 0); +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) kset_unregister(net->queues_kset); +#endif } -#endif /* CONFIG_RPS */ static const void *net_current_ns(void) { @@ -877,9 +1276,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); -#ifdef CONFIG_RPS - rx_queue_remove_kobjects(net); -#endif + remove_queue_kobjects(net); device_del(dev); } @@ -918,13 +1315,11 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; -#ifdef CONFIG_RPS - error = rx_queue_register_kobjects(net); + error = register_queue_kobjects(net); if (error) { device_del(dev); return error; } -#endif return error; } diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 778e1571548d..bd7751ec1c4d 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,8 +4,8 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); -#ifdef CONFIG_RPS int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); -#endif +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c988e685433a..2e2dce6583e1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -27,14 +27,6 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ -static void net_generic_release(struct rcu_head *rcu) -{ - struct net_generic *ng; - - ng = container_of(rcu, struct net_generic, rcu); - kfree(ng); -} - static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -42,7 +34,9 @@ static int net_assign_generic(struct net *net, int id, void *data) BUG_ON(!mutex_is_locked(&net_mutex)); BUG_ON(id == 0); - ng = old_ng = net->gen; + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; if (old_ng->len >= id) goto assign; @@ -66,7 +60,7 @@ static int net_assign_generic(struct net *net, int id, void *data) memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); - call_rcu(&old_ng->rcu, net_generic_release); + kfree_rcu(old_ng, rcu); assign: ng->ptr[id - 1] = data; return 0; @@ -214,11 +208,14 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } -static struct net *net_create(void) +struct net *copy_net_ns(unsigned long flags, struct net *old_net) { struct net *net; int rv; + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + net = net_alloc(); if (!net) return ERR_PTR(-ENOMEM); @@ -237,13 +234,6 @@ static struct net *net_create(void) return net; } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ - if (!(flags & CLONE_NEWNET)) - return get_net(old_net); - return net_create(); -} - static DEFINE_SPINLOCK(cleanup_list_lock); static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4e98ffac3af0..2d7d6d473781 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -35,7 +35,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 -#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) static struct sk_buff_head skb_pool; @@ -76,8 +75,7 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq) || + if (netif_tx_queue_frozen_or_stopped(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); @@ -195,6 +193,17 @@ void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); + if (dev->priv_flags & IFF_SLAVE) { + if (dev->npinfo) { + struct net_device *bond_dev = dev->master; + struct sk_buff *skb; + while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + skb->dev = bond_dev; + skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + } + } + } + service_arp_queue(dev->npinfo); zap_completion_queue(); @@ -315,9 +324,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_tx_queue_stopped(txq)) { - dev->priv_flags |= IFF_IN_NETPOLL; status = ops->ndo_start_xmit(skb, dev); - dev->priv_flags &= ~IFF_IN_NETPOLL; if (status == NETDEV_TX_OK) txq_trans_update(txq); } @@ -532,7 +539,7 @@ int __netpoll_rx(struct sk_buff *skb) { int proto, len, ulen; int hits = 0; - struct iphdr *iph; + const struct iphdr *iph; struct udphdr *uh; struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; @@ -691,32 +698,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if (*cur != 0) { /* MAC address */ - if ((delim = strchr(cur, ':')) == NULL) - goto parse_failed; - *delim = 0; - np->remote_mac[0] = simple_strtol(cur, NULL, 16); - cur = delim + 1; - if ((delim = strchr(cur, ':')) == NULL) + if (!mac_pton(cur, np->remote_mac)) goto parse_failed; - *delim = 0; - np->remote_mac[1] = simple_strtol(cur, NULL, 16); - cur = delim + 1; - if ((delim = strchr(cur, ':')) == NULL) - goto parse_failed; - *delim = 0; - np->remote_mac[2] = simple_strtol(cur, NULL, 16); - cur = delim + 1; - if ((delim = strchr(cur, ':')) == NULL) - goto parse_failed; - *delim = 0; - np->remote_mac[3] = simple_strtol(cur, NULL, 16); - cur = delim + 1; - if ((delim = strchr(cur, ':')) == NULL) - goto parse_failed; - *delim = 0; - np->remote_mac[4] = simple_strtol(cur, NULL, 16); - cur = delim + 1; - np->remote_mac[5] = simple_strtol(cur, NULL, 16); } netpoll_print_options(np); @@ -925,7 +908,7 @@ void __netpoll_cleanup(struct netpoll *np) skb_queue_purge(&npinfo->arp_tx); skb_queue_purge(&npinfo->txq); - cancel_rearming_delayed_work(&npinfo->tx_work); + cancel_delayed_work_sync(&npinfo->tx_work); /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2c0df0f95b3d..67870e9fd097 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -156,6 +156,7 @@ #include <linux/wait.h> #include <linux/etherdevice.h> #include <linux/kthread.h> +#include <linux/prefetch.h> #include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> @@ -251,6 +252,7 @@ struct pktgen_dev { int max_pkt_size; /* = ETH_ZLEN; */ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + struct page *page; u64 delay; /* nano-seconds */ __u64 count; /* Default No packets to send */ @@ -378,6 +380,7 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; + __u32 skb_priority; /* skb priority field */ int node; /* Memory node */ #ifdef CONFIG_XFRM @@ -394,6 +397,8 @@ struct pktgen_hdr { __be32 tv_usec; }; +static bool pktgen_exiting __read_mostly; + struct pktgen_thread { spinlock_t if_lock; /* for list of devices */ struct list_head if_list; /* All device here */ @@ -445,7 +450,6 @@ static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); static unsigned int scan_ip6(const char *s, char ip[16]); -static unsigned int fmt_ip6(char *s, const char ip[16]); /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; @@ -547,22 +551,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->queue_map_min, pkt_dev->queue_map_max); - if (pkt_dev->flags & F_IPV6) { - char b1[128], b2[128], b3[128]; - fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); - fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr); - fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr); - seq_printf(seq, - " saddr: %s min_saddr: %s max_saddr: %s\n", b1, - b2, b3); + if (pkt_dev->skb_priority) + seq_printf(seq, " skb_priority: %u\n", + pkt_dev->skb_priority); - fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); - fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr); - fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr); + if (pkt_dev->flags & F_IPV6) { seq_printf(seq, - " daddr: %s min_daddr: %s max_daddr: %s\n", b1, - b2, b3); - + " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n" + " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n", + &pkt_dev->in6_saddr, + &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr, + &pkt_dev->in6_daddr, + &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr); } else { seq_printf(seq, " dst_min: %s dst_max: %s\n", @@ -698,10 +698,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->cur_src_mac_offset); if (pkt_dev->flags & F_IPV6) { - char b1[128], b2[128]; - fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); - fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); - seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1); + seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n", + &pkt_dev->cur_in6_saddr, + &pkt_dev->cur_in6_daddr); } else seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", pkt_dev->cur_saddr, pkt_dev->cur_daddr); @@ -771,10 +770,10 @@ done: static unsigned long num_arg(const char __user * user_buffer, unsigned long maxlen, unsigned long *num) { - int i = 0; + int i; *num = 0; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -789,9 +788,9 @@ static unsigned long num_arg(const char __user * user_buffer, static int strn_len(const char __user * user_buffer, unsigned int maxlen) { - int i = 0; + int i; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -846,7 +845,7 @@ static ssize_t pktgen_if_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i = 0, max, len; + int i, max, len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; @@ -860,13 +859,13 @@ static ssize_t pktgen_if_write(struct file *file, return -EINVAL; } - max = count - i; - tmp = count_trail_chars(&user_buffer[i], max); + max = count; + tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { pr_warning("illegal format\n"); return tmp; } - i += tmp; + i = tmp; /* Read variable name */ @@ -887,10 +886,11 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - char tb[count + 1]; - if (copy_from_user(tb, user_buffer, count)) + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; - tb[count] = 0; + tb[copy] = 0; printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, (unsigned long)count, tb); } @@ -1126,6 +1126,10 @@ static ssize_t pktgen_if_write(struct file *file, if (node_possible(value)) { pkt_dev->node = value; sprintf(pg_result, "OK: node=%d", pkt_dev->node); + if (pkt_dev->page) { + put_page(pkt_dev->page); + pkt_dev->page = NULL; + } } else sprintf(pg_result, "ERROR: node not possible"); @@ -1296,7 +1300,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); - fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); @@ -1319,7 +1323,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); - fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr); @@ -1342,7 +1346,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); - fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); if (debug) printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf); @@ -1363,7 +1367,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); - fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); @@ -1417,11 +1421,6 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac")) { - char *v = valstr; - unsigned char old_dmac[ETH_ALEN]; - unsigned char *m = pkt_dev->dst_mac; - memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN); - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); if (len < 0) return len; @@ -1429,35 +1428,16 @@ static ssize_t pktgen_if_write(struct file *file, memset(valstr, 0, sizeof(valstr)); if (copy_from_user(valstr, &user_buffer[i], len)) return -EFAULT; - i += len; - - for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) { - int value; - - value = hex_to_bin(*v); - if (value >= 0) - *m = *m * 16 + value; - - if (*v == ':') { - m++; - *m = 0; - } - } + if (!mac_pton(valstr, pkt_dev->dst_mac)) + return -EINVAL; /* Set up Dest MAC */ - if (compare_ether_addr(old_dmac, pkt_dev->dst_mac)) - memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); - sprintf(pg_result, "OK: dstmac"); + sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); return count; } if (!strcmp(name, "src_mac")) { - char *v = valstr; - unsigned char old_smac[ETH_ALEN]; - unsigned char *m = pkt_dev->src_mac; - - memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN); - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); if (len < 0) return len; @@ -1465,26 +1445,13 @@ static ssize_t pktgen_if_write(struct file *file, memset(valstr, 0, sizeof(valstr)); if (copy_from_user(valstr, &user_buffer[i], len)) return -EFAULT; - i += len; - - for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) { - int value; - - value = hex_to_bin(*v); - if (value >= 0) - *m = *m * 16 + value; - - if (*v == ':') { - m++; - *m = 0; - } - } + if (!mac_pton(valstr, pkt_dev->src_mac)) + return -EINVAL; /* Set up Src MAC */ - if (compare_ether_addr(old_smac, pkt_dev->src_mac)) - memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); + memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); - sprintf(pg_result, "OK: srcmac"); + sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); return count; } @@ -1710,6 +1677,18 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "skb_priority")) { + len = num_arg(&user_buffer[i], 9, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->skb_priority = value; + sprintf(pg_result, "OK: skb_priority=%i", + pkt_dev->skb_priority); + return count; + } + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); return -EINVAL; } @@ -1764,7 +1743,7 @@ static ssize_t pktgen_thread_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i = 0, max, len, ret; + int i, max, len, ret; char name[40]; char *pg_result; @@ -1773,12 +1752,12 @@ static ssize_t pktgen_thread_write(struct file *file, return -EINVAL; } - max = count - i; - len = count_trail_chars(&user_buffer[i], max); + max = count; + len = count_trail_chars(user_buffer, max); if (len < 0) return len; - i += len; + i = len; /* Read variable name */ @@ -1975,7 +1954,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, const char *ifname) { char b[IFNAMSIZ+5]; - int i = 0; + int i; for (i = 0; ifname[i] != '@'; i++) { if (i == IFNAMSIZ) @@ -2489,7 +2468,6 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int err = 0; - struct iphdr *iph; if (!x) return 0; @@ -2499,7 +2477,6 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) return 0; spin_lock(&x->lock); - iph = ip_hdr(skb); err = x->outer_mode->output(x, skb); if (err) @@ -2519,8 +2496,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev) { if (pkt_dev->cflows) { /* let go of the SAs if we have them */ - int i = 0; - for (; i < pkt_dev->cflows; i++) { + int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); @@ -2585,6 +2562,72 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi, return htons(id | (cfi << 12) | (prio << 13)); } +static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, + int datalen) +{ + struct timeval timestamp; + struct pktgen_hdr *pgh; + + pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); + datalen -= sizeof(*pgh); + + if (pkt_dev->nfrags <= 0) { + memset(skb_put(skb, datalen), 0, datalen); + } else { + int frags = pkt_dev->nfrags; + int i, len; + int frag_len; + + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + len = datalen - frags * PAGE_SIZE; + if (len > 0) { + memset(skb_put(skb, len), 0, len); + datalen = frags * PAGE_SIZE; + } + + i = 0; + frag_len = (datalen/frags) < PAGE_SIZE ? + (datalen/frags) : PAGE_SIZE; + while (datalen > 0) { + if (unlikely(!pkt_dev->page)) { + int node = numa_node_id(); + + if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE)) + node = pkt_dev->node; + pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!pkt_dev->page) + break; + } + skb_shinfo(skb)->frags[i].page = pkt_dev->page; + get_page(pkt_dev->page); + skb_shinfo(skb)->frags[i].page_offset = 0; + /*last fragment, fill rest of data*/ + if (i == (frags - 1)) + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + else + skb_shinfo(skb)->frags[i].size = frag_len; + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, + * convert them to network byte order + */ + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); +} + static struct sk_buff *fill_packet_ipv4(struct net_device *odev, struct pktgen_dev *pkt_dev) { @@ -2593,7 +2636,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, struct udphdr *udph; int datalen, iplen; struct iphdr *iph; - struct pktgen_hdr *pgh = NULL; __be16 protocol = htons(ETH_P_IP); __be32 *mpls; __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ @@ -2611,8 +2653,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; datalen = (odev->hard_header_len + 16) & ~0xf; @@ -2640,6 +2682,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, sprintf(pkt_dev->result, "No memory"); return NULL; } + prefetchw(skb->data); skb_reserve(skb, datalen); @@ -2670,6 +2713,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + iph = ip_hdr(skb); udph = udp_hdr(skb); @@ -2706,76 +2751,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, pkt_dev->pkt_overhead); skb->dev = odev; skb->pkt_type = PACKET_HOST; - - if (pkt_dev->nfrags <= 0) { - pgh = (struct pktgen_hdr *)skb_put(skb, datalen); - memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr)); - } else { - int frags = pkt_dev->nfrags; - int i, len; - - pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); - - if (frags > MAX_SKB_FRAGS) - frags = MAX_SKB_FRAGS; - if (datalen > frags * PAGE_SIZE) { - len = datalen - frags * PAGE_SIZE; - memset(skb_put(skb, len), 0, len); - datalen = frags * PAGE_SIZE; - } - - i = 0; - while (datalen > 0) { - struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); - skb_shinfo(skb)->frags[i].page = page; - skb_shinfo(skb)->frags[i].page_offset = 0; - skb_shinfo(skb)->frags[i].size = - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); - datalen -= skb_shinfo(skb)->frags[i].size; - skb->len += skb_shinfo(skb)->frags[i].size; - skb->data_len += skb_shinfo(skb)->frags[i].size; - i++; - skb_shinfo(skb)->nr_frags = i; - } - - while (i < frags) { - int rem; - - if (i == 0) - break; - - rem = skb_shinfo(skb)->frags[i - 1].size / 2; - if (rem == 0) - break; - - skb_shinfo(skb)->frags[i - 1].size -= rem; - - skb_shinfo(skb)->frags[i] = - skb_shinfo(skb)->frags[i - 1]; - get_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->frags[i].page = - skb_shinfo(skb)->frags[i - 1].page; - skb_shinfo(skb)->frags[i].page_offset += - skb_shinfo(skb)->frags[i - 1].size; - skb_shinfo(skb)->frags[i].size = rem; - i++; - skb_shinfo(skb)->nr_frags = i; - } - } - - /* Stamp the time, and sequence number, - * convert them to network byte order - */ - if (pgh) { - struct timeval timestamp; - - pgh->pgh_magic = htonl(PKTGEN_MAGIC); - pgh->seq_num = htonl(pkt_dev->seq_num); - - do_gettimeofday(×tamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); - } + pktgen_finalize_skb(pkt_dev, skb, datalen); #ifdef CONFIG_XFRM if (!process_ipsec(pkt_dev, skb, protocol)) @@ -2876,79 +2852,6 @@ static unsigned int scan_ip6(const char *s, char ip[16]) return len; } -static char tohex(char hexdigit) -{ - return hexdigit > 9 ? hexdigit + 'a' - 10 : hexdigit + '0'; -} - -static int fmt_xlong(char *s, unsigned int i) -{ - char *bak = s; - *s = tohex((i >> 12) & 0xf); - if (s != bak || *s != '0') - ++s; - *s = tohex((i >> 8) & 0xf); - if (s != bak || *s != '0') - ++s; - *s = tohex((i >> 4) & 0xf); - if (s != bak || *s != '0') - ++s; - *s = tohex(i & 0xf); - return s - bak + 1; -} - -static unsigned int fmt_ip6(char *s, const char ip[16]) -{ - unsigned int len; - unsigned int i; - unsigned int temp; - unsigned int compressing; - int j; - - len = 0; - compressing = 0; - for (j = 0; j < 16; j += 2) { - -#ifdef V4MAPPEDPREFIX - if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) { - inet_ntoa_r(*(struct in_addr *)(ip + 12), s); - temp = strlen(s); - return len + temp; - } -#endif - temp = ((unsigned long)(unsigned char)ip[j] << 8) + - (unsigned long)(unsigned char)ip[j + 1]; - if (temp == 0) { - if (!compressing) { - compressing = 1; - if (j == 0) { - *s++ = ':'; - ++len; - } - } - } else { - if (compressing) { - compressing = 0; - *s++ = ':'; - ++len; - } - i = fmt_xlong(s, temp); - len += i; - s += i; - if (j < 14) { - *s++ = ':'; - ++len; - } - } - } - if (compressing) { - *s++ = ':'; - ++len; - } - *s = 0; - return len; -} - static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct pktgen_dev *pkt_dev) { @@ -2957,7 +2860,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct udphdr *udph; int datalen; struct ipv6hdr *iph; - struct pktgen_hdr *pgh = NULL; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ @@ -2975,8 +2877,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; skb = __netdev_alloc_skb(odev, pkt_dev->cur_pkt_size + 64 @@ -2985,6 +2887,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, sprintf(pkt_dev->result, "No memory"); return NULL; } + prefetchw(skb->data); skb_reserve(skb, 16); @@ -3015,6 +2918,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; iph = ipv6_hdr(skb); udph = udp_hdr(skb); @@ -3058,75 +2962,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; - if (pkt_dev->nfrags <= 0) - pgh = (struct pktgen_hdr *)skb_put(skb, datalen); - else { - int frags = pkt_dev->nfrags; - int i; - - pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); - - if (frags > MAX_SKB_FRAGS) - frags = MAX_SKB_FRAGS; - if (datalen > frags * PAGE_SIZE) { - skb_put(skb, datalen - frags * PAGE_SIZE); - datalen = frags * PAGE_SIZE; - } - - i = 0; - while (datalen > 0) { - struct page *page = alloc_pages(GFP_KERNEL, 0); - skb_shinfo(skb)->frags[i].page = page; - skb_shinfo(skb)->frags[i].page_offset = 0; - skb_shinfo(skb)->frags[i].size = - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); - datalen -= skb_shinfo(skb)->frags[i].size; - skb->len += skb_shinfo(skb)->frags[i].size; - skb->data_len += skb_shinfo(skb)->frags[i].size; - i++; - skb_shinfo(skb)->nr_frags = i; - } - - while (i < frags) { - int rem; - - if (i == 0) - break; - - rem = skb_shinfo(skb)->frags[i - 1].size / 2; - if (rem == 0) - break; - - skb_shinfo(skb)->frags[i - 1].size -= rem; - - skb_shinfo(skb)->frags[i] = - skb_shinfo(skb)->frags[i - 1]; - get_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->frags[i].page = - skb_shinfo(skb)->frags[i - 1].page; - skb_shinfo(skb)->frags[i].page_offset += - skb_shinfo(skb)->frags[i - 1].size; - skb_shinfo(skb)->frags[i].size = rem; - i++; - skb_shinfo(skb)->nr_frags = i; - } - } - - /* Stamp the time, and sequence number, - * convert them to network byte order - * should we update cloned packets too ? - */ - if (pgh) { - struct timeval timestamp; - - pgh->pgh_magic = htonl(PKTGEN_MAGIC); - pgh->seq_num = htonl(pkt_dev->seq_num); - - do_gettimeofday(×tamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); - } - /* pkt_dev->seq_num++; FF: you really mean this? */ + pktgen_finalize_skb(pkt_dev, skb, datalen); return skb; } @@ -3296,7 +3132,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pkt_dev->started_at); ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); - p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", (unsigned long long)ktime_to_us(elapsed), (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), (unsigned long long)ktime_to_us(idle), @@ -3430,11 +3266,6 @@ static void pktgen_rem_thread(struct pktgen_thread *t) remove_proc_entry(t->tsk->comm, pg_proc_dir); - mutex_lock(&pktgen_thread_lock); - - list_del(&t->th_list); - - mutex_unlock(&pktgen_thread_lock); } static void pktgen_resched(struct pktgen_dev *pkt_dev) @@ -3509,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) __netif_tx_lock_bh(txq); - if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) { + if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; @@ -3533,8 +3364,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) break; default: /* Drivers are not supposed to return other values! */ if (net_ratelimit()) - pr_info("pktgen: %s xmit error: %d\n", - pkt_dev->odevname, ret); + pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret); pkt_dev->errors++; /* fallthru */ case NETDEV_TX_LOCKED: @@ -3581,6 +3411,8 @@ static int pktgen_thread_worker(void *arg) pkt_dev = next_to_run(t); if (unlikely(!pkt_dev && t->control == 0)) { + if (pktgen_exiting) + break; wait_event_interruptible_timeout(t->queue, t->control != 0, HZ/10); @@ -3633,6 +3465,13 @@ static int pktgen_thread_worker(void *arg) pr_debug("%s removing thread\n", t->tsk->comm); pktgen_rem_thread(t); + /* Wait for kthread_stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; } @@ -3784,7 +3623,10 @@ static int __init pktgen_create_thread(int cpu) list_add_tail(&t->th_list, &pktgen_threads); init_completion(&t->start_done); - p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); if (IS_ERR(p)) { pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); @@ -3856,6 +3698,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); + if (pkt_dev->page) + put_page(pkt_dev->page); kfree(pkt_dev); return 0; } @@ -3907,6 +3751,7 @@ static void __exit pg_cleanup(void) struct list_head *q, *n; /* Stop all interfaces & threads */ + pktgen_exiting = true; list_for_each_safe(q, n, &pktgen_threads) { t = list_entry(q, struct pktgen_thread, th_list); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495aff7a..182236b2510a 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -33,6 +33,7 @@ * Note : Dont forget somaxconn that may limit backlog too. */ int sysctl_max_syn_backlog = 256; +EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) @@ -45,9 +46,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8121268ddbdd..d2ba2597c75a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -196,7 +196,7 @@ EXPORT_SYMBOL_GPL(__rtnl_register); * as failure of this function is very unlikely, it can only happen due * to lack of memory when allocating the chain to store all message * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continueing. + * of memory implies no sense in continuing. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit) @@ -347,16 +347,106 @@ static size_t rtnl_link_get_size(const struct net_device *dev) if (!ops) return 0; - size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ - nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ - size += nlmsg_total_size(sizeof(struct nlattr)) + + size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) - size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); + + return size; +} + +static LIST_HEAD(rtnl_af_ops); + +static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +{ + const struct rtnl_af_ops *ops; + + list_for_each_entry(ops, &rtnl_af_ops, list) { + if (ops->family == family) + return ops; + } + + return NULL; +} + +/** + * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * The caller must hold the rtnl_mutex. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_af_register(struct rtnl_af_ops *ops) +{ + list_add_tail(&ops->list, &rtnl_af_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_af_register); + +/** + * rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_af_register(struct rtnl_af_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_af_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_af_register); + +/** + * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_af_unregister); + +/** + * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + */ +void rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + __rtnl_af_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_af_unregister); + +static size_t rtnl_link_get_af_size(const struct net_device *dev) +{ + struct rtnl_af_ops *af_ops; + size_t size; + + /* IFLA_AF_SPEC */ + size = nla_total_size(sizeof(struct nlattr)); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_link_af_size) { + /* AF_* + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + af_ops->get_link_af_size(dev); + } + } return size; } @@ -670,7 +760,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ - + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -756,7 +847,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats; - struct nlattr *attr; + struct nlattr *attr, *af_spec; + struct rtnl_af_ops *af_ops; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) @@ -776,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, netif_running(dev) ? dev->operstate : IF_OPER_DOWN); NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U32(skb, IFLA_GROUP, dev->group); if (dev->ifindex != dev->iflink) NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); @@ -865,6 +958,36 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_link_af) { + struct nlattr *af; + int err; + + if (!(af = nla_nest_start(skb, af_ops->family))) + goto nla_put_failure; + + err = af_ops->fill_link_af(skb, dev); + + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, af_spec); + return nlmsg_end(skb, nlh); nla_put_failure: @@ -884,10 +1007,11 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_idx = cb->args[1]; + rcu_read_lock(); for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { if (idx < s_idx) goto cont; if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -900,6 +1024,7 @@ cont: } } out: + rcu_read_unlock(); cb->args[1] = idx; cb->args[0] = h; @@ -913,6 +1038,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, @@ -923,6 +1049,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, }; EXPORT_SYMBOL(ifla_policy); @@ -984,6 +1111,27 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem, err; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + return -EAFNOSUPPORT; + + if (!af_ops->set_link_af) + return -EOPNOTSUPP; + + if (af_ops->validate_link_af) { + err = af_ops->validate_link_af(dev, af); + if (err < 0) + return err; + } + } + } + return 0; } @@ -1033,6 +1181,41 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) return err; } +static int do_set_master(struct net_device *dev, int ifindex) +{ + struct net_device *master_dev; + const struct net_device_ops *ops; + int err; + + if (dev->master) { + if (dev->master->ifindex == ifindex) + return 0; + ops = dev->master->netdev_ops; + if (ops->ndo_del_slave) { + err = ops->ndo_del_slave(dev->master, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + + if (ifindex) { + master_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!master_dev) + return -EINVAL; + ops = master_dev->netdev_ops; + if (ops->ndo_add_slave) { + err = ops->ndo_add_slave(master_dev, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + return 0; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { @@ -1120,6 +1303,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, modified = 1; } + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + modified = 1; + } + /* * Interface selected by interface index but interface * name provided implies that a name change has been @@ -1151,6 +1339,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto errout; + modified = 1; + } + if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); @@ -1224,12 +1419,30 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; modified = 1; } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + BUG(); + + err = af_ops->set_link_af(dev, af); + if (err < 0) + goto errout; + + modified = 1; + } + } err = 0; errout: if (err < 0 && modified && net_ratelimit()) printk(KERN_WARNING "A link change request failed with " - "some changes comitted already. Interface %s may " + "some changes committed already. Interface %s may " "have been left with an inconsistent configuration, " "please check.\n", dev->name); @@ -1288,6 +1501,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; + LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1311,7 +1525,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (!ops) return -EOPNOTSUPP; - ops->dellink(dev, NULL); + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + list_del(&list_kill); return 0; } @@ -1359,12 +1575,6 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, dev->rtnl_link_state = RTNL_LINK_INITIALIZING; dev->real_num_tx_queues = real_num_queues; - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto err_free; - } - if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); if (tb[IFLA_ADDRESS]) @@ -1379,16 +1589,34 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (tb[IFLA_GROUP]) + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); return dev; -err_free: - free_netdev(dev); err: return ERR_PTR(err); } EXPORT_SYMBOL(rtnl_create_link); +static int rtnl_group_changelink(struct net *net, int group, + struct ifinfomsg *ifm, + struct nlattr **tb) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + if (dev->group == group) { + err = do_setlink(dev, ifm, tb, NULL, 0); + if (err < 0) + return err; + } + } + + return 0; +} + static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(skb->sk); @@ -1416,10 +1644,12 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; + else { + if (ifname[0]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; + } err = validate_linkmsg(dev, tb); if (err < 0) @@ -1483,8 +1713,13 @@ replay: return do_setlink(dev, ifm, tb, ifname, modified); } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, tb); return -ENODEV; + } if (ifm->ifi_index) return -EOPNOTSUPP; @@ -1509,6 +1744,9 @@ replay: snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); dest_net = rtnl_link_get_net(net, tb); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); if (IS_ERR(dev)) @@ -1638,7 +1876,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int min_len; int family; int type; - int err; type = nlh->nlmsg_type; if (type > RTM_MAX) @@ -1665,11 +1902,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (dumpit == NULL) return -EOPNOTSUPP; - __rtnl_unlock(); rtnl = net->rtnl; - err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); - rtnl_lock(); - return err; + return netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); } memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); @@ -1739,7 +1973,7 @@ static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + rtnetlink_rcv, NULL, THIS_MODULE); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/scm.c b/net/core/scm.c index 413cab89017d..4c1ef026d695 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -79,10 +79,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->max = SCM_MAX_FD; } fpp = &fpl->fp[fpl->count]; - if (fpl->count + num > SCM_MAX_FD) + if (fpl->count + num > fpl->max) return -EINVAL; /* @@ -94,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) int fd = fdp[i]; struct file *file; - if (fd < 0 || !(file = fget(fd))) + if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; *fpp++ = file; fpl->count++; @@ -331,11 +332,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (!fpl) return NULL; - new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + GFP_KERNEL); if (new_fpl) { - for (i=fpl->count-1; i>=0; i--) + for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); - memcpy(new_fpl, fpl, sizeof(*fpl)); + new_fpl->max = new_fpl->count; } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 104f8444754a..46cbd28f40f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -57,6 +57,7 @@ #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/errqueue.h> +#include <linux/prefetch.h> #include <net/protocol.h> #include <net/dst.h> @@ -210,6 +211,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); if (fclone) { struct sk_buff *child = skb + 1; @@ -380,6 +382,8 @@ static void skb_release_head_state(struct sk_buff *skb) } #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(skb->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER @@ -520,7 +524,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); new->priority = old->priority; - new->deliver_no_wcard = old->deliver_no_wcard; #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif @@ -778,6 +781,28 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, size = SKB_DATA_ALIGN(size); + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; + + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } + + if (fastpath && + size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { + memmove(skb->head + size, skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + memmove(skb->head + nhead, skb->head, + skb_tail_pointer(skb) - skb->head); + off = nhead; + goto adjust_others; + } + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); if (!data) goto nodata; @@ -791,17 +816,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); - /* Check if we can avoid taking references on fragments if we own - * the last reference on skb->head. (see skb_release_data()) - */ - if (!skb->cloned) - fastpath = true; - else { - int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; - - fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; - } - if (fastpath) { kfree(skb->head); } else { @@ -816,6 +830,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, off = (data + nhead) - skb->head; skb->head = data; +adjust_others: skb->data += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->end = size; @@ -1812,7 +1827,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) - csstart = skb->csum_start - skb_headroom(skb); + csstart = skb_checksum_start_offset(skb); else csstart = skb_headlen(skb); @@ -2253,7 +2268,7 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * - * Note 1: The size of each block of data returned can be arbitary, + * Note 1: The size of each block of data returned can be arbitrary, * this limitation is the cost for zerocopy seqeuental * reads of potentially non linear data. * @@ -2419,8 +2434,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, return -ENOMEM; /* initialize the next frag */ - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; skb_fill_page_desc(skb, frg_cnt, page, 0, 0); skb->truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); @@ -2440,7 +2453,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, return -EFAULT; /* copy was successful so update the size parameters */ - sk->sk_sndmsg_off += copy; frag->size += copy; skb->len += copy; skb->data_len += copy; @@ -2483,7 +2495,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; @@ -2493,7 +2505,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) unsigned int offset = doffset; unsigned int headroom; unsigned int len; - int sg = features & NETIF_F_SG; + int sg = !!(features & NETIF_F_SG); int nfrags = skb_shinfo(skb)->nr_frags; int err = -ENOMEM; int i = 0; @@ -2730,8 +2742,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) merge: if (offset > headlen) { - skbinfo->frags[0].page_offset += offset - headlen; - skbinfo->frags[0].size -= offset - headlen; + unsigned int eat = offset - headlen; + + skbinfo->frags[0].page_offset += eat; + skbinfo->frags[0].size -= eat; + skb->data_len -= eat; + skb->len -= eat; offset = headlen; } @@ -2978,6 +2994,9 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + /* before exiting rcu section, make sure dst is refcounted */ + skb_dst_force(skb); + skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb->len); diff --git a/net/core/sock.c b/net/core/sock.c index 11db43632df8..6e819780c232 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -157,7 +157,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , - "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { @@ -173,7 +173,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , - "slock-AF_IEEE802154", "slock-AF_CAIF" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { @@ -189,7 +189,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , - "clock-AF_IEEE802154", "clock-AF_CAIF" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_MAX" }; @@ -215,7 +215,7 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Maximal space eaten by iovec or ancilliary data plus some space */ +/* Maximal space eaten by iovec or ancillary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); @@ -992,23 +992,54 @@ static inline void sock_lock_init(struct sock *sk) /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif - BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != - sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) + - sizeof(osk->sk_tx_queue_mapping)); - memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, - osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } +/* + * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * un-modified. Special care is taken when initializing object to zero. + */ +static inline void sk_prot_clear_nulls(struct sock *sk, int size) +{ + if (offsetof(struct sock, sk_node.next) != 0) + memset(sk, 0, offsetof(struct sock, sk_node.next)); + memset(&sk->sk_node.pprev, 0, + size - offsetof(struct sock, sk_node.pprev)); +} + +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ + unsigned long nulls1, nulls2; + + nulls1 = offsetof(struct sock, __sk_common.skc_node.next); + nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); + if (nulls1 > nulls2) + swap(nulls1, nulls2); + + if (nulls1 != 0) + memset((char *)sk, 0, nulls1); + memset((char *)sk + nulls1 + sizeof(void *), 0, + nulls2 - nulls1 - sizeof(void *)); + memset((char *)sk + nulls2 + sizeof(void *), 0, + size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); + static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -1021,19 +1052,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!sk) return sk; if (priority & __GFP_ZERO) { - /* - * caches using SLAB_DESTROY_BY_RCU should let - * sk_node.next un-modified. Special care is taken - * when initializing object to zero. - */ - if (offsetof(struct sock, sk_node.next) != 0) - memset(sk, 0, offsetof(struct sock, sk_node.next)); - memset(&sk->sk_node.pprev, 0, - prot->obj_size - offsetof(struct sock, - sk_node.pprev)); + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); } - } - else + } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { @@ -1151,7 +1175,7 @@ static void __sk_free(struct sock *sk) void sk_free(struct sock *sk) { /* - * We substract one from sk_wmem_alloc and can know if + * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ @@ -1161,10 +1185,10 @@ void sk_free(struct sock *sk) EXPORT_SYMBOL(sk_free); /* - * Last sock_put should drop referrence to sk->sk_net. It has already - * been dropped in sk_change_net. Taking referrence to stopping namespace + * Last sock_put should drop reference to sk->sk_net. It has already + * been dropped in sk_change_net. Taking reference to stopping namespace * is not an option. - * Take referrence to a socket to remove it from hash _alive_ and after that + * Take reference to a socket to remove it from hash _alive_ and after that * destroy it in the context of init_net. */ void sk_release_kernel(struct sock *sk) @@ -1225,7 +1249,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = newsk->sk_filter; + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1653,10 +1677,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); - int allocated; + long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_add_return(amt, prot->memory_allocated); + allocated = atomic_long_add_return(amt, prot->memory_allocated); /* Under limit. */ if (allocated <= prot->sysctl_mem[0]) { @@ -1714,7 +1738,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, prot->memory_allocated); return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1727,12 +1751,12 @@ void __sk_mem_reclaim(struct sock *sk) { struct proto *prot = sk->sk_prot; - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, prot->memory_allocated); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (prot->memory_pressure && *prot->memory_pressure && - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -1884,7 +1908,7 @@ static void sock_def_readable(struct sock *sk, int len) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (wq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLIN | + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); @@ -2452,12 +2476,12 @@ static char proto_method_implemented(const void *method) static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", proto->max_header, proto->slab == NULL ? "no" : "yes", diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 01eee5d984be..a829e3f60aeb 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,7 +34,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rps_sock_flow_table; + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); @@ -121,6 +122,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_BPF_JIT + { + .procname = "bpf_jit_enable", + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .procname = "netdev_tstamp_prequeue", .data = &netdev_tstamp_prequeue, diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 0ae6c22da85b..7e7ca375d431 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -26,12 +26,12 @@ static struct sock_filter ptp_filter[] = { PTP_FILTER }; -static unsigned int classify(struct sk_buff *skb) +static unsigned int classify(const struct sk_buff *skb) { if (likely(skb->dev && skb->dev->phydev && skb->dev->phydev->drv)) - return sk_run_filter(skb, ptp_filter, ARRAY_SIZE(ptp_filter)); + return sk_run_filter(skb, ptp_filter); else return PTP_CLASS_NONE; } @@ -96,11 +96,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; - skb_push(skb, ETH_HLEN); + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); type = classify(skb); - skb_pull(skb, ETH_HLEN); + __skb_pull(skb, ETH_HLEN); switch (type) { case PTP_CLASS_V1_IPV4: diff --git a/net/core/utils.c b/net/core/utils.c index 5fea0ab21902..2012bc797f9c 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -296,3 +296,27 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, csum_unfold(*sum))); } EXPORT_SYMBOL(inet_proto_csum_replace4); + +int mac_pton(const char *s, u8 *mac) +{ + int i; + + /* XX:XX:XX:XX:XX:XX */ + if (strlen(s) < 3 * ETH_ALEN - 1) + return 0; + + /* Don't dirty result unless string is valid MAC. */ + for (i = 0; i < ETH_ALEN; i++) { + if (!strchr("0123456789abcdefABCDEF", s[i * 3])) + return 0; + if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1])) + return 0; + if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':') + return 0; + } + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]); + } + return 1; +} +EXPORT_SYMBOL(mac_pton); |