diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 3 | ||||
-rw-r--r-- | net/core/datagram.c | 9 | ||||
-rw-r--r-- | net/core/dev.c | 152 | ||||
-rw-r--r-- | net/core/devlink.c | 1768 | ||||
-rw-r--r-- | net/core/dst.c | 10 | ||||
-rw-r--r-- | net/core/dst_cache.c | 168 | ||||
-rw-r--r-- | net/core/ethtool.c | 645 | ||||
-rw-r--r-- | net/core/fib_rules.c | 4 | ||||
-rw-r--r-- | net/core/filter.c | 395 | ||||
-rw-r--r-- | net/core/flow.c | 14 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 58 | ||||
-rw-r--r-- | net/core/gen_estimator.c | 2 | ||||
-rw-r--r-- | net/core/gen_stats.c | 36 | ||||
-rw-r--r-- | net/core/hwbm.c | 87 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 37 | ||||
-rw-r--r-- | net/core/neighbour.c | 22 | ||||
-rw-r--r-- | net/core/net-procfs.c | 3 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 18 | ||||
-rw-r--r-- | net/core/netclassid_cgroup.c | 1 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 1 | ||||
-rw-r--r-- | net/core/pktgen.c | 5 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 400 | ||||
-rw-r--r-- | net/core/skbuff.c | 444 | ||||
-rw-r--r-- | net/core/sock.c | 166 | ||||
-rw-r--r-- | net/core/sock_diag.c | 3 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 9 |
26 files changed, 4098 insertions, 362 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de04de3..d6508c2ddca5 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,6 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_HWBM) += hwbm.o +obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/datagram.c b/net/core/datagram.c index fa9dc6450b08..b7de71f8d5d3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) { bool slow; if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + else if (likely(!atomic_dec_and_test(&skb->users))) { + sk_peek_offset_bwd(sk, len); return; + } slow = lock_sock_fast(sk); + sk_peek_offset_bwd(sk, len); skb_orphan(skb); sk_mem_reclaim_partial(sk); unlock_sock_fast(sk, slow); @@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) /* skb is now orphaned, can be freed outside of locked section */ __kfree_skb(skb); } -EXPORT_SYMBOL(skb_free_datagram_locked); +EXPORT_SYMBOL(__skb_free_datagram_locked); /** * skb_kill_datagram - Free a datagram skbuff forcibly diff --git a/net/core/dev.c b/net/core/dev.c index 0ef061b2badc..904ff431d570 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) +bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { unsigned int len; @@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) * taps currently in use. */ -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; struct sk_buff *skb2 = NULL; @@ -1907,6 +1907,7 @@ out_unlock: pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change @@ -2711,6 +2712,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); @@ -2802,7 +2816,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_CSUM_MASK; + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2825,14 +2839,45 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb, return vlan_features_check(skb, features); } +static netdev_features_t gso_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + u16 gso_segs = skb_shinfo(skb)->gso_segs; + + if (gso_segs > dev->gso_max_segs) + return features & ~NETIF_F_GSO_MASK; + + /* Support for GSO partial features requires software + * intervention before we can actually process the packets + * so we need to strip support for any partial features now + * and we can pull them back in after we have partially + * segmented the frame. + */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) + features &= ~dev->gso_partial_features; + + /* Make sure to clear the IPv4 ID mangling feature if the + * IPv4 header has the potential to be fragmented. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + struct iphdr *iph = skb->encapsulation ? + inner_ip_hdr(skb) : ip_hdr(skb); + + if (!(iph->frag_off & htons(IP_DF))) + features &= ~NETIF_F_TSO_MANGLEID; + } + + return features; +} + netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; - u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) - features &= ~NETIF_F_GSO_MASK; + if (skb_is_gso(skb)) + features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard @@ -2915,9 +2960,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device { netdev_features_t features; - if (skb->next) - return skb; - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2960,6 +3002,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device out_kfree_skb: kfree_skb(skb); out_null: + atomic_long_inc(&dev->tx_dropped); return NULL; } @@ -3143,12 +3186,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: qdisc_qstats_cpu_drop(cl->q); *ret = NET_XMIT_DROP; - goto drop; + kfree_skb(skb); + return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: *ret = NET_XMIT_SUCCESS; -drop: - kfree_skb(skb); + consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ @@ -3349,7 +3392,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb = validate_xmit_skb(skb, dev); if (!skb) - goto drop; + goto out; HARD_TX_LOCK(dev, txq, cpu); @@ -3376,7 +3419,6 @@ recursion_alert: } rc = -ENETDOWN; -drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); @@ -3428,6 +3470,7 @@ u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -3829,8 +3872,14 @@ static void net_tx_action(struct softirq_action *h) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); } + + __kfree_skb_flush(); } if (sd->output_queue) { @@ -3908,9 +3957,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: qdisc_qstats_cpu_drop(cl->q); + kfree_skb(skb); + return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: - kfree_skb(skb); + consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -4154,7 +4205,10 @@ ncls: ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); + if (!deliver_exact) + atomic_long_inc(&skb->dev->rx_dropped); + else + atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -4429,7 +4483,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->is_fou = 0; + NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ @@ -4654,6 +4710,8 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } @@ -4928,8 +4986,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) netpoll_poll_unlock(have); } if (rc > 0) - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + __NET_ADD_STATS(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); local_bh_enable(); if (rc == LL_FLUSH_FAILED) @@ -5152,6 +5210,7 @@ static void net_rx_action(struct softirq_action *h) } } + __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -6435,6 +6494,7 @@ EXPORT_SYMBOL(dev_get_phys_port_id); * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name + * @len: limit of bytes to copy to name * * Get device physical port name */ @@ -6664,6 +6724,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_TSO6; } + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -6692,6 +6756,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + /* GSO partial features require GSO partial be set */ + if ((features & dev->gso_partial_features) && + !(features & NETIF_F_GSO_PARTIAL)) { + netdev_dbg(dev, + "Dropping partially supported GSO features since no GSO partial.\n"); + features &= ~dev->gso_partial_features; + } + #ifdef CONFIG_NET_RX_BUSY_POLL if (dev->netdev_ops->ndo_busy_poll) features |= NETIF_F_BUSY_POLL; @@ -6962,9 +7034,22 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - if (!(dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; - } + + /* If IPv4 TCP segmentation offload is supported we should also + * allow the device to enable segmenting the frame with the option + * of ignoring a static IP ID value. This doesn't enable the + * feature itself but allows the user to enable it later. + */ + if (dev->hw_features & NETIF_F_TSO) + dev->hw_features |= NETIF_F_TSO_MANGLEID; + if (dev->vlan_features & NETIF_F_TSO) + dev->vlan_features |= NETIF_F_TSO_MANGLEID; + if (dev->mpls_features & NETIF_F_TSO) + dev->mpls_features |= NETIF_F_TSO_MANGLEID; + if (dev->hw_enc_features & NETIF_F_TSO) + dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ @@ -6972,7 +7057,7 @@ int register_netdevice(struct net_device *dev) /* Make NETIF_F_SG inheritable to tunnel devices. */ - dev->hw_enc_features |= NETIF_F_SG; + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ @@ -7253,24 +7338,31 @@ void netdev_run_todo(void) } } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*stats64)); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + sizeof(*netdev_stats), 0, + sizeof(*stats64) - sizeof(*netdev_stats)); #else - size_t i, n = sizeof(*stats64) / sizeof(u64); + size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); @@ -7300,6 +7392,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -7407,7 +7500,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); diff --git a/net/core/devlink.c b/net/core/devlink.c new file mode 100644 index 000000000000..933e8d4d3968 --- /dev/null +++ b/net/core/devlink.c @@ -0,0 +1,1768 @@ +/* + * net/core/devlink.c - Network physical/parent device Netlink interface + * + * Heavily inspired by net/wireless/ + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <rdma/ib_verbs.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/devlink.h> + +static LIST_HEAD(devlink_list); + +/* devlink_mutex + * + * An overall lock guarding every operation coming from userspace. + * It also guards devlink devices list and it is taken when + * driver registers/unregisters it. + */ +static DEFINE_MUTEX(devlink_mutex); + +/* devlink_port_mutex + * + * Shared lock to guard lists of ports in all devlink devices. + */ +static DEFINE_MUTEX(devlink_port_mutex); + +static struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} + +static void devlink_net_set(struct devlink *devlink, struct net *net) +{ + write_pnet(&devlink->_net, net); +} + +static struct devlink *devlink_get_from_attrs(struct net *net, + struct nlattr **attrs) +{ + struct devlink *devlink; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + list_for_each_entry(devlink, &devlink_list, list) { + if (strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0 && + net_eq(devlink_net(devlink), net)) + return devlink; + } + + return ERR_PTR(-ENODEV); +} + +static struct devlink *devlink_get_from_info(struct genl_info *info) +{ + return devlink_get_from_attrs(genl_info_net(info), info->attrs); +} + +static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + int port_index) +{ + struct devlink_port *devlink_port; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (devlink_port->index == port_index) + return devlink_port; + } + return NULL; +} + +static bool devlink_port_index_exists(struct devlink *devlink, int port_index) +{ + return devlink_port_get_by_index(devlink, port_index); +} + +static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return ERR_PTR(-ENODEV); + return devlink_port; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_port_get_from_attrs(devlink, info->attrs); +} + +struct devlink_sb { + struct list_head list; + unsigned int index; + u32 size; + u16 ingress_pools_count; + u16 egress_pools_count; + u16 ingress_tc_count; + u16 egress_tc_count; +}; + +static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) +{ + return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; +} + +static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, + unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (devlink_sb->index == sb_index) + return devlink_sb; + } + return NULL; +} + +static bool devlink_sb_index_exists(struct devlink *devlink, + unsigned int sb_index) +{ + return devlink_sb_get_by_index(devlink, sb_index); +} + +static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_SB_INDEX]) { + u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + if (!devlink_sb) + return ERR_PTR(-ENODEV); + return devlink_sb; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_sb_get_from_attrs(devlink, info->attrs); +} + +static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, + struct nlattr **attrs, + u16 *p_pool_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); + if (val >= devlink_sb_pool_count(devlink_sb)) + return -EINVAL; + *p_pool_index = val; + return 0; +} + +static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + u16 *p_pool_index) +{ + return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, + p_pool_index); +} + +static int +devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_pool_type *p_pool_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); + if (val != DEVLINK_SB_POOL_TYPE_INGRESS && + val != DEVLINK_SB_POOL_TYPE_EGRESS) + return -EINVAL; + *p_pool_type = val; + return 0; +} + +static int +devlink_sb_pool_type_get_from_info(struct genl_info *info, + enum devlink_sb_pool_type *p_pool_type) +{ + return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); +} + +static int +devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_threshold_type *p_th_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); + if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && + val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) + return -EINVAL; + *p_th_type = val; + return 0; +} + +static int +devlink_sb_th_type_get_from_info(struct genl_info *info, + enum devlink_sb_threshold_type *p_th_type) +{ + return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); +} + +static int +devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, + struct nlattr **attrs, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && + val >= devlink_sb->ingress_tc_count) + return -EINVAL; + if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && + val >= devlink_sb->egress_tc_count) + return -EINVAL; + *p_tc_index = val; + return 0; +} + +static int +devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, + pool_type, p_tc_index); +} + +#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) +#define DEVLINK_NL_FLAG_NEED_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_SB BIT(2) +#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) + /* port is not needed but we need to ensure they don't + * change in the middle of command + */ + +static int devlink_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_info(info); + if (IS_ERR(devlink)) { + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink); + } + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + info->user_ptr[0] = devlink; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + struct devlink_port *devlink_port; + + mutex_lock(&devlink_port_mutex); + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_port); + } + info->user_ptr[0] = devlink_port; + } + if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { + mutex_lock(&devlink_port_mutex); + } + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) { + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_sb); + } + info->user_ptr[1] = devlink_sb; + } + return 0; +} + +static void devlink_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || + ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); +} + +static struct genl_family devlink_nl_family = { + .id = GENL_ID_GENERATE, + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, +}; + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure; + if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && + nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, + devlink_port->desired_type)) + goto nla_put_failure; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net_device *netdev = devlink_port->type_dev; + + if (netdev && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + netdev->ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, + netdev->name))) + goto nla_put_failure; + } + if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { + struct ib_device *ibdev = devlink_port->type_dev; + + if (ibdev && + nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, + ibdev->name)) + goto nla_put_failure; + } + if (devlink_port->split && + nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + devlink_port->split_group)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + if (!devlink_port->registered) + return; + + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_PORT_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_port *devlink_port; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_port_type_set(struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_port_type port_type) + +{ + int err; + + if (devlink->ops && devlink->ops->port_type_set) { + if (port_type == DEVLINK_PORT_TYPE_NOTSET) + return -EINVAL; + err = devlink->ops->port_type_set(devlink_port, port_type); + if (err) + return err; + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; + } + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + int err; + + if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { + enum devlink_port_type port_type; + + port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); + err = devlink_port_type_set(devlink, devlink_port, port_type); + if (err) + return err; + } + return 0; +} + +static int devlink_port_split(struct devlink *devlink, + u32 port_index, u32 count) + +{ + if (devlink->ops && devlink->ops->port_split) + return devlink->ops->port_split(devlink, port_index, count); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + u32 count; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || + !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + return devlink_port_split(devlink, port_index, count); +} + +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) + +{ + if (devlink->ops && devlink->ops->port_unsplit) + return devlink->ops->port_unsplit(devlink, port_index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + return devlink_port_unsplit(devlink, port_index); +} + +static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, + devlink_sb->ingress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, + devlink_sb->egress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, + devlink_sb->ingress_tc_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, + devlink_sb->egress_tc_count)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + u16 pool_index, enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + struct devlink_sb_pool_info pool_info; + void *hdr; + int err; + + err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, + pool_index, &pool_info); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, + pool_info.threshold_type)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + u16 pool_index; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, + DEVLINK_CMD_SB_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + u16 pool_count = devlink_sb_pool_count(devlink_sb); + u16 pool_index; + int err; + + for (pool_index = 0; pool_index < pool_count; pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_pool_fill(msg, devlink, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_POOL_NEW, + portid, seq, NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + return 0; +} + +static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_pool_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, start, &idx, devlink, + devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type) + +{ + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_pool_set) + return ops->sb_pool_set(devlink, sb_index, pool_index, + size, threshold_type); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + enum devlink_sb_threshold_type threshold_type; + u16 pool_index; + u32 size; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + err = devlink_sb_th_type_get_from_info(info, &threshold_type); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]) + return -EINVAL; + + size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); + return devlink_sb_pool_set(devlink, devlink_sb->index, + pool_index, size, threshold_type); +} + +static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, + u16 pool_index, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_port_pool_get) { + u32 cur; + u32 max; + + err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &cur, &max); + if (err && err != -EOPNOTSUPP) + return err; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + u16 pool_index; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_port_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, + devlink_sb, pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + struct devlink_port *devlink_port; + u16 pool_count = devlink_sb_pool_count(devlink_sb); + u16 pool_index; + int err; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + for (pool_index = 0; pool_index < pool_count; pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_port_pool_fill(msg, devlink, + devlink_port, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_port_pool_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, start, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 threshold) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops && ops->sb_port_pool_set) + return ops->sb_port_pool_set(devlink_port, sb_index, + pool_index, threshold); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + u16 pool_index; + u32 threshold; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, + pool_index, threshold); +} + +static int +devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, u16 tc_index, + enum devlink_sb_pool_type pool_type, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u16 pool_index; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, + tc_index, pool_type, + &pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_tc_port_bind_get) { + u32 cur; + u32 max; + + err = ops->sb_occ_tc_port_bind_get(devlink_port, + devlink_sb->index, + tc_index, pool_type, + &cur, &max); + if (err && err != -EOPNOTSUPP) + return err; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + enum devlink_sb_pool_type pool_type; + u16 tc_index; + int err; + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, + devlink_sb, tc_index, pool_type, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + info->snd_portid, + info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, + int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + struct devlink_port *devlink_port; + u16 tc_index; + int err; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + for (tc_index = 0; + tc_index < devlink_sb->ingress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_INGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + for (tc_index = 0; + tc_index < devlink_sb->egress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_EGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int +devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, + devlink, + devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops && ops->sb_tc_pool_bind_set) + return ops->sb_tc_pool_bind_set(devlink_port, sb_index, + tc_index, pool_type, + pool_index, threshold); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + enum devlink_sb_pool_type pool_type; + u16 tc_index; + u16 pool_index; + u32 threshold; + int err; + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, + tc_index, pool_type, + pool_index, threshold); +} + +static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_occ_snapshot) + return ops->sb_occ_snapshot(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_occ_max_clear) + return ops->sb_occ_max_clear(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, +}; + +static const struct genl_ops devlink_nl_ops[] = { + { + .cmd = DEVLINK_CMD_GET, + .doit = devlink_nl_cmd_get_doit, + .dumpit = devlink_nl_cmd_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .doit = devlink_nl_cmd_port_get_doit, + .dumpit = devlink_nl_cmd_port_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_SET, + .doit = devlink_nl_cmd_port_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_SPLIT, + .doit = devlink_nl_cmd_port_split_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .doit = devlink_nl_cmd_port_unsplit_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .doit = devlink_nl_cmd_sb_get_doit, + .dumpit = devlink_nl_cmd_sb_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .doit = devlink_nl_cmd_sb_pool_get_doit, + .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_POOL_SET, + .doit = devlink_nl_cmd_sb_pool_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .doit = devlink_nl_cmd_sb_port_pool_get_doit, + .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, + .doit = devlink_nl_cmd_sb_port_pool_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, + .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, + .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, + .doit = devlink_nl_cmd_sb_occ_snapshot_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB | + DEVLINK_NL_FLAG_LOCK_PORTS, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, + .doit = devlink_nl_cmd_sb_occ_max_clear_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB | + DEVLINK_NL_FLAG_LOCK_PORTS, + }, +}; + +/** + * devlink_alloc - Allocate new devlink instance resources + * + * @ops: ops + * @priv_size: size of user private data + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +{ + struct devlink *devlink; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + devlink->ops = ops; + devlink_net_set(devlink, &init_net); + INIT_LIST_HEAD(&devlink->port_list); + INIT_LIST_HEAD(&devlink->sb_list); + return devlink; +} +EXPORT_SYMBOL_GPL(devlink_alloc); + +/** + * devlink_register - Register devlink instance + * + * @devlink: devlink + */ +int devlink_register(struct devlink *devlink, struct device *dev) +{ + mutex_lock(&devlink_mutex); + devlink->dev = dev; + list_add_tail(&devlink->list, &devlink_list); + devlink_notify(devlink, DEVLINK_CMD_NEW); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devlink_unregister - Unregister devlink instance + * + * @devlink: devlink + */ +void devlink_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink_notify(devlink, DEVLINK_CMD_DEL); + list_del(&devlink->list); + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +/** + * devlink_port_register - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + mutex_lock(&devlink_port_mutex); + if (devlink_port_index_exists(devlink, port_index)) { + mutex_unlock(&devlink_port_mutex); + return -EEXIST; + } + devlink_port->devlink = devlink; + devlink_port->index = port_index; + devlink_port->registered = true; + list_add_tail(&devlink_port->list, &devlink->port_list); + mutex_unlock(&devlink_port_mutex); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_register); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + mutex_lock(&devlink_port_mutex); + list_del(&devlink_port->list); + mutex_unlock(&devlink_port_mutex); +} +EXPORT_SYMBOL_GPL(devlink_port_unregister); + +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + devlink_port->type = type; + devlink_port->type_dev = type_dev; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_ETH, netdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); + +/** + * devlink_port_type_ib_set - Set port type to InfiniBand + * + * @devlink_port: devlink port + * @ibdev: related IB device + */ +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_IB, ibdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); + +/** + * devlink_port_type_clear - Clear port type + * + * @devlink_port: devlink port + */ +void devlink_port_type_clear(struct devlink_port *devlink_port) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_NOTSET, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_clear); + +/** + * devlink_port_split_set - Set port is split + * + * @devlink_port: devlink port + * @split_group: split group - identifies group split port is part of + */ +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ + devlink_port->split = true; + devlink_port->split_group = split_group; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_split_set); + +int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + struct devlink_sb *devlink_sb; + int err = 0; + + mutex_lock(&devlink_mutex); + if (devlink_sb_index_exists(devlink, sb_index)) { + err = -EEXIST; + goto unlock; + } + + devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); + if (!devlink_sb) { + err = -ENOMEM; + goto unlock; + } + devlink_sb->index = sb_index; + devlink_sb->size = size; + devlink_sb->ingress_pools_count = ingress_pools_count; + devlink_sb->egress_pools_count = egress_pools_count; + devlink_sb->ingress_tc_count = ingress_tc_count; + devlink_sb->egress_tc_count = egress_tc_count; + list_add_tail(&devlink_sb->list, &devlink->sb_list); +unlock: + mutex_unlock(&devlink_mutex); + return err; +} +EXPORT_SYMBOL_GPL(devlink_sb_register); + +void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + mutex_lock(&devlink_mutex); + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + WARN_ON(!devlink_sb); + list_del(&devlink_sb->list); + mutex_unlock(&devlink_mutex); + kfree(devlink_sb); +} +EXPORT_SYMBOL_GPL(devlink_sb_unregister); + +static int __init devlink_module_init(void) +{ + return genl_register_family_with_ops_groups(&devlink_nl_family, + devlink_nl_ops, + devlink_nl_mcgrps); +} + +static void __exit devlink_module_exit(void) +{ + genl_unregister_family(&devlink_nl_family); +} + +module_init(devlink_module_init); +module_exit(devlink_module_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Network physical device Netlink interface"); +MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME); diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8d72..b5cbbe07f786 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000000..554d36449231 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index daf04709dd3c..bdb4013581b1 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -79,12 +79,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -98,6 +102,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; static const char @@ -386,43 +391,464 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } -int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, + u32 legacy_u32) { + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + dst[0] = legacy_u32; +} +EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); + +/* return false if src had higher bits set. lower bits always updated. */ +bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) +{ + bool retval = true; + + /* TODO: following test will soon always be true */ + if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); + + bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_fill(ext, 32); + bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (bitmap_intersects(ext, src, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + /* src mask goes beyond bit 31 */ + retval = false; + } + } + *legacy_u32 = src[0]; + return retval; +} +EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); + +/* return false if legacy contained non-0 deprecated fields + * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +static bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->transceiver || + legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +/* return false if ksettings link modes had higher bits + * set. legacy_settings always updated (best effort) + */ +static bool +convert_link_ksettings_to_legacy_settings( + struct ethtool_cmd *legacy_settings, + const struct ethtool_link_ksettings *link_ksettings) +{ + bool retval = true; + + memset(legacy_settings, 0, sizeof(*legacy_settings)); + /* this also clears the deprecated fields in legacy structure: + * __u8 transceiver; + * __u32 maxtxpkt; + * __u32 maxrxpkt; + */ + + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->supported, + link_ksettings->link_modes.supported); + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->advertising, + link_ksettings->link_modes.advertising); + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->lp_advertising, + link_ksettings->link_modes.lp_advertising); + ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); + legacy_settings->duplex + = link_ksettings->base.duplex; + legacy_settings->port + = link_ksettings->base.port; + legacy_settings->phy_address + = link_ksettings->base.phy_address; + legacy_settings->autoneg + = link_ksettings->base.autoneg; + legacy_settings->mdio_support + = link_ksettings->base.mdio_support; + legacy_settings->eth_tp_mdix + = link_ksettings->base.eth_tp_mdix; + legacy_settings->eth_tp_mdix_ctrl + = link_ksettings->base.eth_tp_mdix_ctrl; + return retval; +} + +/* number of 32-bit words to store the user's link mode bitmaps */ +#define __ETHTOOL_LINK_MODE_MASK_NU32 \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + struct { + __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + } link_modes; +}; + +/* Internal kernel helper to query a device ethtool_link_settings. + * + * Backward compatibility note: for compatibility with legacy drivers + * that implement only the ethtool_cmd API, this has to work with both + * drivers implementing get_link_ksettings API and drivers + * implementing get_settings API. When drivers implement get_settings + * and report ethtool_cmd deprecated fields + * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored + * because the resulting struct ethtool_link_settings does not report them. + */ +int __ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + int err; + struct ethtool_cmd cmd; + ASSERT_RTNL(); + if (dev->ethtool_ops->get_link_ksettings) { + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, + link_ksettings); + } + + /* driver doesn't support %ethtool_link_ksettings API. revert to + * legacy %ethtool_cmd API, unless it's not supported either. + * TODO: remove when ethtool_ops::get_settings disappears internally + */ if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt + */ + convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); + return err; } -EXPORT_SYMBOL(__ethtool_get_settings); +EXPORT_SYMBOL(__ethtool_get_link_ksettings); -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +/* convert ethtool_link_usettings in user space to a kernel internal + * ethtool_link_ksettings. return 0 on success, errno on error. + */ +static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, + const void __user *from) { - int err; - struct ethtool_cmd cmd; + struct ethtool_link_usettings link_usettings; + + if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) + return -EFAULT; + + memcpy(&to->base, &link_usettings.base, sizeof(to->base)); + bitmap_from_u32array(to->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + + return 0; +} + +/* convert a kernel internal ethtool_link_ksettings to + * ethtool_link_usettings in user space. return 0 on success, errno on + * error. + */ +static int +store_link_ksettings_for_user(void __user *to, + const struct ethtool_link_ksettings *from) +{ + struct ethtool_link_usettings link_usettings; + + memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + bitmap_to_u32array(link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) + return -EFAULT; + + return 0; +} + +/* Query device for its ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::get_link_ksettings, even if legacy + * ethtool_ops::get_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_GSET for + * this driver, so that they can correctly access the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_get_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err = 0; + struct ethtool_link_ksettings link_ksettings; - err = __ethtool_get_settings(dev, &cmd); + ASSERT_RTNL(); + + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + /* handle bitmap nbits handshake */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) { + /* wrong link mode nbits requested */ + memset(&link_ksettings, 0, sizeof(link_ksettings)); + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + /* send back number of words required as negative val */ + compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, + "need too many bits for link modes!"); + link_ksettings.base.link_mode_masks_nwords + = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); + + /* copy the base fields back to user, not the link + * mode bitmaps + */ + if (copy_to_user(useraddr, &link_ksettings.base, + sizeof(link_ksettings.base))) + return -EFAULT; + + return 0; + } + + /* handshake successful: user/kernel agree on + * link_mode_masks_nwords + */ + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; + /* make sure we tell the right values to user */ + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + + return store_link_ksettings_for_user(useraddr, &link_ksettings); +} + +/* Update device ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::set_link_ksettings, even if legacy + * ethtool_ops::set_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_SSET for + * this driver, so that they can correctly update the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_set_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + /* make sure nbits field has expected value */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + /* copy the whole structure, now that we know it has expected + * format + */ + err = load_link_ksettings_from_user(&link_ksettings, useraddr); + if (err) + return err; + + /* re-check nwords field, just in case */ + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +static void +warn_incomplete_ethtool_legacy_settings_conversion(const char *details) +{ + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", + get_task_comm(name, current), details); +} + +/* Query device for its ethtool_cmd settings. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing get_link_ksettings + * API and drivers implementing get_settings API. When drivers + * implement get_link_ksettings and report higher link mode bits, a + * kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful + * (only the lower link mode bits reported back to user). + */ +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + ASSERT_RTNL(); + + if (dev->ethtool_ops->get_link_ksettings) { + /* First, use link_ksettings API if it is supported */ + int err; + struct ethtool_link_ksettings link_ksettings; + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, + &link_ksettings); + if (err < 0) + return err; + if (!convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings)) + warn_incomplete_ethtool_legacy_settings_conversion( + "link modes are only partially reported"); + + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; + } else { + /* driver doesn't support %ethtool_link_ksettings + * API. revert to legacy %ethtool_cmd API, unless it's + * not supported either. + */ + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + } + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; + return 0; } +/* Update device link settings with given ethtool_cmd. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing set_link_ksettings + * API and drivers implementing set_settings API. When drivers + * implement set_link_ksettings and user's request updates deprecated + * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, and the request is rejected. + */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; - if (!dev->ethtool_ops->set_settings) - return -EOPNOTSUPP; + ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; + /* first, try new %ethtool_link_ksettings API. */ + if (dev->ethtool_ops->set_link_ksettings) { + struct ethtool_link_ksettings link_ksettings; + + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, + &cmd)) + return -EINVAL; + + link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, + &link_ksettings); + } + + /* legacy %ethtool_cmd API */ + + /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings + * disappears internally + */ + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + return dev->ethtool_ops->set_settings(dev, &cmd); } @@ -632,7 +1058,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } -u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { @@ -642,6 +1068,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -738,6 +1195,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -897,6 +1362,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1227,14 +1700,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels; + struct ethtool_channels channels, max; + u32 max_rx_in_use = 0; - if (!dev->ethtool_ops->set_channels) + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + dev->ethtool_ops->get_channels(dev, &max); + + /* ensure new counts are within the maximums */ + if ((channels.rx_count > max.max_rx) || + (channels.tx_count > max.max_tx) || + (channels.combined_count > max.max_combined) || + (channels.other_count > max.max_other)) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -1823,13 +2313,121 @@ out: return ret; } +static int ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static int ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) + return -EFAULT; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; - u32 ethcmd; + u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; @@ -1839,8 +2437,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } /* Allow some commands to be done by anyone */ - switch (ethcmd) { + switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: @@ -2070,6 +2674,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr); + break; + case ETHTOOL_GLINKSETTINGS: + rc = ethtool_get_link_ksettings(dev, useraddr); + break; + case ETHTOOL_SLINKSETTINGS: + rc = ethtool_set_link_ksettings(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 365de66436ac..840acebbb80c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -549,7 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size(8); /* FRA_TUN_ID */ + + nla_total_size_64bit(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -607,7 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index bba502f7cd57..68adb5f52110 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -530,12 +530,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -992,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - bpf_prog_select_runtime(fp); + /* We are guaranteed to never error here with cBPF to eBPF + * transitions, since there's no issue with type compatibility + * checks on program arrays. + */ + fp = bpf_prog_select_runtime(fp, &err); kfree(old_prog); return fp; @@ -1164,7 +1170,7 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) } old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) @@ -1181,7 +1187,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) if (bpf_prog_size(prog->len) > sysctl_optmem_max) return -ENOMEM; - if (sk_unhashed(sk)) { + if (sk_unhashed(sk) && sk->sk_reuseport) { err = reuseport_alloc(sk); if (err) return err; @@ -1333,18 +1339,40 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_LDST_LEN 16U +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); + +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err; + + if (!skb_cloned(skb)) + return 0; + if (skb_clone_writable(skb, write_len)) + return 0; + err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (!err) + bpf_compute_data_end(skb); + return err; +} static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[BPF_LDST_LEN]; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1355,14 +1383,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + len))) + if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); + ptr = skb_header_pointer(skb, offset, len, sp->buff); if (unlikely(!ptr)) return -EFAULT; @@ -1371,17 +1397,19 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) memcpy(ptr, from, len); - if (ptr == buf) + if (ptr == sp->buff) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1400,25 +1428,28 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) - return -EFAULT; + if (unlikely((u32) offset > 0xffff)) + goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) - return -EFAULT; + goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; } -const struct bpf_func_proto bpf_skb_load_bytes_proto = { +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_PTR_TO_RAW_STACK, .arg4_type = ARG_CONST_STACK_SIZE, }; @@ -1432,9 +1463,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1442,6 +1471,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; @@ -1459,7 +1494,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1474,23 +1509,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1501,6 +1544,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, sizeof(sum)); @@ -1508,7 +1553,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1519,6 +1564,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1543,11 +1627,10 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) } skb2->dev = dev; - skb_sender_cpu_clear(skb2); return dev_queue_xmit(skb2); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1596,11 +1679,10 @@ int skb_do_redirect(struct sk_buff *skb) } skb->dev = dev; - skb_sender_cpu_clear(skb); return dev_queue_xmit(skb); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1622,14 +1704,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { -#ifdef CONFIG_IP_ROUTE_CLASSID - const struct dst_entry *dst; - - dst = skb_dst((struct sk_buff *) (unsigned long) r1); - if (dst) - return dst->tclassid; -#endif - return 0; + return dst_tclassid((struct sk_buff *) (unsigned long) r1); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1643,12 +1718,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; __be16 vlan_proto = (__force __be16) r2; + int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); - return skb_vlan_push(skb, vlan_proto, vlan_tci); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_push_proto = { @@ -1664,8 +1742,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; + int ret; - return skb_vlan_pop(skb); + ret = skb_vlan_pop(skb); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_pop_proto = { @@ -1682,6 +1763,13 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_vlan_pop) return true; + if (func == bpf_skb_store_bytes) + return true; + if (func == bpf_l3_csum_replace) + return true; + if (func == bpf_l4_csum_replace) + return true; + return false; } @@ -1696,23 +1784,34 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; + void *to_orig = to; + int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) - return -EINVAL; - if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) - return -EPROTO; + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + err = -EINVAL; + goto err_clear; + } + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { + err = -EPROTO; + goto err_clear; + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + err = -EINVAL; switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) - return -EINVAL; + goto err_clear; +set_compat: to = (struct bpf_tunnel_key *)compat; break; default: - return -EINVAL; + goto err_clear; } } @@ -1720,28 +1819,69 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - if (flags & BPF_F_TUNINFO_IPV6) + if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); - else + to->tunnel_label = be32_to_cpu(info->key.label); + } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) - memcpy((void *)(long) r2, to, size); + memcpy(to_orig, to, size); return 0; +err_clear: + memset(to_orig, 0, size); + return err; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_PTR_TO_RAW_STACK, .arg3_type = ARG_CONST_STACK_SIZE, .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + int err; + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + err = -ENOENT; + goto err_clear; + } + if (unlikely(size < info->options_len)) { + err = -ENOMEM; + goto err_clear; + } + + ip_tunnel_info_opts_get(to, info); + if (size > info->options_len) + memset(to + info->options_len, 0, size - info->options_len); + + return info->options_len; +err_clear: + memset(to, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_RAW_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1752,10 +1892,13 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1768,6 +1911,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) + return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1776,7 +1922,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1785,6 +1934,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); if (flags & BPF_F_ZERO_CSUM_TX) @@ -1794,7 +1945,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1804,17 +1955,53 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1851,6 +2038,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -1866,11 +2055,17 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; + case BPF_FUNC_perf_event_output: + return bpf_get_event_output_proto(); default: return sk_filter_func_proto(func_id); } @@ -1878,16 +2073,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; - - /* disallow misaligned access */ + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - - /* all __sk_buff fields are __u32 */ - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -1896,13 +2087,17 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - if (off == offsetof(struct __sk_buff, tc_classid)) + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, data): + case offsetof(struct __sk_buff, data_end): return false; + } if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): break; default: return false; @@ -1915,16 +2110,14 @@ static bool sk_filter_is_valid_access(int off, int size, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type) { - if (off == offsetof(struct __sk_buff, tc_classid)) - return type == BPF_WRITE ? true : false; - if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): + case offsetof(struct __sk_buff, tc_classid): break; default: return false; @@ -2041,8 +2234,24 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, tc_classid); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); - WARN_ON(type != BPF_WRITE); - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + dst_reg, src_reg, + offsetof(struct sk_buff, data)); + break; + + case offsetof(struct __sk_buff, data_end): + ctx_off -= offsetof(struct __sk_buff, data_end); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), + dst_reg, src_reg, ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2069,30 +2278,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, } static const struct bpf_verifier_ops sk_filter_ops = { - .get_func_proto = sk_filter_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { - .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, }; static int __init register_sk_filter_ops(void) @@ -2114,7 +2323,7 @@ int sk_detach_filter(struct sock *sk) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); @@ -2134,7 +2343,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (!filter) goto out; diff --git a/net/core/flow.c b/net/core/flow.c index 1033725be40b..3937b1b68d5b 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work) list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); spin_unlock_bh(&xfrm->flow_cache_gc_lock); - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { flow_entry_kill(fce, xfrm); + atomic_dec(&xfrm->flow_cache_gc_count); + WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); + } } static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, @@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, struct netns_xfrm *xfrm) { if (deleted) { + atomic_add(deleted, &xfrm->flow_cache_gc_count); fcp->hash_count -= deleted; spin_lock_bh(&xfrm->flow_cache_gc_lock); list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); @@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); + if (fcp->hash_count > 2 * fc->high_watermark || + atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { + atomic_inc(&net->xfrm.flow_cache_genid); + flo = ERR_PTR(-ENOBUFS); + goto ret_object; + } + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { fle->net = net; @@ -446,6 +457,7 @@ int flow_cache_init(struct net *net) INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); mutex_init(&net->xfrm.flow_flush_sem); + atomic_set(&net->xfrm.flow_cache_gc_count, 0); fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 12e700332010..a669dea146c6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,25 +19,12 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool dissector_uses_key(const struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) -{ - return flow_dissector->used_keys & (1 << key_id); -} - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } -static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id, - void *target_container) -{ - return ((char *) target_container) + flow_dissector->offset[key_id]; -} - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) @@ -178,15 +165,16 @@ ip: ip_proto = iph->protocol; - if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -219,13 +207,12 @@ ipv6: if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { - struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; - - key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target_container); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + memcpy(&key_addrs->v6addrs, &iph->saddr, + sizeof(key_addrs->v6addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -339,8 +326,11 @@ mpls: } case htons(ETH_P_FCOE): - key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); - /* fall through */ + if ((hlen - nhoff) < FCOE_HEADER_LEN) + goto out_bad; + + nhoff += FCOE_HEADER_LEN; + goto out_good; default: goto out_bad; } @@ -447,13 +437,12 @@ ip_proto_again: key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { - ip_proto = fh->nexthdr; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) goto ip_proto_again; - } } goto out_good; } @@ -740,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, { u32 poff = keys->control.thoff; + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 92d886f4adcb..4573d81093fe 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -191,6 +191,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats /** * gen_new_estimator - create a new rate estimator * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock * @opt: rate estimator configuration TLV @@ -287,6 +288,7 @@ EXPORT_SYMBOL(gen_kill_estimator); /** * gen_replace_estimator - replace rate estimator configuration * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock * @opt: rate estimator configuration TLV diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1e2f46a69d50..f96ee8b9478d 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -25,9 +25,9 @@ static inline int -gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) { - if (nla_put(d->skb, type, size, buf)) + if (nla_put_64bit(d->skb, type, size, buf, padattr)) goto nla_put_failure; return 0; @@ -59,7 +59,8 @@ nla_put_failure: */ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, - int xstats_type, spinlock_t *lock, struct gnet_dump *d) + int xstats_type, spinlock_t *lock, + struct gnet_dump *d, int padattr) __acquires(lock) { memset(d, 0, sizeof(*d)); @@ -71,16 +72,17 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; + d->padattr = padattr; if (d->tail) - return gnet_stats_copy(d, type, NULL, 0); + return gnet_stats_copy(d, type, NULL, 0, padattr); return 0; } EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** - * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * gnet_stats_start_copy - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @lock: statistics lock @@ -94,9 +96,9 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat); */ int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - struct gnet_dump *d) + struct gnet_dump *d, int padattr) { - return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); } EXPORT_SYMBOL(gnet_stats_start_copy); @@ -140,6 +142,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle + * @cpu: copy statistic per cpu * @b: basic statistics * * Appends the basic statistics to the top level TLV created by @@ -168,7 +171,8 @@ gnet_stats_copy_basic(struct gnet_dump *d, memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), + TCA_STATS_PAD); } return 0; } @@ -207,11 +211,13 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } if (d->tail) { - res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), + TCA_STATS_PAD); if (res < 0 || est.bps == r->bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), + TCA_STATS_PAD); } return 0; @@ -285,7 +291,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, if (d->tail) return gnet_stats_copy(d, TCA_STATS_QUEUE, - &qstats, sizeof(qstats)); + &qstats, sizeof(qstats), + TCA_STATS_PAD); return 0; } @@ -315,7 +322,8 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_APP, st, len); + return gnet_stats_copy(d, TCA_STATS_APP, st, len, + TCA_STATS_PAD); return 0; @@ -346,12 +354,12 @@ gnet_stats_finish_copy(struct gnet_dump *d) if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, - sizeof(d->tc_stats)) < 0) + sizeof(d->tc_stats), d->padattr) < 0) return -1; if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats, - d->xstats_len) < 0) + d->xstats_len, d->padattr) < 0) return -1; } diff --git a/net/core/hwbm.c b/net/core/hwbm.c new file mode 100644 index 000000000000..941c28486896 --- /dev/null +++ b/net/core/hwbm.c @@ -0,0 +1,87 @@ +/* Support for hardware buffer manager. + * + * Copyright (C) 2016 Marvell + * + * Gregory CLEMENT <gregory.clement@free-electrons.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/skbuff.h> +#include <net/hwbm.h> + +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) +{ + if (likely(bm_pool->frag_size <= PAGE_SIZE)) + skb_free_frag(buf); + else + kfree(buf); +} +EXPORT_SYMBOL_GPL(hwbm_buf_free); + +/* Refill processing for HW buffer management */ +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) +{ + int frag_size = bm_pool->frag_size; + void *buf; + + if (likely(frag_size <= PAGE_SIZE)) + buf = netdev_alloc_frag(frag_size); + else + buf = kmalloc(frag_size, gfp); + + if (!buf) + return -ENOMEM; + + if (bm_pool->construct) + if (bm_pool->construct(bm_pool, buf)) { + hwbm_buf_free(bm_pool, buf); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(hwbm_pool_refill); + +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +{ + int err, i; + unsigned long flags; + + spin_lock_irqsave(&bm_pool->lock, flags); + if (bm_pool->buf_num == bm_pool->size) { + pr_warn("pool already filled\n"); + return bm_pool->buf_num; + } + + if (buf_num + bm_pool->buf_num > bm_pool->size) { + pr_warn("cannot allocate %d buffers for pool\n", + buf_num); + return 0; + } + + if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { + pr_warn("Adding %d buffers to the %d current buffers will overflow\n", + buf_num, bm_pool->buf_num); + return 0; + } + + for (i = 0; i < buf_num; i++) { + err = hwbm_pool_refill(bm_pool, gfp); + if (err < 0) + break; + } + + /* Update BM driver with number of buffers added to pool */ + bm_pool->buf_num += i; + + pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); + spin_unlock_irqrestore(&bm_pool->lock, flags); + + return i; +} +EXPORT_SYMBOL_GPL(hwbm_pool_add); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 299cfc24d888..669ecc9f884e 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -27,6 +27,31 @@ #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; @@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + rcu_read_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + } + } +#endif if (likely(ops && ops->build_state)) ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f18ae91b652e..29dd8cc22bbf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1763,21 +1763,22 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, - NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || + NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, - NEIGH_VAR(parms, GC_STALETIME)) || + NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, - NEIGH_VAR(parms, DELAY_PROBE_TIME)) || + NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, - NEIGH_VAR(parms, RETRANS_TIME)) || + NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, - NEIGH_VAR(parms, ANYCAST_DELAY)) || + NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, - NEIGH_VAR(parms, PROXY_DELAY)) || + NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, - NEIGH_VAR(parms, LOCKTIME))) + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1804,7 +1805,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) || + nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) @@ -1856,7 +1857,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_table_fulls += st->table_fulls; } - if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) + if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, + NDTA_PAD)) goto nla_put_failure; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2bf83299600a..14d09345f00d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps, flow_limit_count); + 0, /* was cpu_collision */ + sd->received_rps, flow_limit_count); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b6c8a6629b39..2b3f76fe65f4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,7 +29,6 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; - switch (cmd.duplex) { + + switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; @@ -574,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] = { &dev_attr_rx_packets.attr, @@ -599,6 +602,7 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0260c84ed83c..11fce17274f6 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -9,7 +9,6 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f1efbc39ef6b..2ec86fc552df 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1474cfd2dc1c..8604ae245960 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); @@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); @@ -3472,7 +3472,6 @@ xmit_more: pkt_dev->odevname, ret); pkt_dev->errors++; /* fallthru */ - case NETDEV_TX_LOCKED: case NETDEV_TX_BUSY: /* Retry it next time */ atomic_dec(&(pkt_dev->skb->users)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8261d95dd846..d69c4644f8f2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,11 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; -} -static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) -{ - memcpy(v, b, sizeof(*b)); + a->rx_nohandler = b->rx_nohandler; } /* All VF info */ @@ -828,17 +825,17 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + /* IFLA_VF_STATS_RX_PACKETS */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -879,9 +876,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ - + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size(sizeof(struct rtnl_link_stats64)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -893,6 +890,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */ + + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -905,6 +904,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1049,25 +1049,23 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { - const struct rtnl_link_stats64 *stats; - struct rtnl_link_stats64 temp; + struct rtnl_link_stats64 *sp; struct nlattr *attr; - stats = dev_get_stats(dev, &temp); - - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); + attr = nla_reserve_64bit(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; - copy_rtnl_link_stats(nla_data(attr), stats); + sp = nla_data(attr); + dev_get_stats(dev, sp); - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; - copy_rtnl_link_stats64(nla_data(attr), stats); + copy_rtnl_link_stats(nla_data(attr), sp); return 0; } @@ -1155,18 +1153,18 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } - if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast) || - nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast)) + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD)) return -EMSGSIZE; nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); @@ -1175,15 +1173,17 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + + if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; @@ -1221,6 +1221,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1387,15 +1389,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, -}; - -static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { - [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, - [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, + [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, + [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1412,6 +1407,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +{ + const struct rtnl_link_ops *ops = NULL; + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -1421,6 +1468,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; + const struct rtnl_link_ops *kind_ops = NULL; + unsigned int flags = NLM_F_MULTI; + int master_idx = 0; int err; int hdrlen; @@ -1443,18 +1493,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (tb[IFLA_MASTER]) + master_idx = nla_get_u32(tb[IFLA_MASTER]); + + if (tb[IFLA_LINKINFO]) + kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, + flags, ext_filter_mask); /* If we ran out of room on the first message, * we're in trouble @@ -1534,6 +1595,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } +static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, + int guid_type) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); +} + +static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) +{ + if (dev->type != ARPHRD_INFINIBAND) + return -EOPNOTSUPP; + + return handle_infiniband_guid(dev, ivt, guid_type); +} + static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1636,6 +1713,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } + if (tb[IFLA_VF_IB_NODE_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); + } + + if (tb[IFLA_VF_IB_PORT_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); + } + return err; } @@ -3351,6 +3446,202 @@ out: return err; } +static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) +{ + return (mask & IFLA_STATS_FILTER_BIT(attrid)) && + (!idxattr || idxattr == attrid); +} + +static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, unsigned int filter_mask, + int *idxattr, int *prividx) +{ + struct if_stats_msg *ifsm; + struct nlmsghdr *nlh; + struct nlattr *attr; + int s_prividx = *prividx; + + ASSERT_RTNL(); + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); + if (!nlh) + return -EMSGSIZE; + + ifsm = nlmsg_data(nlh); + ifsm->ifindex = dev->ifindex; + ifsm->filter_mask = filter_mask; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { + struct rtnl_link_stats64 *sp; + + attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, + sizeof(struct rtnl_link_stats64), + IFLA_STATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + sp = nla_data(attr); + dev_get_stats(dev, sp); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + nlmsg_end(skb, nlh); + + return 0; + +nla_put_failure: + /* not a multi message or no progress mean a real error */ + if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) + nlmsg_cancel(skb, nlh); + else + nlmsg_end(skb, nlh); + + return -EMSGSIZE; +} + +static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, +}; + +static size_t if_nlmsg_stats_size(const struct net_device *dev, + u32 filter_mask) +{ + size_t size = 0; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) + size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->get_linkxstats_size) { + size += nla_total_size(ops->get_linkxstats_size(dev)); + /* for IFLA_STATS_LINK_XSTATS */ + size += nla_total_size(0); + } + } + + return size; +} + +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; + struct sk_buff *nskb; + u32 filter_mask; + int err; + + ifsm = nlmsg_data(nlh); + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + filter_mask = ifsm->filter_mask; + if (!filter_mask) + return -EINVAL; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); + if (!nskb) + return -ENOBUFS; + + err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, + NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + 0, filter_mask, &idxattr, &prividx); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else { + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + } + + return err; +} + +static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; + struct if_stats_msg *ifsm; + struct hlist_head *head; + struct net_device *dev; + u32 filter_mask = 0; + int idx = 0; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + s_idxattr = cb->args[2]; + s_prividx = cb->args[3]; + + cb->seq = net->dev_base_seq; + + ifsm = nlmsg_data(cb->nlh); + filter_mask = ifsm->filter_mask; + if (!filter_mask) + return -EINVAL; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, filter_mask, + &s_idxattr, &s_prividx); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err < 0) + goto out; + s_prividx = 0; + s_idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + cb->args[3] = s_prividx; + cb->args[2] = s_idxattr; + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -3500,4 +3791,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, + NULL); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8616d1147c93..f2b77e549c03 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -749,6 +757,73 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static inline void _kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* Zero budget indicate non-NAPI context called us, like netpoll */ + if (unlikely(!budget)) { + dev_consume_skb_any(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + _kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ @@ -1843,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, struct splice_pipe_desc *spd, struct sock *sk) { int seg; + struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -1869,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return true; } + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + return false; } @@ -1895,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, @@ -1916,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; - - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) - break; - } + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); -done: if (spd.nr_pages) ret = splice_cb(sk, pipe, &spd); @@ -3008,11 +3076,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); + unsigned int partial_segs = 0; unsigned int headroom; - unsigned int len; + unsigned int len = head_skb->len; __be16 proto; - bool csum; - int sg = !!(features & NETIF_F_SG); + bool csum, sg; int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; @@ -3024,8 +3092,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !head_skb->encap_hdr_csum && - !!can_checksum_protocol(features, proto); + sg = !!(features & NETIF_F_SG); + csum = !!can_checksum_protocol(features, proto); + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ + if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + partial_segs = len / mss; + if (partial_segs > 1) + mss *= partial_segs; + else + partial_segs = 0; + } headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3118,13 +3198,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg && !nskb->remcsum_offload) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); + if (!sg) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, len), + len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; continue; } @@ -3190,12 +3272,19 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum && !nskb->remcsum_offload) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; + if (!csum) { + if (skb_has_shared_frag(nskb)) { + err = __skb_linearize(nskb); + if (err) + goto err; + } + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); @@ -3205,6 +3294,23 @@ perform_csum_check: */ segs->prev = tail; + /* Update GSO info on first skb in partial sequence. */ + if (partial_segs) { + int type = skb_shinfo(head_skb)->gso_type; + + /* Update type to add partial and then remove dodgy if set */ + type |= SKB_GSO_PARTIAL; + type &= ~SKB_GSO_DODGY; + + /* Update GSO info and prepare to start updating headers on + * our way back down the stack of protocols. + */ + skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; + skb_shinfo(segs)->gso_segs = partial_segs; + skb_shinfo(segs)->gso_type = type; + SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + } + /* Following permits correct backpressure, for protocols * using skb_set_owner_w(). * Idea is to tranfert ownership from head_skb to last segment. @@ -4237,7 +4343,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb_sender_cpu_clear(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); @@ -4427,15 +4532,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) __skb_push(skb, offset); err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) + if (err) { + __skb_pull(skb, offset); return err; + } + skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; - __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; @@ -4519,3 +4625,239 @@ failure: return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + skb->head = data; + skb->data = data; + skb->head_frag = 0; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct sk_buff *skb, + struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + shinfo->frags[0].page_offset += off - pos; + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + if (k == 0) { + /* split line is in frag list */ + pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask); + } + skb_release_data(skb); + + skb->head = data; + skb->head_frag = 0; + skb->data = data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract); diff --git a/net/core/sock.c b/net/core/sock.c index 6c1c8bc93412..08bf97eceeb3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -221,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , + "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -237,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , + "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -253,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , + "clock-AF_MAX" }; /* @@ -402,9 +405,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) } -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -414,10 +416,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return -ENOMEM; } - err = sk_filter(sk, skb); - if (err) - return err; - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; @@ -440,6 +438,18 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk->sk_data_ready(sk); return 0; } +EXPORT_SYMBOL(__sock_queue_rcv_skb); + +int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + + err = sk_filter(sk, skb); + if (err) + return err; + + return __sock_queue_rcv_skb(sk, skb); +} EXPORT_SYMBOL(sock_queue_rcv_skb); int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) @@ -832,7 +842,8 @@ set_rcvbuf: !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - if (sk->sk_state != TCP_ESTABLISHED) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) { ret = -EINVAL; break; } @@ -987,6 +998,10 @@ set_rcvbuf: sk->sk_incoming_cpu = val; break; + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; default: ret = -ENOPROTOOPT; break; @@ -1414,8 +1429,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -void sk_destruct(struct sock *sk) +/* Sockets having SOCK_RCU_FREE will call this function after one RCU + * grace period. This is the case for UDP sockets and TCP listeners. + */ +static void __sk_destruct(struct rcu_head *head) { + struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) @@ -1444,6 +1463,14 @@ void sk_destruct(struct sock *sk) sk_prot_free(sk->sk_prot_creator, sk); } +void sk_destruct(struct sock *sk) +{ + if (sock_flag(sk, SOCK_RCU_FREE)) + call_rcu(&sk->sk_rcu, __sk_destruct); + else + __sk_destruct(&sk->sk_rcu); +} + static void __sk_free(struct sock *sk) { if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) @@ -1508,6 +1535,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -1531,6 +1559,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk = NULL; goto out; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1626,6 +1655,17 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +/* This variant of sock_wfree() is used by TCP, + * since it sets SOCK_USE_WRITE_QUEUE. + */ +void __sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + __sk_free(sk); +} + void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); @@ -1648,8 +1688,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +/* This helper is used by netem, as it can hold packets in its + * delay queue. We want to allow the owner socket to send more + * packets, as if they were already TX completed by a typical driver. + * But we also want to keep skb->sk set because some packet schedulers + * rely on it (sch_fq for example). So we set skb->truesize to a small + * amount (1) and decrease sk_wmem_alloc accordingly. + */ void skb_orphan_partial(struct sk_buff *skb) { + /* If this skb is a TCP pure ACK or already went here, + * we have nothing to do. 2 is already a very small truesize. + */ + if (skb->truesize <= 2) + return; + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, * so we do not completely orphan skb, but transfert all * accounted bytes but one, to avoid unexpected reorders. @@ -1861,27 +1914,51 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) +{ + u32 tsflags; + + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + case SO_TIMESTAMPING: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + + tsflags = *(u32 *)CMSG_DATA(cmsg); + if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) + return -EINVAL; + + sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; + sockc->tsflags |= tsflags; + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(__sock_cmsg_send); + int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; + int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - switch (cmsg->cmsg_type) { - case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) - return -EINVAL; - sockc->mark = *(u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } + ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + if (ret) + return ret; } return 0; } @@ -1903,7 +1980,7 @@ EXPORT_SYMBOL(sock_cmsg_send); bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { - if (atomic_read(&pfrag->page->_count) == 1) { + if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } @@ -1966,33 +2043,27 @@ static void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { - struct sk_buff *skb = sk->sk_backlog.head; + struct sk_buff *skb, *next; - do { + while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; - bh_unlock_sock(sk); - do { - struct sk_buff *next = skb->next; + spin_unlock_bh(&sk->sk_lock.slock); + do { + next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); - /* - * We are in process context here with softirqs - * disabled, use cond_resched_softirq() to preempt. - * This is safe to do because we've taken the backlog - * queue private: - */ - cond_resched_softirq(); + cond_resched(); skb = next; } while (skb != NULL); - bh_lock_sock(sk); - } while ((skb = sk->sk_backlog.head) != NULL); + spin_lock_bh(&sk->sk_lock.slock); + } /* * Doing the zeroing here guarantee we can not loop forever @@ -2001,6 +2072,13 @@ static void __release_sock(struct sock *sk) sk->sk_backlog.len = 0; } +void __sk_flush_backlog(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + __release_sock(sk); + spin_unlock_bh(&sk->sk_lock.slock); +} + /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on @@ -2137,6 +2215,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount) } EXPORT_SYMBOL(__sk_mem_reclaim); +int sk_set_peek_off(struct sock *sk, int val) +{ + if (val < 0) + return -EINVAL; + + sk->sk_peek_off = val; + return 0; +} +EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when @@ -2424,11 +2511,6 @@ EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { - /* - * The sk_lock has mutex_unlock() semantics: - */ - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a996ce8c8fb2..6b10573cc9fa 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -67,6 +67,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); return nla_put(skb, attrtype, sizeof(mem), &mem); } @@ -119,7 +120,7 @@ static size_t sock_diag_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ - + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ + + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } static void sock_diag_broadcast_destroy_work(struct work_struct *work) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a6beb7b6ae55..0df2aa652530 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, +# endif #endif { .procname = "netdev_tstamp_prequeue", |