Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile            |    3
-rw-r--r--  net/core/datagram.c          |    9
-rw-r--r--  net/core/dev.c               |  152
-rw-r--r--  net/core/devlink.c           | 1768
-rw-r--r--  net/core/dst.c               |   10
-rw-r--r--  net/core/dst_cache.c         |  168
-rw-r--r--  net/core/ethtool.c           |  645
-rw-r--r--  net/core/fib_rules.c         |    4
-rw-r--r--  net/core/filter.c            |  395
-rw-r--r--  net/core/flow.c              |   14
-rw-r--r--  net/core/flow_dissector.c    |   58
-rw-r--r--  net/core/gen_estimator.c     |    2
-rw-r--r--  net/core/gen_stats.c         |   36
-rw-r--r--  net/core/hwbm.c              |   87
-rw-r--r--  net/core/lwtunnel.c          |   37
-rw-r--r--  net/core/neighbour.c         |   22
-rw-r--r--  net/core/net-procfs.c        |    3
-rw-r--r--  net/core/net-sysfs.c         |   18
-rw-r--r--  net/core/netclassid_cgroup.c |    1
-rw-r--r--  net/core/netprio_cgroup.c    |    1
-rw-r--r--  net/core/pktgen.c            |    5
-rw-r--r--  net/core/rtnetlink.c         |  400
-rw-r--r--  net/core/skbuff.c            |  444
-rw-r--r--  net/core/sock.c              |  166
-rw-r--r--  net/core/sock_diag.c         |    3
-rw-r--r--  net/core/sysctl_net_core.c   |    9
26 files changed, 4098 insertions(+), 362 deletions(-)
diff --git a/net/core/Makefile b/net/core/Makefile
index 0b835de04de3..d6508c2ddca5 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,3 +24,6 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fa9dc6450b08..b7de71f8d5d3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_free_datagram);
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
{
bool slow;
if (likely(atomic_read(&skb->users) == 1))
smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
+ else if (likely(!atomic_dec_and_test(&skb->users))) {
+ sk_peek_offset_bwd(sk, len);
return;
+ }
slow = lock_sock_fast(sk);
+ sk_peek_offset_bwd(sk, len);
skb_orphan(skb);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
@@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
/* skb is now orphaned, can be freed outside of locked section */
__kfree_skb(skb);
}
-EXPORT_SYMBOL(skb_free_datagram_locked);
+EXPORT_SYMBOL(__skb_free_datagram_locked);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
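
Note: existing callers of the old skb_free_datagram_locked() entry point can keep
working across this rename if the old name is kept as a trivial wrapper passing
len == 0, so no peek offset is rewound. A minimal sketch (in the kernel such a
wrapper would live as a static inline next to the other skb helpers):

    static inline void skb_free_datagram_locked(struct sock *sk,
                                                struct sk_buff *skb)
    {
            /* len == 0: nothing was consumed, no sk_peek_offset_bwd() rewind */
            __skb_free_datagram_locked(sk, skb, 0);
    }
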
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ef061b2badc..904ff431d570 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
unsigned int len;
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
* taps currently in use.
*/
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
@@ -2711,6 +2712,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
return ERR_PTR(err);
}
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
@@ -2802,7 +2816,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
- features &= ~NETIF_F_CSUM_MASK;
+ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -2825,14 +2839,45 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb,
return vlan_features_check(skb, features);
}
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+ if (gso_segs > dev->gso_max_segs)
+ return features & ~NETIF_F_GSO_MASK;
+
+ /* Support for GSO partial features requires software
+ * intervention before we can actually process the packets
+ * so we need to strip support for any partial features now
+ * and we can pull them back in after we have partially
+ * segmented the frame.
+ */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+ features &= ~dev->gso_partial_features;
+
+ /* Make sure to clear the IPv4 ID mangling feature if the
+ * IPv4 header has the potential to be fragmented.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ struct iphdr *iph = skb->encapsulation ?
+ inner_ip_hdr(skb) : ip_hdr(skb);
+
+ if (!(iph->frag_off & htons(IP_DF)))
+ features &= ~NETIF_F_TSO_MANGLEID;
+ }
+
+ return features;
+}
+
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
- u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
- features &= ~NETIF_F_GSO_MASK;
+ if (skb_is_gso(skb))
+ features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
@@ -2915,9 +2960,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
{
netdev_features_t features;
- if (skb->next)
- return skb;
-
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -2960,6 +3002,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
out_kfree_skb:
kfree_skb(skb);
out_null:
+ atomic_long_inc(&dev->tx_dropped);
return NULL;
}
@@ -3143,12 +3186,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
*ret = NET_XMIT_DROP;
- goto drop;
+ kfree_skb(skb);
+ return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*ret = NET_XMIT_SUCCESS;
-drop:
- kfree_skb(skb);
+ consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
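
Note: the kfree_skb()/consume_skb() split above is deliberate: kfree_skb() fires
the kfree_skb tracepoint and is reserved for genuine drops (TC_ACT_SHOT), while
consume_skb() signals intentional consumption (TC_ACT_STOLEN/TC_ACT_QUEUED), so
drop monitors stay quiet for packets the action layer took on purpose.
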
@@ -3349,7 +3392,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb = validate_xmit_skb(skb, dev);
if (!skb)
- goto drop;
+ goto out;
HARD_TX_LOCK(dev, txq, cpu);
@@ -3376,7 +3419,6 @@ recursion_alert:
}
rc = -ENETDOWN;
-drop:
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
@@ -3428,6 +3470,7 @@ u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
+EXPORT_SYMBOL(rps_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3829,8 +3872,14 @@ static void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
- __kfree_skb(skb);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
}
+
+ __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -3908,9 +3957,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
break;
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
+ kfree_skb(skb);
+ return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
- kfree_skb(skb);
+ consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
@@ -4154,7 +4205,10 @@ ncls:
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
- atomic_long_inc(&skb->dev->rx_dropped);
+ if (!deliver_exact)
+ atomic_long_inc(&skb->dev->rx_dropped);
+ else
+ atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
@@ -4429,7 +4483,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->udp_mark = 0;
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->is_fou = 0;
+ NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
@@ -4654,6 +4710,8 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
napi_reuse_skb(napi, skb);
return NULL;
}
@@ -4928,8 +4986,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
netpoll_poll_unlock(have);
}
if (rc > 0)
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+ __NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
if (rc == LL_FLUSH_FAILED)
@@ -5152,6 +5210,7 @@ static void net_rx_action(struct softirq_action *h)
}
}
+ __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -6435,6 +6494,7 @@ EXPORT_SYMBOL(dev_get_phys_port_id);
* dev_get_phys_port_name - Get device physical port name
* @dev: device
* @name: port name
+ * @len: limit of bytes to copy to name
*
* Get device physical port name
*/
@@ -6664,6 +6724,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_TSO6;
}
+ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
+ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
+ features &= ~NETIF_F_TSO_MANGLEID;
+
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
@@ -6692,6 +6756,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ /* GSO partial features require GSO partial be set */
+ if ((features & dev->gso_partial_features) &&
+ !(features & NETIF_F_GSO_PARTIAL)) {
+ netdev_dbg(dev,
+ "Dropping partially supported GSO features since no GSO partial.\n");
+ features &= ~dev->gso_partial_features;
+ }
+
#ifdef CONFIG_NET_RX_BUSY_POLL
if (dev->netdev_ops->ndo_busy_poll)
features |= NETIF_F_BUSY_POLL;
@@ -6962,9 +7034,22 @@ int register_netdevice(struct net_device *dev)
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
- if (!(dev->flags & IFF_LOOPBACK)) {
+ if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
- }
+
+ /* If IPv4 TCP segmentation offload is supported we should also
+ * allow the device to enable segmenting the frame with the option
+ * of ignoring a static IP ID value. This doesn't enable the
+ * feature itself but allows the user to enable it later.
+ */
+ if (dev->hw_features & NETIF_F_TSO)
+ dev->hw_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->vlan_features & NETIF_F_TSO)
+ dev->vlan_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->mpls_features & NETIF_F_TSO)
+ dev->mpls_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->hw_enc_features & NETIF_F_TSO)
+ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
@@ -6972,7 +7057,7 @@ int register_netdevice(struct net_device *dev)
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
- dev->hw_enc_features |= NETIF_F_SG;
+ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
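
Note: the GSO-partial plumbing above only takes effect once a driver opts in. A
hypothetical driver that can segment checksummed GRE tunnel frames in partial
mode might advertise it roughly like this (the driver context is invented; the
feature flags are the ones used in this series):

    dev->gso_partial_features = NETIF_F_GSO_GRE_CSUM;
    dev->hw_features |= NETIF_F_GSO_PARTIAL | dev->gso_partial_features;
    dev->features    |= NETIF_F_GSO_PARTIAL | dev->gso_partial_features;
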
@@ -7253,24 +7338,31 @@ void netdev_run_todo(void)
}
}
-/* Convert net_device_stats to rtnl_link_stats64. They have the same
- * fields in the same order, with only the type differing.
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
*/
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
memcpy(stats64, netdev_stats, sizeof(*stats64));
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + sizeof(*netdev_stats), 0,
+ sizeof(*stats64) - sizeof(*netdev_stats));
#else
- size_t i, n = sizeof(*stats64) / sizeof(u64);
+ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
const unsigned long *src = (const unsigned long *)netdev_stats;
u64 *dst = (u64 *)stats64;
- BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
- sizeof(*stats64) / sizeof(u64));
+ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
dst[i] = src[i];
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + n * sizeof(u64), 0,
+ sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
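
Note: the relaxed BUILD_BUG_ON()s encode the layout contract stated in the new
comment: rtnl_link_stats64 must begin with the same counters, in the same order,
as net_device_stats, and may only grow at the tail. Schematically (field lists
abbreviated; rx_nohandler is the new trailing counter from this series):

    struct net_device_stats  { unsigned long rx_packets, /* ... */ tx_compressed; };
    struct rtnl_link_stats64 { __u64 rx_packets, /* ... */ tx_compressed,
                               rx_nohandler; /* newer counters appended here */ };
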
@@ -7300,6 +7392,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -7407,7 +7500,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
- dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..933e8d4d3968
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,1768 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards the devlink devices list, and it is taken when
+ * a driver registers/unregisters a devlink instance.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+ write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0 &&
+ net_eq(devlink_net(devlink), net))
+ return devlink;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+ return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ int port_index)
+{
+ struct devlink_port *devlink_port;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (devlink_port->index == port_index)
+ return devlink_port;
+ }
+ return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+ return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+struct devlink_sb {
+ struct list_head list;
+ unsigned int index;
+ u32 size;
+ u16 ingress_pools_count;
+ u16 egress_pools_count;
+ u16 ingress_tc_count;
+ u16 egress_tc_count;
+};
+
+static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
+{
+ return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
+}
+
+static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (devlink_sb->index == sb_index)
+ return devlink_sb;
+ }
+ return NULL;
+}
+
+static bool devlink_sb_index_exists(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ return devlink_sb_get_by_index(devlink, sb_index);
+}
+
+static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_SB_INDEX]) {
+ u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ if (!devlink_sb)
+ return ERR_PTR(-ENODEV);
+ return devlink_sb;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_sb_get_from_attrs(devlink, info->attrs);
+}
+
+static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ u16 *p_pool_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
+ if (val >= devlink_sb_pool_count(devlink_sb))
+ return -EINVAL;
+ *p_pool_index = val;
+ return 0;
+}
+
+static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ u16 *p_pool_index)
+{
+ return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
+ p_pool_index);
+}
+
+static int
+devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
+ if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val != DEVLINK_SB_POOL_TYPE_EGRESS)
+ return -EINVAL;
+ *p_pool_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_pool_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
+}
+
+static int
+devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
+ if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
+ val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
+ return -EINVAL;
+ *p_th_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_th_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
+}
+
+static int
+devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
+ if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val >= devlink_sb->ingress_tc_count)
+ return -EINVAL;
+ if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
+ val >= devlink_sb->egress_tc_count)
+ return -EINVAL;
+ *p_tc_index = val;
+ return 0;
+}
+
+static int
+devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
+ pool_type, p_tc_index);
+}
+
+#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
+#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3)
+ /* ports are not needed but we need to ensure they don't
+ * change in the middle of the command
+ */
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_info(info);
+ if (IS_ERR(devlink)) {
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink);
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+ info->user_ptr[0] = devlink;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ struct devlink_port *devlink_port;
+
+ mutex_lock(&devlink_port_mutex);
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink_port);
+ }
+ info->user_ptr[0] = devlink_port;
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
+ mutex_lock(&devlink_port_mutex);
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb)) {
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink_sb);
+ }
+ info->user_ptr[1] = devlink_sb;
+ }
+ return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
+ ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net_device *netdev = devlink_port->type_dev;
+
+ if (netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ netdev->ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ netdev->name)))
+ goto nla_put_failure;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_dev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure;
+ }
+ if (devlink_port->split &&
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ devlink_port->split_group))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink_port->registered)
+ return;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_port *devlink_port;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+
+{
+ int err;
+
+ if (devlink->ops && devlink->ops->port_type_set) {
+ if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+ return -EINVAL;
+ err = devlink->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink, devlink_port, port_type);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+ u32 port_index, u32 count)
+
+{
+ if (devlink->ops && devlink->ops->port_split)
+ return devlink->ops->port_split(devlink, port_index, count);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+ u32 count;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+ !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+ return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+ if (devlink->ops && devlink->ops->port_unsplit)
+ return devlink->ops->port_unsplit(devlink, port_index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ return devlink_port_unsplit(devlink, port_index);
+}
+
+static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
+ devlink_sb->ingress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
+ devlink_sb->egress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
+ devlink_sb->ingress_tc_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
+ devlink_sb->egress_tc_count))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index, enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_sb_pool_info pool_info;
+ void *hdr;
+ int err;
+
+ err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
+ pool_index, &pool_info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ pool_info.threshold_type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_pool_fill(msg, devlink,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ portid, seq, NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_pool_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type)
+
+{
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_pool_set)
+ return ops->sb_pool_set(devlink, sb_index, pool_index,
+ size, threshold_type);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_threshold_type threshold_type;
+ u16 pool_index;
+ u32 size;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_th_type_get_from_info(info, &threshold_type);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
+ return -EINVAL;
+
+ size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
+ return devlink_sb_pool_set(devlink, devlink_sb->index,
+ pool_index, size, threshold_type);
+}
+
+static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_port_pool_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
+ devlink_sb, pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_port_pool_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_port_pool_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_port_pool_get_dumpit(msg, start, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_port_pool_set)
+ return ops->sb_port_pool_set(devlink_port, sb_index,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
+ pool_index, threshold);
+}
+
+static int
+devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u16 pool_index;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ &pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_tc_port_bind_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_tc_port_bind_get(devlink_port,
+ devlink_sb->index,
+ tc_index, pool_type,
+ &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
+ devlink_sb, tc_index, pool_type,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ info->snd_portid,
+ info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 tc_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (tc_index = 0;
+ tc_index < devlink_sb->ingress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_INGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ for (tc_index = 0;
+ tc_index < devlink_sb->egress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_EGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int
+devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
+ devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_tc_pool_bind_set)
+ return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ pool_index, threshold);
+}
+
+static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_snapshot)
+ return ops->sb_occ_snapshot(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_max_clear)
+ return ops->sb_occ_max_clear(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .doit = devlink_nl_cmd_get_doit,
+ .dumpit = devlink_nl_cmd_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .doit = devlink_nl_cmd_port_get_doit,
+ .dumpit = devlink_nl_cmd_port_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .doit = devlink_nl_cmd_port_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .doit = devlink_nl_cmd_port_split_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .doit = devlink_nl_cmd_port_unsplit_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .doit = devlink_nl_cmd_sb_get_doit,
+ .dumpit = devlink_nl_cmd_sb_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .doit = devlink_nl_cmd_sb_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_SET,
+ .doit = devlink_nl_cmd_sb_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .doit = devlink_nl_cmd_sb_port_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+ .doit = devlink_nl_cmd_sb_port_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
+ .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+ .doit = devlink_nl_cmd_sb_occ_snapshot_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB |
+ DEVLINK_NL_FLAG_LOCK_PORTS,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ .doit = devlink_nl_cmd_sb_occ_max_clear_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB |
+ DEVLINK_NL_FLAG_LOCK_PORTS,
+ },
+};
+
+/**
+ * devlink_alloc - Allocate new devlink instance resources
+ *
+ * @ops: devlink ops callbacks
+ * @priv_size: size of user private data
+ *
+ * Allocate new devlink instance resources, including devlink index
+ * and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+ struct devlink *devlink;
+
+ devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+ devlink->ops = ops;
+ devlink_net_set(devlink, &init_net);
+ INIT_LIST_HEAD(&devlink->port_list);
+ INIT_LIST_HEAD(&devlink->sb_list);
+ return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ * devlink_register - Register devlink instance
+ *
+ * @devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+ mutex_lock(&devlink_mutex);
+ devlink->dev = dev;
+ list_add_tail(&devlink->list, &devlink_list);
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ mutex_unlock(&devlink_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devlink_unregister - Unregister devlink instance
+ *
+ * @devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+ list_del(&devlink->list);
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ * devlink_port_register - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ *
+ * Register devlink port with provided port index. The caller can use
+ * any indexing, even a hw-related one. The devlink_port structure is
+ * convenient to embed inside the driver's private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index)
+{
+ mutex_lock(&devlink_port_mutex);
+ if (devlink_port_index_exists(devlink, port_index)) {
+ mutex_unlock(&devlink_port_mutex);
+ return -EEXIST;
+ }
+ devlink_port->devlink = devlink;
+ devlink_port->index = port_index;
+ devlink_port->registered = true;
+ list_add_tail(&devlink_port->list, &devlink->port_list);
+ mutex_unlock(&devlink_port_mutex);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ mutex_lock(&devlink_port_mutex);
+ list_del(&devlink_port->list);
+ mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ devlink_port->type = type;
+ devlink_port->type_dev = type_dev;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ * devlink_port_split_set - Mark a port as split
+ *
+ * @devlink_port: devlink port
+ * @split_group: split group identifier, shared by the ports that
+ *		 resulted from splitting one physical port
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+ u32 split_group)
+{
+ devlink_port->split = true;
+ devlink_port->split_group = split_group;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
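+
+/* Usage sketch (hypothetical driver; the "mydrv" names are
+ * illustrative only): a typical probe path wires the above together
+ * as follows:
+ *
+ *	devlink = devlink_alloc(&mydrv_devlink_ops, sizeof(struct mydrv));
+ *	if (!devlink)
+ *		return -ENOMEM;
+ *	err = devlink_register(devlink, dev);
+ *	if (err)
+ *		goto err_free;
+ *	err = devlink_port_register(devlink, &port->devlink_port, 0);
+ *	if (err)
+ *		goto err_unregister;
+ *	devlink_port_type_eth_set(&port->devlink_port, netdev);
+ *
+ * Teardown runs in reverse: devlink_port_unregister(),
+ * devlink_unregister() and finally devlink_free().
+ */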
+
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ struct devlink_sb *devlink_sb;
+ int err = 0;
+
+ mutex_lock(&devlink_mutex);
+ if (devlink_sb_index_exists(devlink, sb_index)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
+ if (!devlink_sb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ devlink_sb->index = sb_index;
+ devlink_sb->size = size;
+ devlink_sb->ingress_pools_count = ingress_pools_count;
+ devlink_sb->egress_pools_count = egress_pools_count;
+ devlink_sb->ingress_tc_count = ingress_tc_count;
+ devlink_sb->egress_tc_count = egress_tc_count;
+ list_add_tail(&devlink_sb->list, &devlink->sb_list);
+unlock:
+ mutex_unlock(&devlink_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_sb_register);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ mutex_lock(&devlink_mutex);
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ WARN_ON(!devlink_sb);
+ list_del(&devlink_sb->list);
+ mutex_unlock(&devlink_mutex);
+ kfree(devlink_sb);
+}
+EXPORT_SYMBOL_GPL(devlink_sb_unregister);
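+
+/* Usage sketch (hypothetical device parameters): a device exposing one
+ * 16 MB shared buffer with four ingress/egress pools and eight TC
+ * bindings per direction would register it as:
+ *
+ *	err = devlink_sb_register(devlink, 0, 16 * 1024 * 1024,
+ *				  4, 4, 8, 8);
+ *
+ * and tear it down with devlink_sb_unregister(devlink, 0) before
+ * devlink_unregister().
+ */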
+
+static int __init devlink_module_init(void)
+{
+ return genl_register_family_with_ops_groups(&devlink_nl_family,
+ devlink_nl_ops,
+ devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+ genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
diff --git a/net/core/dst.c b/net/core/dst.c
index a1656e3b8d72..b5cbbe07f786 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,7 +265,7 @@ again:
lwtstate_put(dst->lwtstate);
if (dst->flags & DST_METADATA)
- kfree(dst);
+ metadata_dst_free((struct metadata_dst *)dst);
else
kmem_cache_free(dst->ops->kmem_cachep, dst);
@@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+ kfree(md_dst);
+}
+
struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
{
int cpu;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000000..554d36449231
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+ unsigned long refresh_ts;
+ struct dst_entry *dst;
+ u32 cookie;
+ union {
+ struct in_addr in_saddr;
+ struct in6_addr in6_saddr;
+ };
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+ struct dst_entry *dst, u32 cookie)
+{
+ dst_release(dst_cache->dst);
+ if (dst)
+ dst_hold(dst);
+
+ dst_cache->cookie = cookie;
+ dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+ struct dst_cache_pcpu *idst)
+{
+ struct dst_entry *dst;
+
+ dst = idst->dst;
+ if (!dst)
+ goto fail;
+
+	/* the cache already holds a dst reference; it can't go away */
+ dst_hold(dst);
+
+ if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+ dst_cache_per_cpu_dst_set(idst, NULL, 0);
+ dst_release(dst);
+ goto fail;
+ }
+ return dst;
+
+fail:
+ idst->refresh_ts = jiffies;
+ return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+ if (!dst_cache->cache)
+ return NULL;
+
+ return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in_saddr.s_addr;
+ return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+ __be32 saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst, 0);
+ idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+ const struct in6_addr *addr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(idst, dst,
+				  rt6_get_cookie((struct rt6_info *)dst));
+ idst->in6_saddr = *addr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+ struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in6_saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+ dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+ gfp | __GFP_ZERO);
+ if (!dst_cache->cache)
+ return -ENOMEM;
+
+ dst_cache_reset(dst_cache);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ for_each_possible_cpu(i)
+ dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+ free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
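+
+/* Usage sketch (hypothetical tunnel driver): embed a struct dst_cache
+ * in the tunnel state, call dst_cache_init() once at setup time, and
+ * consult the cache before each route lookup on the xmit path:
+ *
+ *	rt = dst_cache_get_ip4(&tun->dst_cache, &fl4.saddr);
+ *	if (!rt) {
+ *		rt = ip_route_output_key(net, &fl4);
+ *		if (IS_ERR(rt))
+ *			return PTR_ERR(rt);
+ *		dst_cache_set_ip4(&tun->dst_cache, &rt->dst, fl4.saddr);
+ *	}
+ *
+ * dst_cache_destroy() must be called before the structure is freed.
+ */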
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index daf04709dd3c..bdb4013581b1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,12 +79,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
[NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
[NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -98,6 +102,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
};
static const char
@@ -386,43 +391,464 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
return 0;
}
-int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+ u32 legacy_u32)
{
+ bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ dst[0] = legacy_u32;
+}
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
+
+/* return false if src had higher bits set. lower bits always updated. */
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+ const unsigned long *src)
+{
+ bool retval = true;
+
+ /* TODO: following test will soon always be true */
+ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+ bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_fill(ext, 32);
+ bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ if (bitmap_intersects(ext, src,
+ __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+ /* src mask goes beyond bit 31 */
+ retval = false;
+ }
+ }
+ *legacy_u32 = src[0];
+ return retval;
+}
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
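+
+/* Usage sketch: a driver migrating from the legacy API can build its
+ * link mode bitmaps from the old SUPPORTED_* and ADVERTISED_* masks,
+ * e.g. in its get_link_ksettings() implementation:
+ *
+ *	ethtool_convert_legacy_u32_to_link_mode(
+ *		ks->link_modes.supported,
+ *		SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_TP);
+ */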
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings)
+{
+ bool retval = true;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that the driver is still using
+	 * these deprecated legacy fields, and that they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+ */
+ if (legacy_settings->transceiver ||
+ legacy_settings->maxtxpkt ||
+ legacy_settings->maxrxpkt)
+ retval = false;
+
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.supported,
+ legacy_settings->supported);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.advertising,
+ legacy_settings->advertising);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.lp_advertising,
+ legacy_settings->lp_advertising);
+ link_ksettings->base.speed
+ = ethtool_cmd_speed(legacy_settings);
+ link_ksettings->base.duplex
+ = legacy_settings->duplex;
+ link_ksettings->base.port
+ = legacy_settings->port;
+ link_ksettings->base.phy_address
+ = legacy_settings->phy_address;
+ link_ksettings->base.autoneg
+ = legacy_settings->autoneg;
+ link_ksettings->base.mdio_support
+ = legacy_settings->mdio_support;
+ link_ksettings->base.eth_tp_mdix
+ = legacy_settings->eth_tp_mdix;
+ link_ksettings->base.eth_tp_mdix_ctrl
+ = legacy_settings->eth_tp_mdix_ctrl;
+ return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+ struct ethtool_cmd *legacy_settings,
+ const struct ethtool_link_ksettings *link_ksettings)
+{
+ bool retval = true;
+
+ memset(legacy_settings, 0, sizeof(*legacy_settings));
+ /* this also clears the deprecated fields in legacy structure:
+ * __u8 transceiver;
+ * __u32 maxtxpkt;
+ * __u32 maxrxpkt;
+ */
+
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->supported,
+ link_ksettings->link_modes.supported);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->advertising,
+ link_ksettings->link_modes.advertising);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->lp_advertising,
+ link_ksettings->link_modes.lp_advertising);
+ ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+ legacy_settings->duplex
+ = link_ksettings->base.duplex;
+ legacy_settings->port
+ = link_ksettings->base.port;
+ legacy_settings->phy_address
+ = link_ksettings->base.phy_address;
+ legacy_settings->autoneg
+ = link_ksettings->base.autoneg;
+ legacy_settings->mdio_support
+ = link_ksettings->base.mdio_support;
+ legacy_settings->eth_tp_mdix
+ = link_ksettings->base.eth_tp_mdix;
+ legacy_settings->eth_tp_mdix_ctrl
+ = link_ksettings->base.eth_tp_mdix_ctrl;
+ return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32 \
+ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+ struct ethtool_link_settings base;
+ struct {
+ __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ } link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *link_ksettings)
+{
+ int err;
+ struct ethtool_cmd cmd;
+
ASSERT_RTNL();
+ if (dev->ethtool_ops->get_link_ksettings) {
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev,
+ link_ksettings);
+ }
+
+ /* driver doesn't support %ethtool_link_ksettings API. revert to
+ * legacy %ethtool_cmd API, unless it's not supported either.
+ * TODO: remove when ethtool_ops::get_settings disappears internally
+ */
if (!dev->ethtool_ops->get_settings)
return -EOPNOTSUPP;
- memset(cmd, 0, sizeof(struct ethtool_cmd));
- cmd->cmd = ETHTOOL_GSET;
- return dev->ethtool_ops->get_settings(dev, cmd);
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+
+ /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+ */
+ convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+ return err;
}
-EXPORT_SYMBOL(__ethtool_get_settings);
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+ const void __user *from)
{
- int err;
- struct ethtool_cmd cmd;
+ struct ethtool_link_usettings link_usettings;
+
+ if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+ return -EFAULT;
+
+ memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+ bitmap_from_u32array(to->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+ bitmap_from_u32array(to->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+ bitmap_from_u32array(to->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+
+ return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+ const struct ethtool_link_ksettings *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+	memcpy(&link_usettings.base, &from->base,
+	       sizeof(link_usettings.base));
+ bitmap_to_u32array(link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_u32array(link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_link_ksettings link_ksettings;
- err = __ethtool_get_settings(dev, &cmd);
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* handle bitmap nbits handshake */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords) {
+ /* wrong link mode nbits requested */
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ /* send back number of words required as negative val */
+ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+ "need too many bits for link modes!");
+ link_ksettings.base.link_mode_masks_nwords
+ = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+ /* copy the base fields back to user, not the link
+ * mode bitmaps
+ */
+ if (copy_to_user(useraddr, &link_ksettings.base,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ /* handshake successful: user/kernel agree on
+ * link_mode_masks_nwords
+ */
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
if (err < 0)
return err;
+ /* make sure we tell the right values to user */
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+
+ return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
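+
+/* Handshake sketch from the user side (illustrative; error handling
+ * omitted): pass nwords == 0 first, read the required count back from
+ * the negated reply, then repeat the ioctl with that count:
+ *
+ *	struct ethtool_link_settings req = {
+ *		.cmd = ETHTOOL_GLINKSETTINGS,
+ *	};
+ *
+ *	ifr.ifr_data = (void *)&req;
+ *	ioctl(fd, SIOCETHTOOL, &ifr);
+ *	nwords = -req.link_mode_masks_nwords;
+ *
+ * then allocate room for the three nwords-long __u32 masks and issue
+ * the ioctl again with link_mode_masks_nwords set to nwords.
+ */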
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::set_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* make sure nbits field has expected value */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+	/* copy the whole structure, now that we know it has the
+	 * expected format
+	 */
+ err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+ if (err)
+ return err;
+
+ /* re-check nwords field, just in case */
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+ get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with the name of the 1st
+ * driver/device) to recommend upgrading ethtool, but the command
+ * succeeds (only the lower link mode bits are reported back to user).
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_cmd cmd;
+
+ ASSERT_RTNL();
+
+ if (dev->ethtool_ops->get_link_ksettings) {
+ /* First, use link_ksettings API if it is supported */
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev,
+ &link_ksettings);
+ if (err < 0)
+ return err;
+ if (!convert_link_ksettings_to_legacy_settings(&cmd,
+ &link_ksettings))
+ warn_incomplete_ethtool_legacy_settings_conversion(
+ "link modes are only partially reported");
+
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
+ } else {
+ /* driver doesn't support %ethtool_link_ksettings
+ * API. revert to legacy %ethtool_cmd API, unless it's
+ * not supported either.
+ */
+ int err;
+
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+ }
+
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
+
return 0;
}
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with the name of the 1st driver/device) to
+ * recommend upgrading ethtool, and the request is rejected.
+ */
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_cmd cmd;
- if (!dev->ethtool_ops->set_settings)
- return -EOPNOTSUPP;
+ ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
+ /* first, try new %ethtool_link_ksettings API. */
+ if (dev->ethtool_ops->set_link_ksettings) {
+ struct ethtool_link_ksettings link_ksettings;
+
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+ &cmd))
+ return -EINVAL;
+
+ link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+ return dev->ethtool_ops->set_link_ksettings(dev,
+ &link_ksettings);
+ }
+
+ /* legacy %ethtool_cmd API */
+
+ /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+ * disappears internally
+ */
+
+ if (!dev->ethtool_ops->set_settings)
+ return -EOPNOTSUPP;
+
return dev->ethtool_ops->set_settings(dev, &cmd);
}
@@ -632,7 +1058,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
return 0;
}
-u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len)
{
@@ -642,6 +1068,37 @@ void netdev_rss_key_fill(void *buffer, size_t len)
}
EXPORT_SYMBOL(netdev_rss_key_fill);
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+ u32 dev_size, current_max = 0;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ while (dev_size--)
+ current_max = max(current_max, indir[dev_size]);
+
+ *max = current_max;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -738,6 +1195,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (user_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(indir);
@@ -897,6 +1362,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(rss_config);
@@ -1227,14 +1700,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels;
+ struct ethtool_channels channels, max;
+ u32 max_rx_in_use = 0;
- if (!dev->ethtool_ops->set_channels)
+ if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
+ dev->ethtool_ops->get_channels(dev, &max);
+
+ /* ensure new counts are within the maximums */
+ if ((channels.rx_count > max.max_rx) ||
+ (channels.tx_count > max.max_tx) ||
+ (channels.combined_count > max.max_combined) ||
+ (channels.other_count > max.max_other))
+ return -EINVAL;
+
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table settings
+	 */
+ if (netif_is_rxfh_configured(dev) &&
+ !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+ (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+ return -EINVAL;
+
return dev->ethtool_ops->set_channels(dev, &channels);
}
@@ -1823,13 +2313,121 @@ out:
return ret;
}
+static int ethtool_get_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int ret;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if (!dev->ethtool_ops->get_per_queue_coalesce)
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_u32array(queue_mask,
+ MAX_NUM_QUEUE,
+ per_queue_opt->queue_mask,
+ DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ return ret;
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ useraddr += sizeof(coalesce);
+ }
+
+ return 0;
+}
+
+static int ethtool_set_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int i, ret = 0;
+ int n_queue;
+ struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+	if (!dev->ethtool_ops->set_per_queue_coalesce ||
+	    !dev->ethtool_ops->get_per_queue_coalesce)
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_u32array(queue_mask,
+ MAX_NUM_QUEUE,
+ per_queue_opt->queue_mask,
+ DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+ n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+ tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+ if (!backup)
+ return -ENOMEM;
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce;
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+ if (ret != 0)
+ goto roll_back;
+
+ tmp++;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+ ret = -EFAULT;
+ goto roll_back;
+ }
+
+ ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ goto roll_back;
+
+ useraddr += sizeof(coalesce);
+ }
+
+roll_back:
+ if (ret != 0) {
+ tmp = backup;
+ for_each_set_bit(i, queue_mask, bit) {
+ dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+ tmp++;
+ }
+ }
+ kfree(backup);
+
+ return ret;
+}
+
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_per_queue_op per_queue_opt;
+
+ if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+ return -EFAULT;
+
+ switch (per_queue_opt.sub_command) {
+ case ETHTOOL_GCOALESCE:
+ return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ case ETHTOOL_SCOALESCE:
+ return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ default:
+ return -EOPNOTSUPP;
+	}
+}
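+
+/* Layout sketch of the %ETHTOOL_PERQUEUE user buffer, as consumed by
+ * the two helpers above: a struct ethtool_per_queue_op header is
+ * immediately followed by one struct ethtool_coalesce per bit set in
+ * queue_mask, in ascending queue order:
+ *
+ *	+---------------------------+
+ *	| ethtool_per_queue_op      |  cmd, sub_command, queue_mask[]
+ *	+---------------------------+
+ *	| ethtool_coalesce, queue a |  first set bit in queue_mask
+ *	+---------------------------+
+ *	| ethtool_coalesce, queue b |  next set bit, and so on
+ *	+---------------------------+
+ */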
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
void __user *useraddr = ifr->ifr_data;
- u32 ethcmd;
+ u32 ethcmd, sub_cmd;
int rc;
netdev_features_t old_features;
@@ -1839,8 +2437,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
+ if (ethcmd == ETHTOOL_PERQUEUE) {
+ if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+ return -EFAULT;
+ } else {
+ sub_cmd = ethcmd;
+ }
/* Allow some commands to be done by anyone */
- switch (ethcmd) {
+ switch (sub_cmd) {
case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
@@ -2070,6 +2674,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPHYSTATS:
rc = ethtool_get_phy_stats(dev, useraddr);
break;
+ case ETHTOOL_PERQUEUE:
+ rc = ethtool_set_per_queue(dev, useraddr);
+ break;
+ case ETHTOOL_GLINKSETTINGS:
+ rc = ethtool_get_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_SLINKSETTINGS:
+ rc = ethtool_set_link_ksettings(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 365de66436ac..840acebbb80c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -549,7 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8); /* FRA_TUN_ID */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -607,7 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->target &&
nla_put_u32(skb, FRA_GOTO, rule->target)) ||
(rule->tun_id &&
- nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index bba502f7cd57..68adb5f52110 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -530,12 +530,14 @@ do_pass:
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
- /* RET_K, RET_A are remaped into 2 insns. */
+	/* RET_K is remapped into 2 insns. RET_A case doesn't need an
+ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+ */
case BPF_RET | BPF_A:
case BPF_RET | BPF_K:
- *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
- BPF_K : BPF_X, BPF_REG_0,
- BPF_REG_A, fp->k);
+ if (BPF_RVAL(fp->code) == BPF_K)
+ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+ 0, fp->k);
*insn = BPF_EXIT_INSN();
break;
@@ -992,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
*/
goto out_err_free;
- bpf_prog_select_runtime(fp);
+ /* We are guaranteed to never error here with cBPF to eBPF
+ * transitions, since there's no issue with type compatibility
+ * checks on program arrays.
+ */
+ fp = bpf_prog_select_runtime(fp, &err);
kfree(old_prog);
return fp;
@@ -1164,7 +1170,7 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
}
old_fp = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
rcu_assign_pointer(sk->sk_filter, fp);
if (old_fp)
@@ -1181,7 +1187,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
if (bpf_prog_size(prog->len) > sysctl_optmem_max)
return -ENOMEM;
- if (sk_unhashed(sk)) {
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
err = reuseport_alloc(sk);
if (err)
return err;
@@ -1333,18 +1339,40 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-#define BPF_LDST_LEN 16U
+struct bpf_scratchpad {
+ union {
+ __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
+ u8 buff[MAX_BPF_STACK];
+ };
+};
+
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err;
+
+ if (!skb_cloned(skb))
+ return 0;
+ if (skb_clone_writable(skb, write_len))
+ return 0;
+ err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (!err)
+ bpf_compute_data_end(skb);
+ return err;
+}
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
struct sk_buff *skb = (struct sk_buff *) (long) r1;
int offset = (int) r2;
void *from = (void *) (long) r3;
unsigned int len = (unsigned int) r4;
- char buf[BPF_LDST_LEN];
void *ptr;
- if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
/* bpf verifier guarantees that:
@@ -1355,14 +1383,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
*
* so check for invalid 'offset' and too large 'len'
*/
- if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+ if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + len)))
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
- ptr = skb_header_pointer(skb, offset, len, buf);
+ ptr = skb_header_pointer(skb, offset, len, sp->buff);
if (unlikely(!ptr))
return -EFAULT;
@@ -1371,17 +1397,19 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
memcpy(ptr, from, len);
- if (ptr == buf)
+ if (ptr == sp->buff)
/* skb_store_bits cannot return -EFAULT here */
skb_store_bits(skb, offset, ptr, len);
if (flags & BPF_F_RECOMPUTE_CSUM)
skb_postpush_rcsum(skb, ptr, len);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
return 0;
}
-const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.func = bpf_skb_store_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1400,25 +1428,28 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
unsigned int len = (unsigned int) r4;
void *ptr;
- if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN))
- return -EFAULT;
+ if (unlikely((u32) offset > 0xffff))
+ goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
if (unlikely(!ptr))
- return -EFAULT;
+ goto err_clear;
if (ptr != to)
memcpy(to, ptr, len);
return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
}
-const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.func = bpf_skb_load_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_PTR_TO_RAW_STACK,
.arg4_type = ARG_CONST_STACK_SIZE,
};
@@ -1432,9 +1463,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1442,6 +1471,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EFAULT;
switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
case 2:
csum_replace2(ptr, from, to);
break;
@@ -1459,7 +1494,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.func = bpf_l3_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1474,23 +1509,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
int offset = (int) r2;
__sum16 sum, *ptr;
- if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+ BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
if (unlikely(!ptr))
return -EFAULT;
+ if (is_mmzero && !*ptr)
+ return 0;
switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
break;
@@ -1501,6 +1544,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
}
+ if (is_mmzero && !*ptr)
+ *ptr = CSUM_MANGLED_0;
if (ptr == &sum)
/* skb_store_bits guaranteed to not return -EFAULT here */
skb_store_bits(skb, offset, ptr, sizeof(sum));
@@ -1508,7 +1553,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.func = bpf_l4_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1519,6 +1564,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
+ u64 diff_size = from_size + to_size;
+ __be32 *from = (__be32 *) (long) r1;
+ __be32 *to = (__be32 *) (long) r3;
+ int i, j = 0;
+
+ /* This is quite flexible, some examples:
+ *
+ * from_size == 0, to_size > 0, seed := csum --> pushing data
+ * from_size > 0, to_size == 0, seed := csum --> pulling data
+ * from_size > 0, to_size > 0, seed := 0 --> diffing data
+ *
+ * Even for diffing, from_size and to_size don't need to be equal.
+ */
+ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+ diff_size > sizeof(sp->diff)))
+ return -EINVAL;
+
+ for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = ~from[i];
+ for (i = 0; i < to_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = to[i];
+
+ return csum_partial(sp->diff, diff_size, seed);
+}
+
+static const struct bpf_func_proto bpf_csum_diff_proto = {
+ .func = bpf_csum_diff,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
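+
+/* Usage sketch from a tc BPF program (offsets hypothetical): after
+ * rewriting an IPv4 address in the packet, feed the old and new words
+ * to bpf_csum_diff() and apply the result through the csum replace
+ * helpers with a zero "from" argument, i.e. the case 0 paths added
+ * above:
+ *
+ *	csum = bpf_csum_diff(&old_ip, 4, &new_ip, 4, 0);
+ *	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, 0, csum, BPF_F_PSEUDO_HDR);
+ *	bpf_l3_csum_replace(skb, IP_CSUM_OFF, 0, csum, 0);
+ */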
+
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1543,11 +1627,10 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
}
skb2->dev = dev;
- skb_sender_cpu_clear(skb2);
return dev_queue_xmit(skb2);
}
-const struct bpf_func_proto bpf_clone_redirect_proto = {
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
.func = bpf_clone_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1596,11 +1679,10 @@ int skb_do_redirect(struct sk_buff *skb)
}
skb->dev = dev;
- skb_sender_cpu_clear(skb);
return dev_queue_xmit(skb);
}
-const struct bpf_func_proto bpf_redirect_proto = {
+static const struct bpf_func_proto bpf_redirect_proto = {
.func = bpf_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1622,14 +1704,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
-#ifdef CONFIG_IP_ROUTE_CLASSID
- const struct dst_entry *dst;
-
- dst = skb_dst((struct sk_buff *) (unsigned long) r1);
- if (dst)
- return dst->tclassid;
-#endif
- return 0;
+ return dst_tclassid((struct sk_buff *) (unsigned long) r1);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1643,12 +1718,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
__be16 vlan_proto = (__force __be16) r2;
+ int ret;
if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
vlan_proto != htons(ETH_P_8021AD)))
vlan_proto = htons(ETH_P_8021Q);
- return skb_vlan_push(skb, vlan_proto, vlan_tci);
+ ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_compute_data_end(skb);
+ return ret;
}
const struct bpf_func_proto bpf_skb_vlan_push_proto = {
@@ -1664,8 +1742,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int ret;
- return skb_vlan_pop(skb);
+ ret = skb_vlan_pop(skb);
+ bpf_compute_data_end(skb);
+ return ret;
}
const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
@@ -1682,6 +1763,13 @@ bool bpf_helper_changes_skb_data(void *func)
return true;
if (func == bpf_skb_vlan_pop)
return true;
+ if (func == bpf_skb_store_bytes)
+ return true;
+ if (func == bpf_l3_csum_replace)
+ return true;
+ if (func == bpf_l4_csum_replace)
+ return true;
+
return false;
}
@@ -1696,23 +1784,34 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
u8 compat[sizeof(struct bpf_tunnel_key)];
+ void *to_orig = to;
+ int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
- return -EINVAL;
- if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
- return -EPROTO;
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ err = -EINVAL;
+ goto err_clear;
+ }
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
+ err = -EPROTO;
+ goto err_clear;
+ }
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ err = -EINVAL;
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ goto set_compat;
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
*/
if (ip_tunnel_info_af(info) != AF_INET)
- return -EINVAL;
+ goto err_clear;
+set_compat:
to = (struct bpf_tunnel_key *)compat;
break;
default:
- return -EINVAL;
+ goto err_clear;
}
}
@@ -1720,28 +1819,69 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- if (flags & BPF_F_TUNINFO_IPV6)
+ if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
sizeof(to->remote_ipv6));
- else
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ }
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
- memcpy((void *)(long) r2, to, size);
+ memcpy(to_orig, to, size);
return 0;
+err_clear:
+ memset(to_orig, 0, size);
+ return err;
}
-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.func = bpf_skb_get_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_PTR_TO_RAW_STACK,
.arg3_type = ARG_CONST_STACK_SIZE,
.arg4_type = ARG_ANYTHING,
};
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *to = (u8 *) (long) r2;
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ int err;
+
+ if (unlikely(!info ||
+ !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ err = -ENOENT;
+ goto err_clear;
+ }
+ if (unlikely(size < info->options_len)) {
+ err = -ENOMEM;
+ goto err_clear;
+ }
+
+ ip_tunnel_info_opts_get(to, info);
+ if (size > info->options_len)
+ memset(to + info->options_len, 0, size - info->options_len);
+
+ return info->options_len;
+err_clear:
+ memset(to, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+ .func = bpf_skb_get_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_RAW_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
static struct metadata_dst __percpu *md_dst;
static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1752,10 +1892,13 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
- if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
@@ -1768,6 +1911,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return -EINVAL;
}
}
+ if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+ from->tunnel_ext))
+ return -EINVAL;
skb_dst_drop(skb);
dst_hold((struct dst_entry *) md);
@@ -1776,7 +1922,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info = &md->u.tun_info;
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ if (flags & BPF_F_DONT_FRAGMENT)
+ info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
info->key.ttl = from->tunnel_ttl;
@@ -1785,6 +1934,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info->mode |= IP_TUNNEL_INFO_IPV6;
memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
sizeof(from->remote_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
if (flags & BPF_F_ZERO_CSUM_TX)
@@ -1794,7 +1945,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.func = bpf_skb_set_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1804,17 +1955,53 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *from = (u8 *) (long) r2;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+ if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+ return -EINVAL;
+ if (unlikely(size > IP_TUNNEL_OPTS_MAX))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_set(info, from, size);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .func = bpf_skb_set_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
if (!md_dst) {
- /* race is not possible, since it's called from
- * verifier that is holding verifier mutex
+ /* Race is not possible, since it's called from verifier
+ * that is holding verifier mutex.
*/
- md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ GFP_KERNEL);
if (!md_dst)
return NULL;
}
- return &bpf_skb_set_tunnel_key_proto;
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
}
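+
+/* Usage sketch from a tc egress program (values hypothetical): set the
+ * encap key first, then attach tunnel metadata options, e.g. a Geneve
+ * TLV blob, through the matching opt helper:
+ *
+ *	struct bpf_tunnel_key key = {};
+ *
+ *	key.tunnel_id = 42;
+ *	key.remote_ipv4 = 0x0a000001;	(10.0.0.1 in host byte order)
+ *	bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
+ *	bpf_skb_set_tunnel_opt(skb, opts, opts_len);
+ */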
static const struct bpf_func_proto *
@@ -1851,6 +2038,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -1866,11 +2055,17 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
- return bpf_get_skb_set_tunnel_key_proto();
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
+ case BPF_FUNC_perf_event_output:
+ return bpf_get_event_output_proto();
default:
return sk_filter_func_proto(func_id);
}
@@ -1878,16 +2073,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
-
- /* disallow misaligned access */
+ /* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
-
- /* all __sk_buff fields are __u32 */
- if (size != 4)
+ if (size != sizeof(__u32))
return false;
return true;
@@ -1896,13 +2087,17 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type)
{
- if (off == offsetof(struct __sk_buff, tc_classid))
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ case offsetof(struct __sk_buff, data):
+ case offsetof(struct __sk_buff, data_end):
return false;
+ }
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
break;
default:
return false;
@@ -1915,16 +2110,14 @@ static bool sk_filter_is_valid_access(int off, int size,
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type)
{
- if (off == offsetof(struct __sk_buff, tc_classid))
- return type == BPF_WRITE ? true : false;
-
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, mark):
case offsetof(struct __sk_buff, tc_index):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
+ case offsetof(struct __sk_buff, tc_classid):
break;
default:
return false;
@@ -2041,8 +2234,24 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, tc_classid);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
- WARN_ON(type != BPF_WRITE);
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ break;
+
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ ctx_off -= offsetof(struct __sk_buff, data_end);
+ ctx_off += offsetof(struct sk_buff, cb);
+ ctx_off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
+ dst_reg, src_reg, ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2069,30 +2278,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
}
static const struct bpf_verifier_ops sk_filter_ops = {
- .get_func_proto = sk_filter_func_proto,
- .is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
- .get_func_proto = tc_cls_act_func_proto,
- .is_valid_access = tc_cls_act_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
- .ops = &sk_filter_ops,
- .type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .ops = &sk_filter_ops,
+ .type = BPF_PROG_TYPE_SOCKET_FILTER,
};
static struct bpf_prog_type_list sched_cls_type __read_mostly = {
- .ops = &tc_cls_act_ops,
- .type = BPF_PROG_TYPE_SCHED_CLS,
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_CLS,
};
static struct bpf_prog_type_list sched_act_type __read_mostly = {
- .ops = &tc_cls_act_ops,
- .type = BPF_PROG_TYPE_SCHED_ACT,
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_ACT,
};
static int __init register_sk_filter_ops(void)
@@ -2114,7 +2323,7 @@ int sk_detach_filter(struct sock *sk)
return -EPERM;
filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (filter) {
RCU_INIT_POINTER(sk->sk_filter, NULL);
sk_filter_uncharge(sk, filter);
@@ -2134,7 +2343,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (!filter)
goto out;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be40b..3937b1b68d5b 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work)
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
- list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+ list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
+ atomic_dec(&xfrm->flow_cache_gc_count);
+ WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
+ }
}
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
@@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
struct netns_xfrm *xfrm)
{
if (deleted) {
+ atomic_add(deleted, &xfrm->flow_cache_gc_count);
fcp->hash_count -= deleted;
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
@@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
+ if (fcp->hash_count > 2 * fc->high_watermark ||
+ atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
+ atomic_inc(&net->xfrm.flow_cache_genid);
+ flo = ERR_PTR(-ENOBUFS);
+ goto ret_object;
+ }
+
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
fle->net = net;
@@ -446,6 +457,7 @@ int flow_cache_init(struct net *net)
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
mutex_init(&net->xfrm.flow_flush_sem);
+ atomic_set(&net->xfrm.flow_cache_gc_count, 0);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
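
Taken together, the flow.c hunks bound the garbage-collection backlog: flow_cache_queue_garbage() counts entries queued for GC, the worker decrements the count as it frees them, and flow_cache_lookup() stops growing the cache (returning ERR_PTR(-ENOBUFS) and bumping the genid) once the per-cpu hash or the GC backlog passes the high watermark. A self-contained model of the throttle, with hypothetical names:

	static atomic_t gc_backlog = ATOMIC_INIT(0);

	static bool cache_may_grow(unsigned int hash_count,
				   unsigned int high_watermark)
	{
		/* refuse new entries when lookups outpace the GC worker */
		return hash_count <= 2 * high_watermark &&
		       atomic_read(&gc_backlog) <= high_watermark;
	}

	static void queue_for_gc(int deleted)
	{
		atomic_add(deleted, &gc_backlog);	/* producer side */
	}

	static void gc_free_one(void)
	{
		atomic_dec(&gc_backlog);		/* GC worker side */
		WARN_ON(atomic_read(&gc_backlog) < 0);
	}
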
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 12e700332010..a669dea146c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id)
-{
- return flow_dissector->used_keys & (1 << key_id);
-}
-
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
flow_dissector->used_keys |= (1 << key_id);
}
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id,
- void *target_container)
-{
- return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count)
@@ -178,15 +165,16 @@ ip:
ip_proto = iph->protocol;
- if (!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS))
- break;
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
- key_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
- key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ memcpy(&key_addrs->v4addrs, &iph->saddr,
+ sizeof(key_addrs->v4addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ }
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -219,13 +207,12 @@ ipv6:
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
- struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
-
- key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- target_container);
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
- memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+ memcpy(&key_addrs->v6addrs, &iph->saddr,
+ sizeof(key_addrs->v6addrs));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -339,8 +326,11 @@ mpls:
}
case htons(ETH_P_FCOE):
- key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
- /* fall through */
+ if ((hlen - nhoff) < FCOE_HEADER_LEN)
+ goto out_bad;
+
+ nhoff += FCOE_HEADER_LEN;
+ goto out_good;
default:
goto out_bad;
}
@@ -447,13 +437,12 @@ ip_proto_again:
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
nhoff += sizeof(_fh);
+ ip_proto = fh->nexthdr;
if (!(fh->frag_off & htons(IP6_OFFSET))) {
key_control->flags |= FLOW_DIS_FIRST_FRAG;
- if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
- ip_proto = fh->nexthdr;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
goto ip_proto_again;
- }
}
goto out_good;
}
@@ -740,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
{
u32 poff = keys->control.thoff;
+ /* skip L4 headers for fragments after the first */
+ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+ !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+ return poff;
+
switch (keys->basic.ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
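
dissector_uses_key() and skb_flow_dissector_target() are removed here because they move to <net/flow_dissector.h>, making them available to callers outside the core dissector. The consumer pattern they support looks roughly like this (hypothetical function, with FLOW_DISSECTOR_KEY_BASIC as the example key):

	static void fill_basic_key(struct flow_dissector *flow_dissector,
				   void *target_container, u8 ip_proto)
	{
		struct flow_dissector_key_basic *key_basic;

		if (!dissector_uses_key(flow_dissector,
					FLOW_DISSECTOR_KEY_BASIC))
			return;

		/* offset[] was recorded by skb_flow_dissector_init() */
		key_basic = skb_flow_dissector_target(flow_dissector,
						      FLOW_DISSECTOR_KEY_BASIC,
						      target_container);
		key_basic->ip_proto = ip_proto;
	}
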
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 92d886f4adcb..4573d81093fe 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -191,6 +191,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
/**
* gen_new_estimator - create a new rate estimator
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
@@ -287,6 +288,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
/**
* gen_replace_estimator - replace rate estimator configuration
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 1e2f46a69d50..f96ee8b9478d 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -25,9 +25,9 @@
static inline int
-gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
{
- if (nla_put(d->skb, type, size, buf))
+ if (nla_put_64bit(d->skb, type, size, buf, padattr))
goto nla_put_failure;
return 0;
@@ -59,7 +59,8 @@ nla_put_failure:
*/
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
- int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ int xstats_type, spinlock_t *lock,
+ struct gnet_dump *d, int padattr)
__acquires(lock)
{
memset(d, 0, sizeof(*d));
@@ -71,16 +72,17 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
+ d->padattr = padattr;
if (d->tail)
- return gnet_stats_copy(d, type, NULL, 0);
+ return gnet_stats_copy(d, type, NULL, 0, padattr);
return 0;
}
EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
- * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * gnet_stats_start_copy - start dumping procedure
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
@@ -94,9 +96,9 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat);
*/
int
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
- struct gnet_dump *d)
+ struct gnet_dump *d, int padattr)
{
- return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr);
}
EXPORT_SYMBOL(gnet_stats_start_copy);
@@ -140,6 +142,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
+ * @cpu: per-cpu basic statistics, if any
* @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
@@ -168,7 +171,8 @@ gnet_stats_copy_basic(struct gnet_dump *d,
memset(&sb, 0, sizeof(sb));
sb.bytes = bstats.bytes;
sb.packets = bstats.packets;
- return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
+ return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
+ TCA_STATS_PAD);
}
return 0;
}
@@ -207,11 +211,13 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
if (d->tail) {
- res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
+ TCA_STATS_PAD);
if (res < 0 || est.bps == r->bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
+ TCA_STATS_PAD);
}
return 0;
@@ -285,7 +291,8 @@ gnet_stats_copy_queue(struct gnet_dump *d,
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_QUEUE,
- &qstats, sizeof(qstats));
+ &qstats, sizeof(qstats),
+ TCA_STATS_PAD);
return 0;
}
@@ -315,7 +322,8 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len,
+ TCA_STATS_PAD);
return 0;
@@ -346,12 +354,12 @@ gnet_stats_finish_copy(struct gnet_dump *d)
if (d->compat_tc_stats)
if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
- sizeof(d->tc_stats)) < 0)
+ sizeof(d->tc_stats), d->padattr) < 0)
return -1;
if (d->compat_xstats && d->xstats) {
if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
- d->xstats_len) < 0)
+ d->xstats_len, d->padattr) < 0)
return -1;
}
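
Every gnet_stats_copy() caller now passes a pad attribute because a bare 8-byte netlink payload can land on a 4-byte boundary, which breaks 64-bit architectures that are sensitive to unaligned access; nla_put_64bit() emits a TCA_STATS_PAD filler attribute first whenever alignment requires it. A minimal sketch of one padded 64-bit counter, assuming a hypothetical MY_ATTR_BYTES64 attribute:

	static int put_bytes64(struct sk_buff *skb, u64 bytes)
	{
		/* the pad attribute is filler that receivers skip; it
		 * only exists to 8-byte-align the u64 that follows
		 */
		return nla_put_u64_64bit(skb, MY_ATTR_BYTES64, bytes,
					 TCA_STATS_PAD);
	}
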
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000000..941c28486896
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,90 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+ if (likely(bm_pool->frag_size <= PAGE_SIZE))
+ skb_free_frag(buf);
+ else
+ kfree(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+ int frag_size = bm_pool->frag_size;
+ void *buf;
+
+ if (likely(frag_size <= PAGE_SIZE))
+ buf = netdev_alloc_frag(frag_size);
+ else
+ buf = kmalloc(frag_size, gfp);
+
+ if (!buf)
+ return -ENOMEM;
+
+ if (bm_pool->construct)
+ if (bm_pool->construct(bm_pool, buf)) {
+ hwbm_buf_free(bm_pool, buf);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+ int err, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bm_pool->lock, flags);
+	if (bm_pool->buf_num == bm_pool->size) {
+		pr_warn("pool already filled\n");
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		return bm_pool->buf_num;
+	}
+
+	if (buf_num + bm_pool->buf_num > bm_pool->size) {
+		pr_warn("cannot allocate %d buffers for pool\n",
+			buf_num);
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		return 0;
+	}
+
+	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+			buf_num, bm_pool->buf_num);
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		return 0;
+	}
+
+ for (i = 0; i < buf_num; i++) {
+ err = hwbm_pool_refill(bm_pool, gfp);
+ if (err < 0)
+ break;
+ }
+
+ /* Update BM driver with number of buffers added to pool */
+ bm_pool->buf_num += i;
+
+	pr_debug("hwbm pool: %d of %d buffers added\n", i, buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
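
A rough picture of how a driver consumes this API - pool setup plus refill - assuming a hypothetical construct callback that maps the buffer and hands its DMA address to the hardware (my_dev and my_hw_give_buffer are placeholders):

	static int my_construct(struct hwbm_pool *pool, void *buf)
	{
		dma_addr_t dma = dma_map_single(my_dev, buf, pool->frag_size,
						DMA_FROM_DEVICE);

		if (dma_mapping_error(my_dev, dma))
			return -ENOMEM;	/* hwbm_pool_refill() frees buf */
		my_hw_give_buffer(dma);	/* push to the HW buffer manager */
		return 0;
	}

	static int my_pool_init(struct hwbm_pool *pool, int num)
	{
		spin_lock_init(&pool->lock);
		pool->frag_size = 2048;
		pool->size = num;
		pool->construct = my_construct;
		/* returns how many buffers actually made it into the pool */
		return hwbm_pool_add(pool, num, GFP_KERNEL);
	}
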
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 299cfc24d888..669ecc9f884e 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -27,6 +27,31 @@
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+	/* Only lwt encap types that are implemented without using an
+	 * interface for the encap need to return a string here.
+	 */
+ switch (encap_type) {
+ case LWTUNNEL_ENCAP_MPLS:
+ return "MPLS";
+ case LWTUNNEL_ENCAP_ILA:
+ return "ILA";
+ case LWTUNNEL_ENCAP_IP6:
+ case LWTUNNEL_ENCAP_IP:
+ case LWTUNNEL_ENCAP_NONE:
+ case __LWTUNNEL_ENCAP_MAX:
+ /* should not have got here */
+ WARN_ON(1);
+ break;
+ }
+ return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
{
struct lwtunnel_state *lws;
@@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+ if (encap_type_str) {
+ rcu_read_unlock();
+ request_module("rtnl-lwt-%s", encap_type_str);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ }
+ }
+#endif
if (likely(ops && ops->build_state))
ret = ops->build_state(dev, encap, family, cfg, lws);
rcu_read_unlock();
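
Note the unlock/relock around request_module(): it may sleep, so it must not run inside the RCU read-side critical section, and the ops pointer has to be re-fetched afterwards because the table can change while the lock is dropped. The skeleton of that pattern, excerpted from the hunk above:

	rcu_read_lock();
	ops = rcu_dereference(lwtun_encaps[encap_type]);
	if (!ops && encap_type_str) {
		rcu_read_unlock();	/* request_module() may sleep */
		request_module("rtnl-lwt-%s", encap_type_str);
		rcu_read_lock();
		/* re-read: the module may (or may not) have registered */
		ops = rcu_dereference(lwtun_encaps[encap_type]);
	}
	/* ops remains valid only until rcu_read_unlock() */
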
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f18ae91b652e..29dd8cc22bbf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1763,21 +1763,22 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NEIGH_VAR(parms, MCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_REPROBES,
NEIGH_VAR(parms, MCAST_REPROBES)) ||
- nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+ NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
- NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_GC_STALETIME,
- NEIGH_VAR(parms, GC_STALETIME)) ||
+ NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
- NEIGH_VAR(parms, DELAY_PROBE_TIME)) ||
+ NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_RETRANS_TIME,
- NEIGH_VAR(parms, RETRANS_TIME)) ||
+ NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
- NEIGH_VAR(parms, ANYCAST_DELAY)) ||
+ NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
- NEIGH_VAR(parms, PROXY_DELAY)) ||
+ NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
- NEIGH_VAR(parms, LOCKTIME)))
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -1804,7 +1805,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
- nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
@@ -1856,7 +1857,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndst.ndts_table_fulls += st->table_fulls;
}
- if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
+ if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
+ NDTA_PAD))
goto nla_put_failure;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 2bf83299600a..14d09345f00d 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- sd->cpu_collision, sd->received_rps, flow_limit_count);
+ 0, /* was cpu_collision */
+ sd->received_rps, flow_limit_count);
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b6c8a6629b39..2b3f76fe65f4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,7 +29,6 @@
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
-static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
@@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd))
+ ret = sprintf(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
const char *duplex;
- switch (cmd.duplex) {
+
+ switch (cmd.base.duplex) {
case DUPLEX_HALF:
duplex = "half";
break;
@@ -574,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
static struct attribute *netstat_attrs[] = {
&dev_attr_rx_packets.attr,
@@ -599,6 +602,7 @@ static struct attribute *netstat_attrs[] = {
&dev_attr_tx_window_errors.attr,
&dev_attr_rx_compressed.attr,
&dev_attr_tx_compressed.attr,
+ &dev_attr_rx_nohandler.attr,
NULL
};
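
For reference, the legacy ethtool_cmd kept the speed as a split u16/u16 pair that ethtool_cmd_speed() had to reassemble; struct ethtool_link_ksettings stores a plain u32 in base.speed. A sketch of the new query under rtnl, mirroring what speed_show() now does (the helper name is hypothetical):

	static int my_get_speed(struct net_device *netdev, u32 *speed)
	{
		struct ethtool_link_ksettings cmd;
		int err;

		ASSERT_RTNL();	/* same locking speed_show() relies on */
		err = __ethtool_get_link_ksettings(netdev, &cmd);
		if (err)
			return err;
		*speed = cmd.base.speed;	/* may be SPEED_UNKNOWN */
		return 0;
	}
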
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 0260c84ed83c..11fce17274f6 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -9,7 +9,6 @@
* Authors: Thomas Graf <tgraf@suug.ch>
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index f1efbc39ef6b..2ec86fc552df 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1474cfd2dc1c..8604ae245960 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IP);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
@@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
@@ -3472,7 +3472,6 @@ xmit_more:
pkt_dev->odevname, ret);
pkt_dev->errors++;
/* fallthru */
- case NETDEV_TX_LOCKED:
case NETDEV_TX_BUSY:
/* Retry it next time */
atomic_dec(&(pkt_dev->skb->users));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8261d95dd846..d69c4644f8f2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,11 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
a->rx_compressed = b->rx_compressed;
a->tx_compressed = b->tx_compressed;
-}
-static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
-{
- memcpy(v, b, sizeof(*b));
+ a->rx_nohandler = b->rx_nohandler;
}
/* All VF info */
@@ -828,17 +825,17 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
/* IFLA_VF_STATS_RX_PACKETS */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_PACKETS */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_RX_BYTES */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_BYTES */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_BROADCAST */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_MULTICAST */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
@@ -879,9 +876,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
- + nla_total_size(sizeof(struct rtnl_link_ifmap))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
+ nla_total_size(sizeof(struct rtnl_link_stats))
- + nla_total_size(sizeof(struct rtnl_link_stats64))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
@@ -893,6 +890,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -905,6 +904,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ nla_total_size(1); /* IFLA_PROTO_DOWN */
}
@@ -1049,25 +1049,23 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
struct net_device *dev)
{
- const struct rtnl_link_stats64 *stats;
- struct rtnl_link_stats64 temp;
+ struct rtnl_link_stats64 *sp;
struct nlattr *attr;
- stats = dev_get_stats(dev, &temp);
-
- attr = nla_reserve(skb, IFLA_STATS,
- sizeof(struct rtnl_link_stats));
+ attr = nla_reserve_64bit(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64), IFLA_PAD);
if (!attr)
return -EMSGSIZE;
- copy_rtnl_link_stats(nla_data(attr), stats);
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
- attr = nla_reserve(skb, IFLA_STATS64,
- sizeof(struct rtnl_link_stats64));
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
if (!attr)
return -EMSGSIZE;
- copy_rtnl_link_stats64(nla_data(attr), stats);
+ copy_rtnl_link_stats(nla_data(attr), sp);
return 0;
}
@@ -1155,18 +1153,18 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
}
- if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
- vf_stats.rx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
- vf_stats.tx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
- vf_stats.rx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
- vf_stats.tx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
- vf_stats.broadcast) ||
- nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast))
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD))
return -EMSGSIZE;
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
@@ -1175,15 +1173,17 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
{
- struct rtnl_link_ifmap map = {
- .mem_start = dev->mem_start,
- .mem_end = dev->mem_end,
- .base_addr = dev->base_addr,
- .irq = dev->irq,
- .dma = dev->dma,
- .port = dev->if_port,
- };
- if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ struct rtnl_link_ifmap map;
+
+ memset(&map, 0, sizeof(map));
+ map.mem_start = dev->mem_start;
+ map.mem_end = dev->mem_end;
+ map.base_addr = dev->base_addr;
+ map.irq = dev->irq;
+ map.dma = dev->dma;
+ map.port = dev->if_port;
+
+ if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
return -EMSGSIZE;
return 0;
@@ -1221,6 +1221,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
#endif
@@ -1387,15 +1389,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
[IFLA_VF_STATS] = { .type = NLA_NESTED },
[IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
-};
-
-static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
- [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 },
- [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 },
+ [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+ [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1412,6 +1407,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+{
+ const struct rtnl_link_ops *ops = NULL;
+ struct nlattr *linfo[IFLA_INFO_MAX + 1];
+
+ if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0)
+ return NULL;
+
+ if (linfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
+ nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ }
+
+ return ops;
+}
+
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool link_kind_filtered(const struct net_device *dev,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (kind_ops && dev->rtnl_link_ops != kind_ops)
+ return true;
+
+ return false;
+}
+
+static bool link_dump_filtered(struct net_device *dev,
+ int master_idx,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (link_master_filtered(dev, master_idx) ||
+ link_kind_filtered(dev, kind_ops))
+ return true;
+
+ return false;
+}
+
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -1421,6 +1468,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct hlist_head *head;
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
+ const struct rtnl_link_ops *kind_ops = NULL;
+ unsigned int flags = NLM_F_MULTI;
+ int master_idx = 0;
int err;
int hdrlen;
@@ -1443,18 +1493,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ if (tb[IFLA_MASTER])
+ master_idx = nla_get_u32(tb[IFLA_MASTER]);
+
+ if (tb[IFLA_LINKINFO])
+ kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
}
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI,
+ flags,
ext_filter_mask);
/* If we ran out of room on the first message,
* we're in trouble
@@ -1534,6 +1595,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return 0;
}
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+ int guid_type)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+}
+
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+ if (dev->type != ARPHRD_INFINIBAND)
+ return -EOPNOTSUPP;
+
+ return handle_infiniband_guid(dev, ivt, guid_type);
+}
+
static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1636,6 +1713,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
return err;
}
+ if (tb[IFLA_VF_IB_NODE_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+ }
+
+ if (tb[IFLA_VF_IB_PORT_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+ }
+
return err;
}
@@ -3351,6 +3446,202 @@ out:
return err;
}
+static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
+{
+ return (mask & IFLA_STATS_FILTER_BIT(attrid)) &&
+ (!idxattr || idxattr == attrid);
+}
+
+static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, unsigned int filter_mask,
+ int *idxattr, int *prividx)
+{
+ struct if_stats_msg *ifsm;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ int s_prividx = *prividx;
+
+ ASSERT_RTNL();
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifsm = nlmsg_data(nlh);
+ ifsm->ifindex = dev->ifindex;
+ ifsm->filter_mask = filter_mask;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
+ struct rtnl_link_stats64 *sp;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
+ sizeof(struct rtnl_link_stats64),
+ IFLA_STATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+
+ if (ops && ops->fill_linkxstats) {
+ int err;
+
+ *idxattr = IFLA_STATS_LINK_XSTATS;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+nla_put_failure:
+	/* not a multi message or no progress means a real error */
+ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
+ nlmsg_cancel(skb, nlh);
+ else
+ nlmsg_end(skb, nlh);
+
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
+};
+
+static size_t if_nlmsg_stats_size(const struct net_device *dev,
+ u32 filter_mask)
+{
+ size_t size = 0;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
+ size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+
+ if (ops && ops->get_linkxstats_size) {
+ size += nla_total_size(ops->get_linkxstats_size(dev));
+ /* for IFLA_STATS_LINK_XSTATS */
+ size += nla_total_size(0);
+ }
+ }
+
+ return size;
+}
+
+static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ int idxattr = 0, prividx = 0;
+ struct if_stats_msg *ifsm;
+ struct sk_buff *nskb;
+ u32 filter_mask;
+ int err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ if (!nskb)
+ return -ENOBUFS;
+
+ err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
+ NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ 0, filter_mask, &idxattr, &prividx);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else {
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+ }
+
+ return err;
+}
+
+static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct if_stats_msg *ifsm;
+ struct hlist_head *head;
+ struct net_device *dev;
+ u32 filter_mask = 0;
+ int idx = 0;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+ s_idxattr = cb->args[2];
+ s_prividx = cb->args[3];
+
+ cb->seq = net->dev_base_seq;
+
+ ifsm = nlmsg_data(cb->nlh);
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, filter_mask,
+ &s_idxattr, &s_prividx);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ goto out;
+ s_prividx = 0;
+ s_idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ cb->args[3] = s_prividx;
+ cb->args[2] = s_idxattr;
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
/* Process one rtnetlink message. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3500,4 +3791,7 @@ void __init rtnetlink_init(void)
rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
+ NULL);
}
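
The new RTM_GETSTATS call takes a struct if_stats_msg header whose filter_mask selects attribute groups via IFLA_STATS_FILTER_BIT(). A hedged userspace sketch of a single-device query for the 64-bit link stats (error handling trimmed; a zero filter_mask gets -EINVAL, as rtnl_stats_get() above shows):

	#include <linux/if_link.h>
	#include <linux/rtnetlink.h>
	#include <string.h>
	#include <sys/socket.h>

	struct stats_req {
		struct nlmsghdr nlh;
		struct if_stats_msg ifsm;
	};

	static int request_link_stats64(int fd, int ifindex)
	{
		struct stats_req req;

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifsm));
		req.nlh.nlmsg_type = RTM_GETSTATS;
		req.nlh.nlmsg_flags = NLM_F_REQUEST;
		req.ifsm.ifindex = ifindex;
		req.ifsm.filter_mask = IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_64);
		return send(fd, &req, req.nlh.nlmsg_len, 0);
	}
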
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8616d1147c93..f2b77e549c03 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(build_skb);
+#define NAPI_SKB_CACHE_SIZE 64
+
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ size_t skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(nc, fragsz, gfp_mask);
+ return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
void *data;
@@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = __alloc_page_frag(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->pfmemalloc)
+ if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -749,6 +757,73 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+void __kfree_skb_flush(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	/* flush skb_cache if it contains objects */
+ if (nc->skb_count) {
+ kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+
+static inline void _kfree_skb_defer(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* drop skb->head and call any destructors for packet */
+ skb_release_all(skb);
+
+ /* record skb to CPU local list */
+ nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+ /* SLUB writes into objects when freeing */
+ prefetchw(skb);
+#endif
+
+ /* flush skb_cache if it is filled */
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+ _kfree_skb_defer(skb);
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ if (unlikely(!skb))
+ return;
+
+	/* A zero budget indicates a non-NAPI context called us, like netpoll */
+ if (unlikely(!budget)) {
+ dev_consume_skb_any(skb);
+ return;
+ }
+
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+	/* if we get here, the skb is ready to be freed */
+	trace_consume_skb(skb);
+
+	/* skb clones can't go through the deferred cache; free directly */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ _kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
@@ -1843,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
+ struct sk_buff *iter;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -1869,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return true;
}
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
return false;
}
@@ -1895,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk,
/*
* Map data from the skb to a pipe. Should handle both the linear part,
- * the fragments, and the frag list. It does NOT handle frag lists within
- * the frag list, if such a thing exists. We'd probably need to recurse to
- * handle that cleanly.
+ * the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
@@ -1916,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
.ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
- struct sk_buff *frag_iter;
int ret = 0;
- /*
- * __skb_splice_bits() only fails if the output has no room left,
- * so no point in going over the frag_list for the error case.
- */
- if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
- goto done;
- else if (!tlen)
- goto done;
-
- /*
- * now see if we have a frag_list to map
- */
- skb_walk_frags(skb, frag_iter) {
- if (!tlen)
- break;
- if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
- break;
- }
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
-done:
if (spd.nr_pages)
ret = splice_cb(sk, pipe, &spd);
@@ -3008,11 +3076,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int partial_segs = 0;
unsigned int headroom;
- unsigned int len;
+ unsigned int len = head_skb->len;
__be16 proto;
- bool csum;
- int sg = !!(features & NETIF_F_SG);
+ bool csum, sg;
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
@@ -3024,8 +3092,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
- csum = !head_skb->encap_hdr_csum &&
- !!can_checksum_protocol(features, proto);
+ sg = !!(features & NETIF_F_SG);
+ csum = !!can_checksum_protocol(features, proto);
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
+ if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ partial_segs = len / mss;
+ if (partial_segs > 1)
+ mss *= partial_segs;
+ else
+ partial_segs = 0;
+ }
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3118,13 +3198,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (nskb->len == len + doffset)
goto perform_csum_check;
- if (!sg && !nskb->remcsum_offload) {
- nskb->ip_summed = CHECKSUM_NONE;
- nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
+ if (!sg) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
continue;
}
@@ -3190,12 +3272,19 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
perform_csum_check:
- if (!csum && !nskb->remcsum_offload) {
- nskb->csum = skb_checksum(nskb, doffset,
- nskb->len - doffset, 0);
- nskb->ip_summed = CHECKSUM_NONE;
+ if (!csum) {
+ if (skb_has_shared_frag(nskb)) {
+ err = __skb_linearize(nskb);
+ if (err)
+ goto err;
+ }
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
@@ -3205,6 +3294,23 @@ perform_csum_check:
*/
segs->prev = tail;
+ /* Update GSO info on first skb in partial sequence. */
+ if (partial_segs) {
+ int type = skb_shinfo(head_skb)->gso_type;
+
+ /* Update type to add partial and then remove dodgy if set */
+ type |= SKB_GSO_PARTIAL;
+ type &= ~SKB_GSO_DODGY;
+
+ /* Update GSO info and prepare to start updating headers on
+ * our way back down the stack of protocols.
+ */
+ skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+ skb_shinfo(segs)->gso_segs = partial_segs;
+ skb_shinfo(segs)->gso_type = type;
+ SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ }
+
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
	 * Idea is to transfer ownership from head_skb to last segment.
@@ -4237,7 +4343,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- skb_sender_cpu_clear(skb);
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
@@ -4427,15 +4532,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
__skb_push(skb, offset);
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
- if (err)
+ if (err) {
+ __skb_pull(skb, offset);
return err;
+ }
+
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
- __skb_pull(skb, offset);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(skb->data
- + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+ __skb_pull(skb, offset);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
@@ -4519,3 +4625,247 @@ failure:
return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+ const int headlen, gfp_t gfp_mask)
+{
+ int i;
+ int size = skb_end_offset(skb);
+ int new_hlen = headlen - off;
+ u8 *data;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ /* Copy real data, and all frags */
+ skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+ skb->len -= off;
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_cloned(skb)) {
+ /* drop the old head gracefully */
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb);
+ } else {
+		/* we can reuse the existing refcount - all we did was
+		 * relocate values
+ */
+ skb_free_head(skb);
+ }
+
+ skb->head = data;
+ skb->data = data;
+ skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_set_tail_pointer(skb, skb_headlen(skb));
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+ struct skb_shared_info *shinfo, int eat,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *list = shinfo->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ if (!list) {
+ pr_err("Not enough bytes to eat. Want %d\n", eat);
+ return -EFAULT;
+ }
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+ if (skb_shared(list)) {
+ clone = skb_clone(list, gfp_mask);
+ if (!clone)
+ return -ENOMEM;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without problems. */
+ insp = list;
+ }
+ if (pskb_carve(list, eat, gfp_mask) < 0) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = shinfo->frag_list) != insp) {
+ shinfo->frag_list = list->next;
+ kfree_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ shinfo->frag_list = clone;
+ }
+ return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+ int pos, gfp_t gfp_mask)
+{
+ int i, k = 0;
+ int size = skb_end_offset(skb);
+ u8 *data;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+ struct skb_shared_info *shinfo;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb), offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ shinfo = (struct skb_shared_info *)(data + size);
+ for (i = 0; i < nfrags; i++) {
+ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + fsize > off) {
+ shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < off) {
+ /* Split frag.
+ * We have two variants in this case:
+				 * 1. Move the whole frag to the second
+				 *    part, if possible. E.g. this
+				 *    approach is mandatory for TUX,
+				 *    where splitting is expensive.
+				 * 2. Split accurately. This is what we do.
+ */
+ shinfo->frags[0].page_offset += off - pos;
+ skb_frag_size_sub(&shinfo->frags[0], off - pos);
+ }
+ skb_frag_ref(skb, i);
+ k++;
+ }
+ pos += fsize;
+ }
+ shinfo->nr_frags = k;
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+	if (k == 0) {
+		/* split line is in frag list */
+		if (pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+			/* skb_frag_unref() is not needed here as
+			 * shinfo->nr_frags = 0.
+			 */
+			if (skb_has_frag_list(skb))
+				kfree_skb_list(skb_shinfo(skb)->frag_list);
+			kfree(data);
+			return -ENOMEM;
+		}
+	}
+ skb_release_data(skb);
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_reset_tail_pointer(skb);
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ skb->len -= off;
+ skb->data_len = skb->len;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+ int headlen = skb_headlen(skb);
+
+ if (len < headlen)
+ return pskb_carve_inside_header(skb, len, headlen, gfp);
+ else
+ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+ int to_copy, gfp_t gfp)
+{
+ struct sk_buff *clone = skb_clone(skb, gfp);
+
+ if (!clone)
+ return NULL;
+
+ if (pskb_carve(clone, off, gfp) < 0 ||
+ pskb_trim(clone, to_copy)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
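
Of the skbuff.c additions, the napi_consume_skb()/__kfree_skb_flush() pair is the one drivers interact with directly: TX-completed skbs are parked in the per-cpu cache and bulk-freed via kmem_cache_free_bulk(). Roughly how a driver would use it (my_priv and my_next_done_skb are placeholders):

	/* hypothetical TX-completion handler called from a driver's poll() */
	static void my_clean_tx(struct my_priv *priv, int budget)
	{
		struct sk_buff *skb;

		while ((skb = my_next_done_skb(priv)) != NULL) {
			/* budget == 0 means netpoll context, and
			 * napi_consume_skb() then falls back to
			 * dev_consume_skb_any() on its own
			 */
			napi_consume_skb(skb, budget);
		}
		/* no explicit flush needed here: net_rx_action() calls
		 * __kfree_skb_flush() once per softirq round
		 */
	}
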
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93412..08bf97eceeb3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -221,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
+ "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -237,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
+ "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -253,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
+ "clock-AF_MAX"
};
/*
@@ -402,9 +405,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
}
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int err;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
@@ -414,10 +416,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return -ENOMEM;
}
- err = sk_filter(sk, skb);
- if (err)
- return err;
-
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
@@ -440,6 +438,18 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_data_ready(sk);
return 0;
}
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+
+ return __sock_queue_rcv_skb(sk, skb);
+}
EXPORT_SYMBOL(sock_queue_rcv_skb);
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
@@ -832,7 +842,8 @@ set_rcvbuf:
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
- if (sk->sk_state != TCP_ESTABLISHED) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN)) {
ret = -EINVAL;
break;
}
@@ -987,6 +998,10 @@ set_rcvbuf:
sk->sk_incoming_cpu = val;
break;
+ case SO_CNX_ADVICE:
+ if (val == 1)
+ dst_negative_advice(sk);
+ break;
default:
ret = -ENOPROTOOPT;
break;
@@ -1414,8 +1429,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
}
EXPORT_SYMBOL(sk_alloc);
-void sk_destruct(struct sock *sk)
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
{
+ struct sock *sk = container_of(head, struct sock, sk_rcu);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -1444,6 +1463,14 @@ void sk_destruct(struct sock *sk)
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_destruct(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_RCU_FREE))
+ call_rcu(&sk->sk_rcu, __sk_destruct);
+ else
+ __sk_destruct(&sk->sk_rcu);
+}
+
static void __sk_free(struct sock *sk)
{
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
@@ -1508,6 +1535,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_dst_cache = NULL;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
+ atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
@@ -1531,6 +1559,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk = NULL;
goto out;
}
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
newsk->sk_err = 0;
newsk->sk_priority = 0;
@@ -1626,6 +1655,17 @@ void sock_wfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_wfree);
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
+ */
+void __sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
@@ -1648,8 +1688,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL(skb_set_owner_w);
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example). So we set skb->truesize to a small
+ * amount (1) and decrease sk_wmem_alloc accordingly.
+ */
void skb_orphan_partial(struct sk_buff *skb)
{
+	/* If this skb is a TCP pure ACK or has already been through here,
+	 * there is nothing to do: 2 is already a very small truesize.
+ */
+ if (skb->truesize <= 2)
+ return;
+
/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
* accounted bytes but one, to avoid unexpected reorders.
@@ -1861,27 +1914,51 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+ struct sockcm_cookie *sockc)
+{
+ u32 tsflags;
+
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SO_TIMESTAMPING:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+
+ tsflags = *(u32 *)CMSG_DATA(cmsg);
+ if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+ return -EINVAL;
+
+ sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+ sockc->tsflags |= tsflags;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
+
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
struct sockcm_cookie *sockc)
{
struct cmsghdr *cmsg;
+ int ret;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- switch (cmsg->cmsg_type) {
- case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
- return -EINVAL;
- sockc->mark = *(u32 *)CMSG_DATA(cmsg);
- break;
- default:
- return -EINVAL;
- }
+ ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ if (ret)
+ return ret;
}
return 0;
}
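From userspace, these per-call controls arrive as SOL_SOCKET ancillary data on sendmsg(); a minimal sketch for SO_MARK (requires CAP_NET_ADMIN, error handling elided):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one datagram with a per-packet mark carried as a cmsg, parsed
 * by the __sock_cmsg_send() path above.
 */
static ssize_t send_with_mark(int fd, const void *buf, size_t len,
			      uint32_t mark)
{
	char control[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type  = SO_MARK;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(uint32_t));
	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));

	return sendmsg(fd, &msg, 0);
}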
@@ -1903,7 +1980,7 @@ EXPORT_SYMBOL(sock_cmsg_send);
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
if (pfrag->page) {
- if (atomic_read(&pfrag->page->_count) == 1) {
+ if (page_ref_count(pfrag->page) == 1) {
pfrag->offset = 0;
return true;
}
@@ -1966,33 +2043,27 @@ static void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
- struct sk_buff *skb = sk->sk_backlog.head;
+	struct sk_buff *skb, *next;

-	do {
+ while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
- bh_unlock_sock(sk);
- do {
- struct sk_buff *next = skb->next;
+ spin_unlock_bh(&sk->sk_lock.slock);
+ do {
+ next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
- /*
- * We are in process context here with softirqs
- * disabled, use cond_resched_softirq() to preempt.
- * This is safe to do because we've taken the backlog
- * queue private:
- */
- cond_resched_softirq();
+ cond_resched();
skb = next;
} while (skb != NULL);
- bh_lock_sock(sk);
- } while ((skb = sk->sk_backlog.head) != NULL);
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
/*
 * Doing the zeroing here guarantees we cannot loop forever
@@ -2001,6 +2072,13 @@ static void __release_sock(struct sock *sk)
sk->sk_backlog.len = 0;
}
+void __sk_flush_backlog(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ __release_sock(sk);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
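This mirrors how a long-running send path can now flush the backlog from process context; a rough sketch of the intended call site, assuming the sk_flush_backlog() inline wrapper that tests sk->sk_backlog.tail before calling __sk_flush_backlog():

/* Rough sketch: a sendmsg() copy loop periodically processes packets
 * that softirq handlers queued to the backlog while the socket was
 * owned by this process, instead of letting them pile up until
 * release_sock().
 */
while (msg_data_left(msg)) {
	if (unlikely(sk->sk_backlog.tail))
		sk_flush_backlog(sk);

	/* ... copy user data into skbs and transmit ... */
}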
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
@@ -2137,6 +2215,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
}
EXPORT_SYMBOL(__sk_mem_reclaim);
+int sk_set_peek_off(struct sock *sk, int val)
+{
+ if (val < 0)
+ return -EINVAL;
+
+ sk->sk_peek_off = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
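Protocols that wire up this helper as their .set_peek_off let userspace drive it through the SO_PEEK_OFF socket option; a minimal sketch:

#include <sys/socket.h>

/* Enable peek-with-offset: after this, each recv(..., MSG_PEEK)
 * advances the peek offset instead of re-reading the queue head.
 */
static int enable_peek_offset(int fd)
{
	int off = 0;	/* start peeking at the head of the queue */

	return setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
}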
/*
* Set of default routines for initialising struct proto_ops when
@@ -2424,11 +2511,6 @@ EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
- /*
- * The sk_lock has mutex_unlock() semantics:
- */
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a996ce8c8fb2..6b10573cc9fa 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -67,6 +67,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
return nla_put(skb, attrtype, sizeof(mem), &mem);
}
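On the userspace side, the SK_MEMINFO attribute payload is a flat array of __u32 counters, so a diag consumer picks up the new counter by index; a sketch, assuming an already-parsed attribute:

#include <linux/sock_diag.h>

/* Sketch: given the payload of an SK_MEMINFO attribute, report the
 * newly exported drop counter, tolerating older kernels whose array
 * is shorter.
 */
static __u32 meminfo_drops(const __u32 *mem, unsigned int len)
{
	if (len < (SK_MEMINFO_DROPS + 1) * sizeof(__u32))
		return 0;	/* kernel predates SK_MEMINFO_DROPS */
	return mem[SK_MEMINFO_DROPS];
}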
@@ -119,7 +120,7 @@ static size_t sock_diag_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
- + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+ + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a6beb7b6ae55..0df2aa652530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+# ifdef CONFIG_HAVE_EBPF_JIT
+ {
+ .procname = "bpf_jit_harden",
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec,
+ },
+# endif
#endif
{
.procname = "netdev_tstamp_prequeue",
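Since the knob is mode 0600, only root can toggle it; a small sketch that enables hardening by writing the new sysctl file:

#include <fcntl.h>
#include <unistd.h>

/* Sketch: enable eBPF JIT hardening (value 1). Requires root and a
 * kernel built with CONFIG_HAVE_EBPF_JIT.
 */
static int set_bpf_jit_harden(void)
{
	int fd = open("/proc/sys/net/core/bpf_jit_harden", O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) == 1)
		ret = 0;
	close(fd);
	return ret;
}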