Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile                2
-rw-r--r--  net/core/datagram.c            101
-rw-r--r--  net/core/dev.c                1433
-rw-r--r--  net/core/devlink.c             122
-rw-r--r--  net/core/drop_monitor.c         60
-rw-r--r--  net/core/dst.c                   1
-rw-r--r--  net/core/ethtool.c             150
-rw-r--r--  net/core/fib_rules.c            78
-rw-r--r--  net/core/filter.c              562
-rw-r--r--  net/core/flow.c                 60
-rw-r--r--  net/core/flow_dissector.c       95
-rw-r--r--  net/core/gen_estimator.c       296
-rw-r--r--  net/core/gen_stats.c            20
-rw-r--r--  net/core/gro_cells.c            92
-rw-r--r--  net/core/lwt_bpf.c             397
-rw-r--r--  net/core/lwtunnel.c             85
-rw-r--r--  net/core/neighbour.c            22
-rw-r--r--  net/core/net-sysfs.c            66
-rw-r--r--  net/core/net_namespace.c        79
-rw-r--r--  net/core/netclassid_cgroup.c    34
-rw-r--r--  net/core/netpoll.c              16
-rw-r--r--  net/core/netprio_cgroup.c        3
-rw-r--r--  net/core/pktgen.c                6
-rw-r--r--  net/core/request_sock.c          2
-rw-r--r--  net/core/rtnetlink.c           106
-rw-r--r--  net/core/scm.c                   5
-rw-r--r--  net/core/secure_seq.c          177
-rw-r--r--  net/core/skbuff.c              172
-rw-r--r--  net/core/sock.c                239
-rw-r--r--  net/core/stream.c               29
-rw-r--r--  net/core/sysctl_net_core.c      49
-rw-r--r--  net/core/utils.c                 2
32 files changed, 3055 insertions, 1506 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f8d5d3..f4947e737f34 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -36,7 +36,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -165,6 +165,7 @@ done:
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
+ * @destructor: invoked under the receive lock on successful dequeue
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
* the standard around please.
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err,
struct sk_buff **last)
{
@@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
+ *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
*last = skb;
- *peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
-
- skb = skb_set_peeked(skb);
- error = PTR_ERR(skb);
- if (IS_ERR(skb)) {
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ error = PTR_ERR(skb);
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
}
-
+ *peeked = 1;
atomic_inc(&skb->users);
- } else
+ } else {
__skb_unlink(skb, queue);
-
+ if (destructor)
+ destructor(sk, skb);
+ }
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
@@ -262,6 +268,8 @@ no_packet:
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
@@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+ off, err, &last);
if (skb)
return skb;
@@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, err);
+ NULL, &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -323,6 +331,31 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ if (destructor)
+ destructor(sk, skb);
+ err = 0;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ atomic_inc(&sk->sk_drops);
+ return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
@@ -346,23 +379,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- int err = 0;
-
- if (flags & MSG_PEEK) {
- err = -ENOENT;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
- err = 0;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
+ int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
kfree_skb(skb);
- atomic_inc(&sk->sk_drops);
sk_mem_reclaim_partial(sk);
-
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
@@ -378,7 +398,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len)
{
int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, copy = start - offset, start_off = offset, n;
struct sk_buff *frag_iter;
trace_skb_copy_datagram_iovec(skb, len);
@@ -387,11 +407,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (copy_to_iter(skb->data + offset, copy, to) != copy)
+ n = copy_to_iter(skb->data + offset, copy, to);
+ offset += n;
+ if (n != copy)
goto short_copy;
if ((len -= copy) == 0)
return 0;
- offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -405,13 +426,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (copy_page_to_iter(skb_frag_page(frag),
+ n = copy_page_to_iter(skb_frag_page(frag),
frag->page_offset + offset -
- start, copy, to) != copy)
+ start, copy, to);
+ offset += n;
+ if (n != copy)
goto short_copy;
if (!(len -= copy))
return 0;
- offset += copy;
}
start = end;
}
@@ -443,6 +465,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
*/
fault:
+ iov_iter_revert(to, offset - start_off);
return -EFAULT;
short_copy:
@@ -593,7 +616,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
__wsum *csump)
{
int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, copy = start - offset, start_off = offset;
struct sk_buff *frag_iter;
int pos = 0;
int n;
@@ -603,11 +626,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ offset += n;
if (n != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
- offset += copy;
pos = copy;
}
@@ -629,12 +652,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
offset - start, copy,
&csum2, to);
kunmap(page);
+ offset += n;
if (n != copy)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
- offset += copy;
pos += copy;
}
start = end;
@@ -667,6 +690,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
return 0;
fault:
+ iov_iter_revert(to, offset - start_off);
return -EFAULT;
}
@@ -751,6 +775,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
}
return 0;
csum_error:
+ iov_iter_revert(&msg->msg_iter, chunk);
return -EINVAL;
fault:
return -EFAULT;
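
A minimal usage sketch for the new datagram hooks added above: the destructor argument of __skb_recv_datagram() is invoked under the receive-queue lock when an skb is dequeued, and __sk_queue_drop_skb() drops a peeked skb through the same hook. All foo_* names are hypothetical and the accounting shown is an assumption for illustration, not part of this patch.

    #include <linux/kernel.h>
    #include <linux/skbuff.h>
    #include <net/sock.h>

    /* Hypothetical destructor: called with sk_receive_queue.lock held when
     * the skb is unlinked, so per-packet accounting needs no extra locking.
     * Assumes the protocol releases its rmem accounting here rather than
     * in skb->destructor.
     */
    static void foo_skb_destructor(struct sock *sk, struct sk_buff *skb)
    {
            sk_mem_uncharge(sk, skb->truesize);
    }

    static int foo_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                           int noblock, int flags, int *addr_len)
    {
            int peeked, off = 0, err, copied;
            struct sk_buff *skb;

            skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
                                      foo_skb_destructor, &peeked, &off, &err);
            if (!skb)
                    return err;

            copied = min_t(int, skb->len - off, len);
            err = skb_copy_datagram_msg(skb, off, msg, copied);
            if (err) {
                    /* A peeked skb is still queued: unlink it via the hook. */
                    __sk_queue_drop_skb(sk, skb, flags, foo_skb_destructor);
                    kfree_skb(skb);
                    return err;
            }

            skb_free_datagram(sk, skb);
            return copied;
    }
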
diff --git a/net/core/dev.c b/net/core/dev.c
index 6666b28b6815..533a6d6f6092 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
/*
- * NET3 Protocol independent device support routines.
+ * NET3 Protocol independent device support routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Derived from the non IP parts of dev.c 1.0.19
- * Authors: Ross Biro
+ * Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
*
@@ -21,9 +21,9 @@
*
* Changes:
* D.J. Barrow : Fixed bug where dev->refcnt gets set
- * to 2 if register_netdev gets called
- * before net_dev_init & also removed a
- * few lines of code in the process.
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
* Alan Cox : device private ioctl copies fields back.
* Alan Cox : Transmit queue code does relevant
* stunts to keep the queue safe.
@@ -36,7 +36,7 @@
* Alan Cox : 100 backlog just doesn't cut it when
* you start doing multicast video 8)
* Alan Cox : Rewrote net_bh and list manager.
- * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
* Alan Cox : Took out transmit every packet pass
* Saved a few bytes in the ioctl handler
* Alan Cox : Network driver sets packet type before
@@ -46,7 +46,7 @@
* Richard Kooijman: Timestamp fixes.
* Alan Cox : Wrong field in SIOCGIFDSTADDR
* Alan Cox : Device lock protection.
- * Alan Cox : Fixed nasty side effect of device close
+ * Alan Cox : Fixed nasty side effect of device close
* changes.
* Rudi Cilibrasi : Pass the right thing to
* set_mac_address()
@@ -67,12 +67,12 @@
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait
- * indefinitely on dev->refcnt
- * J Hadi Salim : - Backlog queue sampling
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
* - netif_rx() feedback
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -139,7 +139,6 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
-#include <linux/sctp.h>
#include <linux/crash_dump.h>
#include "net-sysfs.h"
@@ -193,7 +192,8 @@ static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
- while (++net->dev_base_seq == 0);
+ while (++net->dev_base_seq == 0)
+ ;
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -275,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
* according to dev->type
*/
-static const unsigned short netdev_lock_type[] =
- {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -292,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
-static const char *const netdev_lock_name[] =
- {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
- "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
- "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
- "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
- "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
- "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
- "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
- "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
- "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
- "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
- "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
- "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
- "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
- "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
- "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -353,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
#endif
/*******************************************************************************
+ *
+ * Protocol management and registration routines
+ *
+ *******************************************************************************/
- Protocol management and registration routines
-
-*******************************************************************************/
/*
* Add a protocol ID to the list. Now that the input handler is
@@ -539,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
-
- Device Boot-time Settings Routines
-
-*******************************************************************************/
+ *
+ * Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -575,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
}
/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
+ * netdev_boot_setup_check - check boot time settings
+ * @dev: the netdevice
*
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
*/
int netdev_boot_setup_check(struct net_device *dev)
{
@@ -591,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev)
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
!strcmp(dev->name, s[i].name)) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
+ dev->irq = s[i].map.irq;
+ dev->base_addr = s[i].map.base_addr;
+ dev->mem_start = s[i].map.mem_start;
+ dev->mem_end = s[i].map.mem_end;
return 1;
}
}
@@ -604,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
+ * netdev_boot_base - get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
*
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
*/
unsigned long netdev_boot_base(const char *prefix, int unit)
{
@@ -664,10 +665,10 @@ int __init netdev_boot_setup(char *str)
__setup("netdev=", netdev_boot_setup);
/*******************************************************************************
-
- Device Interface Subroutines
-
-*******************************************************************************/
+ *
+ * Device Interface Subroutines
+ *
+ *******************************************************************************/
/**
* dev_get_iflink - get 'iflink' value of a interface
@@ -738,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
EXPORT_SYMBOL(__dev_get_by_name);
/**
- * dev_get_by_name_rcu - find a device by its name
- * @net: the applicable net namespace
- * @name: name to find
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
*
- * Find an interface by name.
- * If the name is found a pointer to the device is returned.
- * If the name is not found then %NULL is returned.
- * The reference counters are not incremented so the caller must be
- * careful with locks. The caller must hold RCU lock.
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1290,8 +1291,8 @@ void netdev_state_change(struct net_device *dev)
EXPORT_SYMBOL(netdev_state_change);
/**
- * netdev_notify_peers - notify network peers about existence of @dev
- * @dev: network device
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
*
* Generate traffic such that interested network peers are aware of
* @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1303,6 +1304,7 @@ void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
@@ -1519,17 +1521,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
static int dev_boot_phase = 1;
/**
- * register_netdevice_notifier - register a network notifier block
- * @nb: notifier
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
*
- * Register a notifier to be called when network device events occur.
- * The notifier passed is linked into the kernel structures and must
- * not be reused until it has been unregistered. A negative errno code
- * is returned on a failure.
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
*
- * When registered all registration and up events are replayed
- * to the new notifier to allow device to have a race free
- * view of the network device list.
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
@@ -1586,17 +1588,17 @@ outroll:
EXPORT_SYMBOL(register_netdevice_notifier);
/**
- * unregister_netdevice_notifier - unregister a network notifier block
- * @nb: notifier
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
*
- * Unregister a notifier previously registered by
- * register_netdevice_notifier(). The notifier is unlinked into the
- * kernel structures and may then be reused. A negative errno code
- * is returned on a failure.
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
*
- * After unregistering unregister and down device events are synthesized
- * for all devices on the device list to the removed notifier to remove
- * the need for special case cleanup code.
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -1696,50 +1698,72 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call static_key_slow_dec() from irq context
- * If net_disable_timestamp() is called from irq context, defer the
- * static_key_slow_dec() calls.
- */
static atomic_t netstamp_needed_deferred;
+static atomic_t netstamp_wanted;
+static void netstamp_clear(struct work_struct *work)
+{
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
+
+ wanted = atomic_add_return(deferred, &netstamp_wanted);
+ if (wanted > 0)
+ static_key_enable(&netstamp_needed);
+ else
+ static_key_disable(&netstamp_needed);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
- int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
- if (deferred) {
- while (--deferred)
- static_key_slow_dec(&netstamp_needed);
- return;
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 0)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
+ return;
}
-#endif
+ atomic_inc(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
static_key_slow_inc(&netstamp_needed);
+#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
- if (in_interrupt()) {
- atomic_inc(&netstamp_needed_deferred);
- return;
+ int wanted;
+
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 1)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
+ return;
}
-#endif
+ atomic_dec(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
static_key_slow_dec(&netstamp_needed);
+#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
if (static_key_false(&netstamp_needed))
__net_timestamp(skb);
}
#define net_timestamp_check(COND, SKB) \
if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp.tv64) \
+ if ((COND) && !(SKB)->tstamp) \
__net_timestamp(SKB); \
} \
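
With the net_enable_timestamp()/net_disable_timestamp() rework above, the functions keep a "wanted" count and, under HAVE_JUMP_LABEL, defer the actual static-key flip to netstamp_work; the key update no longer happens in the caller's context, so callers only need to keep the pair balanced. A hedged sketch of the calling convention, with hypothetical foo_* names:

    /* Hypothetical feature that needs RX timestamps while it is active.
     * Callers just bump or drop the wanted count; the static key itself
     * is toggled later by netstamp_work in process context.
     */
    static void foo_feature_start(void)
    {
            net_enable_timestamp();
    }

    static void foo_feature_stop(void)
    {
            net_disable_timestamp();
    }
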
@@ -1944,37 +1968,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
}
}
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ return -1;
+ }
+
+ return 0;
+}
+
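
netdev_txq_to_tc() above resolves a tx queue index back to its traffic class by walking the dev->tc_to_txq offset/count table, returning -1 if no class covers the queue. A small hypothetical helper built on it; the layout in the comment is an assumption for illustration:

    #include <linux/netdevice.h>

    /* Hypothetical layout: dev->num_tc = 2, tc_to_txq[0] = {.offset = 0,
     * .count = 4} and tc_to_txq[1] = {.offset = 4, .count = 4}; queue 5
     * then resolves to class 1, queue 9 to -1.
     */
    static u8 foo_txq_class(struct net_device *dev, unsigned int txq)
    {
            int tc = netdev_txq_to_tc(dev, txq);

            return tc < 0 ? 0 : tc; /* fall back to class 0 if unmapped */
    }
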
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
-static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
- int cpu, u16 index)
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ return false;
- for (pos = 0; map && pos < map->len; pos++) {
- if (map->queues[pos] == index) {
- if (map->len > 1) {
- map->queues[pos] = map->queues[--map->len];
- } else {
- RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
break;
}
+
+ RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
}
- return map;
+ return true;
}
-static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev->num_tc ? : 1;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, cpu, j))
+ break;
+ }
+
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
{
struct xps_dev_maps *dev_maps;
int cpu, i;
@@ -1986,21 +2053,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu) {
- for (i = index; i < dev->num_tx_queues; i++) {
- if (!remove_xps_queue(dev_maps, cpu, i))
- break;
- }
- if (i == dev->num_tx_queues)
- active = true;
- }
+ for_each_possible_cpu(cpu)
+ active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
+ offset, count);
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
- for (i = index; i < dev->num_tx_queues; i++)
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
NUMA_NO_NODE);
@@ -2008,6 +2070,11 @@ out_no_maps:
mutex_unlock(&xps_map_mutex);
}
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
static struct xps_map *expand_xps_map(struct xps_map *map,
int cpu, u16 index)
{
@@ -2047,20 +2114,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ int i, cpu, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
- int cpu, numa_node_id = -2;
bool active = false;
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
/* allocate memory for queue storage */
- for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mask))
- continue;
-
+ for_each_cpu_and(cpu, cpu_online_mask, mask) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2068,25 +2143,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ tci = cpu * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
NULL;
map = expand_xps_map(map, cpu, index);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
for_each_possible_cpu(cpu) {
+ /* copy maps belonging to foreign traffic classes */
+ for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
+
+ /* We need to explicitly update tci as prevous loop
+ * could break out early if dev_maps is NULL.
+ */
+ tci = cpu * num_tc + tc;
+
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
/* add queue to CPU maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(new_dev_maps->cpu_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2100,26 +2188,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
+ /* copy maps belonging to foreign traffic classes */
+ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
}
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
/* Cleanup old maps */
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
-
- kfree_rcu(dev_maps, rcu);
}
+ kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
@@ -2134,11 +2232,12 @@ out_no_new_maps:
/* removes queue from unused CPUs */
for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
- continue;
-
- if (remove_xps_queue(dev_maps, cpu, index))
- active = true;
+ for (i = tc, tci = cpu * num_tc; i--; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ active |= remove_xps_queue(dev_maps, tci, index);
+ for (i = num_tc - tc, tci++; --i; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
}
/* free map if not active */
@@ -2154,11 +2253,14 @@ out_no_maps:
error:
/* remove any maps that we added */
for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
- NULL;
- if (new_map && new_map != map)
- kfree(new_map);
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = dev_maps ?
+ xmap_dereference(dev_maps->cpu_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
}
mutex_unlock(&xps_map_mutex);
@@ -2169,6 +2271,44 @@ error:
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
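
netdev_reset_tc(), netdev_set_tc_queue() and netdev_set_num_tc() give drivers one place to program the tc_to_txq table (dropping stale XPS maps first). A sketch of a hypothetical driver splitting eight tx queues into two classes; foo_setup_tc() is illustrative only:

    #include <linux/netdevice.h>

    /* Hypothetical: 8 tx queues, class 0 -> queues 0-3, class 1 -> queues 4-7. */
    static int foo_setup_tc(struct net_device *dev)
    {
            int err;

            err = netdev_set_num_tc(dev, 2);
            if (err)
                    return err;

            err = netdev_set_tc_queue(dev, 0, 4, 0);         /* tc 0: count 4, offset 0 */
            if (!err)
                    err = netdev_set_tc_queue(dev, 1, 4, 4); /* tc 1: count 4, offset 4 */
            if (err)
                    netdev_reset_tc(dev);                    /* undo on failure */

            return err;
    }
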
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2293,28 +2433,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
}
EXPORT_SYMBOL(netif_schedule_queue);
-/**
- * netif_wake_subqueue - allow sending packets on subqueue
- * @dev: network device
- * @queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
- struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-
- if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
- struct Qdisc *q;
-
- rcu_read_lock();
- q = rcu_dereference(txq->qdisc);
- __netif_schedule(q);
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL(netif_wake_subqueue);
-
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
@@ -2408,6 +2526,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
qoffset = dev->tc_to_txq[tc].offset;
qcount = dev->tc_to_txq[tc].count;
}
@@ -2487,141 +2606,6 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/* skb_csum_offload_check - Driver helper function to determine if a device
- * with limited checksum offload capabilities is able to offload the checksum
- * for a given packet.
- *
- * Arguments:
- * skb - sk_buff for the packet in question
- * spec - contains the description of what device can offload
- * csum_encapped - returns true if the checksum being offloaded is
- * encpasulated. That is it is checksum for the transport header
- * in the inner headers.
- * checksum_help - when set indicates that helper function should
- * call skb_checksum_help if offload checks fail
- *
- * Returns:
- * true: Packet has passed the checksum checks and should be offloadable to
- * the device (a driver may still need to check for additional
- * restrictions of its device)
- * false: Checksum is not offloadable. If checksum_help was set then
- * skb_checksum_help was called to resolve checksum for non-GSO
- * packets and when IP protocol is not SCTP
- */
-bool __skb_csum_offload_chk(struct sk_buff *skb,
- const struct skb_csum_offl_spec *spec,
- bool *csum_encapped,
- bool csum_help)
-{
- struct iphdr *iph;
- struct ipv6hdr *ipv6;
- void *nhdr;
- int protocol;
- u8 ip_proto;
-
- if (skb->protocol == htons(ETH_P_8021Q) ||
- skb->protocol == htons(ETH_P_8021AD)) {
- if (!spec->vlan_okay)
- goto need_help;
- }
-
- /* We check whether the checksum refers to a transport layer checksum in
- * the outermost header or an encapsulated transport layer checksum that
- * corresponds to the inner headers of the skb. If the checksum is for
- * something else in the packet we need help.
- */
- if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
- /* Non-encapsulated checksum */
- protocol = eproto_to_ipproto(vlan_get_protocol(skb));
- nhdr = skb_network_header(skb);
- *csum_encapped = false;
- if (spec->no_not_encapped)
- goto need_help;
- } else if (skb->encapsulation && spec->encap_okay &&
- skb_checksum_start_offset(skb) ==
- skb_inner_transport_offset(skb)) {
- /* Encapsulated checksum */
- *csum_encapped = true;
- switch (skb->inner_protocol_type) {
- case ENCAP_TYPE_ETHER:
- protocol = eproto_to_ipproto(skb->inner_protocol);
- break;
- case ENCAP_TYPE_IPPROTO:
- protocol = skb->inner_protocol;
- break;
- }
- nhdr = skb_inner_network_header(skb);
- } else {
- goto need_help;
- }
-
- switch (protocol) {
- case IPPROTO_IP:
- if (!spec->ipv4_okay)
- goto need_help;
- iph = nhdr;
- ip_proto = iph->protocol;
- if (iph->ihl != 5 && !spec->ip_options_okay)
- goto need_help;
- break;
- case IPPROTO_IPV6:
- if (!spec->ipv6_okay)
- goto need_help;
- if (spec->no_encapped_ipv6 && *csum_encapped)
- goto need_help;
- ipv6 = nhdr;
- nhdr += sizeof(*ipv6);
- ip_proto = ipv6->nexthdr;
- break;
- default:
- goto need_help;
- }
-
-ip_proto_again:
- switch (ip_proto) {
- case IPPROTO_TCP:
- if (!spec->tcp_okay ||
- skb->csum_offset != offsetof(struct tcphdr, check))
- goto need_help;
- break;
- case IPPROTO_UDP:
- if (!spec->udp_okay ||
- skb->csum_offset != offsetof(struct udphdr, check))
- goto need_help;
- break;
- case IPPROTO_SCTP:
- if (!spec->sctp_okay ||
- skb->csum_offset != offsetof(struct sctphdr, checksum))
- goto cant_help;
- break;
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_DEST: {
- u8 *opthdr = nhdr;
-
- if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
- goto need_help;
-
- ip_proto = opthdr[0];
- nhdr += (opthdr[1] + 1) << 3;
-
- goto ip_proto_again;
- }
- default:
- goto need_help;
- }
-
- /* Passed the tests for offloading checksum */
- return true;
-
-need_help:
- if (csum_help && !skb_shinfo(skb)->gso_size)
- skb_checksum_help(skb);
-cant_help:
- return false;
-}
-EXPORT_SYMBOL(__skb_csum_offload_chk);
-
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2679,9 +2663,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL;
- else
- return skb->ip_summed == CHECKSUM_NONE;
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_NONE;
+
+ return skb->ip_summed == CHECKSUM_NONE;
}
/**
@@ -2700,11 +2685,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
{
+ struct sk_buff *segs;
+
if (unlikely(skb_needs_check(skb, tx_path))) {
int err;
- skb_warn_bad_offload(skb);
-
+ /* We're going to init ->check field in TCP or UDP header */
err = skb_cow_head(skb, 0);
if (err < 0)
return ERR_PTR(err);
@@ -2732,7 +2718,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
- return skb_mac_gso_segment(skb, features);
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (unlikely(skb_needs_check(skb, tx_path)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);
@@ -2757,9 +2748,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
+
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
if (PageHighMem(skb_frag_page(frag)))
return 1;
}
@@ -2773,6 +2766,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
@@ -2815,9 +2809,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
- } else if (illegal_highdma(skb->dev, skb)) {
- features &= ~NETIF_F_SG;
}
+ if (illegal_highdma(skb->dev, skb))
+ features &= ~NETIF_F_SG;
return features;
}
@@ -3173,9 +3167,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
if (!cl)
return skb;
- /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
- * earlier by the caller.
- */
+ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3216,8 +3208,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
+ unsigned int tci = skb->sender_cpu - 1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
@@ -3244,6 +3242,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int new_index = get_xps_queue(dev, skb);
+
if (new_index < 0)
new_index = skb_tx_hash(dev, skb);
@@ -3273,6 +3272,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
if (dev->real_num_tx_queues != 1) {
const struct net_device_ops *ops = dev->netdev_ops;
+
if (ops->ndo_select_queue)
queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
__netdev_pick_tx);
@@ -3334,7 +3334,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+ skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
if (static_key_false(&egress_needed)) {
skb = sch_handle_egress(skb, &rc, dev);
@@ -3361,16 +3361,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
}
/* The device has no queue. Common case for software devices:
- loopback, all the sorts of tunnels...
+ * loopback, all the sorts of tunnels...
- Really, it is unlikely that netif_tx_lock protection is necessary
- here. (f.e. loopback and IP tunnels are clean ignoring statistics
- counters.)
- However, it is possible, that they rely on protection
- made by us here.
+ * Really, it is unlikely that netif_tx_lock protection is necessary
+ * here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ * counters.)
+ * However, it is possible, that they rely on protection
+ * made by us here.
- Check this and shot the lock. It is not prone from deadlocks.
- Either shot noqueue qdisc, it is even simpler 8)
+ * Check this and shot the lock. It is not prone from deadlocks.
+ *Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3432,16 +3432,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
EXPORT_SYMBOL(dev_queue_xmit_accel);
-/*=======================================================================
- Receiver routines
- =======================================================================*/
+/*************************************************************************
+ * Receiver routines
+ *************************************************************************/
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64; /* old backlog weight */
+int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -3461,6 +3465,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3796,6 +3802,7 @@ static int netif_rx_internal(struct sk_buff *skb)
#endif
{
unsigned int qtail;
+
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
@@ -3855,6 +3862,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
while (clist) {
struct sk_buff *skb = clist;
+
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
@@ -3928,7 +3936,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ skb->tc_at_ingress = 1;
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3993,9 +4001,7 @@ int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
- ASSERT_RTNL();
-
- if (dev->rx_handler)
+ if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
/* Note: rx_handler_data must be set before rx_handler */
@@ -4101,12 +4107,8 @@ another_round:
goto out;
}
-#ifdef CONFIG_NET_CLS_ACT
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- goto ncls;
- }
-#endif
+ if (skb_skip_tc_classify(skb))
+ goto skip_classify;
if (pfmemalloc)
goto skip_taps;
@@ -4134,10 +4136,8 @@ skip_taps:
goto out;
}
#endif
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = 0;
-ncls:
-#endif
+ skb_reset_tc(skb);
+skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -4453,7 +4453,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
}
}
@@ -4491,7 +4493,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
+ if (skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
@@ -4504,7 +4506,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->recursion_counter = 0;
@@ -4536,6 +4538,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (&ptype->list == head)
goto normal;
+ if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
@@ -4631,6 +4638,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
skb_dst_drop(skb);
+ secpath_reset(skb);
kmem_cache_free(skbuff_head_cache, skb);
} else {
__kfree_skb(skb);
@@ -4639,6 +4647,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
case GRO_HELD:
case GRO_MERGED:
+ case GRO_CONSUMED:
break;
}
@@ -4671,6 +4680,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ secpath_reset(skb);
napi->skb = skb;
}
@@ -4709,6 +4719,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
break;
case GRO_MERGED:
+ case GRO_CONSUMED:
break;
}
@@ -4845,7 +4856,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = weight_p;
+ napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
@@ -4901,6 +4912,39 @@ void __napi_schedule(struct napi_struct *n)
EXPORT_SYMBOL(__napi_schedule);
/**
+ * napi_schedule_prep - check if napi can be scheduled
+ * @n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running. This is used as a condition variable
+ * insure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ do {
+ val = READ_ONCE(n->state);
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ return false;
+ new = val | NAPIF_STATE_SCHED;
+
+ /* Sets STATE_MISSED bit if STATE_SCHED was already set
+ * This was suggested by Alexander Duyck, as compiler
+ * emits better code than :
+ * if (val & NAPIF_STATE_SCHED)
+ * new |= NAPIF_STATE_MISSED;
+ */
+ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+ NAPIF_STATE_MISSED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
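
napi_schedule_prep() is now out of line and records a missed round in NAPIF_STATE_MISSED when SCHED is already owned, so napi_complete_done() can re-run the poll. The usual hard-irq driver pattern, sketched with hypothetical foo_* names:

    #include <linux/interrupt.h>
    #include <linux/netdevice.h>

    struct foo_priv {
            struct napi_struct napi;
            /* device-specific state omitted */
    };

    static irqreturn_t foo_isr(int irq, void *data)
    {
            struct foo_priv *priv = data;

            /* Hypothetical: device interrupts are assumed masked here; if
             * SCHED was already held, MISSED is recorded instead and the
             * current owner will poll once more.
             */
            if (napi_schedule_prep(&priv->napi))
                    __napi_schedule(&priv->napi);

            return IRQ_HANDLED;
    }
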
+/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
@@ -4912,26 +4956,19 @@ void __napi_schedule_irqoff(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
-void __napi_complete(struct napi_struct *n)
-{
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-
- list_del_init(&n->poll_list);
- smp_mb__before_atomic();
- clear_bit(NAPI_STATE_SCHED, &n->state);
-}
-EXPORT_SYMBOL(__napi_complete);
-
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags;
+ unsigned long flags, val, new;
/*
- * don't let napi dequeue from the cpu poll list
- * just in case its running on a different cpu
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
- if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
- return;
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
if (n->gro_list) {
unsigned long timeout = 0;
@@ -4945,14 +4982,34 @@ void napi_complete_done(struct napi_struct *n, int work_done)
else
napi_gro_flush(n, false);
}
- if (likely(list_empty(&n->poll_list))) {
- WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
- } else {
+ if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
- __napi_complete(n);
+ list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+
+ do {
+ val = READ_ONCE(n->state);
+
+ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+ /* If STATE_MISSED was set, leave STATE_SCHED set,
+ * because we will call napi->poll() one more time.
+ * This C code was suggested by Alexander Duyck to help gcc.
+ */
+ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+ NAPIF_STATE_SCHED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ if (unlikely(val & NAPIF_STATE_MISSED)) {
+ __napi_schedule(n);
+ return false;
+ }
+
+ return true;
}
EXPORT_SYMBOL(napi_complete_done);
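
napi_complete_done() now returns false when the poll must not be treated as finished (busy polling owns the context, or a missed schedule forces another round). Continuing the foo_priv sketch from the interrupt-handler example above, a hedged poll routine; foo_clean_rx() and foo_enable_irq() are hypothetical device helpers:

    static int foo_clean_rx(struct foo_priv *priv, int budget)
    {
            return 0;       /* hypothetical: process up to @budget RX packets */
    }

    static void foo_enable_irq(struct foo_priv *priv)
    {
            /* hypothetical: re-arm the device's RX interrupt */
    }

    static int foo_poll(struct napi_struct *napi, int budget)
    {
            struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
            int work_done = foo_clean_rx(priv, budget);

            /* Only re-arm device interrupts if NAPI really completed. */
            if (work_done < budget && napi_complete_done(napi, work_done))
                    foo_enable_irq(priv);

            return work_done;
    }
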
@@ -4970,13 +5027,50 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
+
#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ /* Busy polling means there is a high chance device driver hard irq
+ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+ * set in napi_schedule_prep().
+ * Since we are about to call napi->poll() once more, we can safely
+ * clear NAPI_STATE_MISSED.
+ *
+ * Note: x86 could use a single "lock and ..." instruction
+ * to perform these two clear_bit()
+ */
+ clear_bit(NAPI_STATE_MISSED, &napi->state);
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+ if (local_softirq_pending())
+ do_softirq();
+}
+
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
- int (*busy_poll)(struct napi_struct *dev);
+ int (*napi_poll)(struct napi_struct *napi, int budget);
+ void *have_poll_lock = NULL;
struct napi_struct *napi;
- int rc = false;
+ int rc;
+
+restart:
+ rc = false;
+ napi_poll = NULL;
rcu_read_lock();
@@ -4984,39 +5078,54 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (!napi)
goto out;
- /* Note: ndo_busy_poll method is optional in linux-4.5 */
- busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
-
- do {
+ preempt_disable();
+ for (;;) {
rc = 0;
local_bh_disable();
- if (busy_poll) {
- rc = busy_poll(napi);
- } else if (napi_schedule_prep(napi)) {
- void *have = netpoll_poll_lock(napi);
-
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
- if (rc == BUSY_POLL_BUDGET) {
- napi_complete_done(napi, rc);
- napi_schedule(napi);
- }
- }
- netpoll_poll_unlock(have);
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
}
+ rc = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
- if (rc == LL_FLUSH_FAILED)
- break; /* permanent failure */
+ if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ busy_loop_timeout(end_time))
+ break;
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+ if (rc || busy_loop_timeout(end_time))
+ return rc;
+ goto restart;
+ }
cpu_relax();
- } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
- !need_resched() && !busy_loop_timeout(end_time));
-
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
@@ -5026,7 +5135,7 @@ EXPORT_SYMBOL(sk_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5046,7 +5155,6 @@ void napi_hash_add(struct napi_struct *napi)
spin_unlock(&napi_hash_lock);
}
-EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
@@ -5071,8 +5179,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
struct napi_struct *napi;
napi = container_of(timer, struct napi_struct, timer);
- if (napi->gro_list)
- napi_schedule(napi);
+
+ /* Note : we use a relaxed variant of napi_schedule_prep() not setting
+ * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+ */
+ if (napi->gro_list && !napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ __napi_schedule_irqoff(napi);
return HRTIMER_NORESTART;
}
@@ -5094,7 +5207,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
- spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
@@ -5212,7 +5324,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- return;
+ goto out;
break;
}
@@ -5230,7 +5342,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
}
}
- __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5240,6 +5351,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -5270,6 +5383,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+ struct net_device *dev = data;
+
+ return upper_dev == dev;
+}
+
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
@@ -5284,11 +5404,30 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+ return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to the specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold the RCU read lock.
+ */
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
@@ -5299,7 +5438,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
- return !list_empty(&dev->all_adj_list.upper);
+ return !list_empty(&dev->adj_list.upper);
}
/**
@@ -5326,6 +5465,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
@@ -5362,16 +5515,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *upper;
@@ -5379,14 +5524,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- if (&upper->list == &dev->all_adj_list.upper)
+ if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
-EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev_rcu(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+ /* first is the upper device itself */
+ ret = fn(udev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its upper devices */
+ ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
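A minimal usage sketch of the new walker, assuming a caller that wants to know whether any device above it is a bridge; __upper_is_bridge() and mydrv_has_bridge_upper() are illustrative names, and netif_is_bridge_master() is the existing netdevice.h helper:

/* Callback for netdev_walk_all_upper_dev_rcu(); a non-zero return value
 * stops the walk and is propagated back to the caller.
 */
static int __upper_is_bridge(struct net_device *upper_dev, void *data)
{
        bool *found = data;

        if (netif_is_bridge_master(upper_dev)) {
                *found = true;
                return 1;
        }
        return 0;
}

static bool mydrv_has_bridge_upper(struct net_device *dev)
{
        bool found = false;

        rcu_read_lock();
        netdev_walk_all_upper_dev_rcu(dev, __upper_is_bridge, &found);
        rcu_read_unlock();

        return found;
}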
/**
* netdev_lower_get_next_private - Get the next ->private from the
@@ -5469,55 +5641,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
}
EXPORT_SYMBOL(netdev_lower_get_next);
-/**
- * netdev_all_lower_get_next - Get the next device from all lower neighbour list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RTNL lock or
- * its own locking that guarantees that the neighbour all lower
- * list will remain unchanged.
- */
-struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry(*iter, struct netdev_adjacent, list);
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = lower->list.next;
+ *iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next);
-/**
- * netdev_all_lower_get_next_rcu - Get the next device from all
- * lower neighbour list, RCU variant
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
- struct list_head **iter)
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
-
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
@@ -5564,6 +5771,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
+
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", adj_dev->name);
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5574,6 +5782,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
+
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", name);
sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5590,7 +5799,6 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
- u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
{
@@ -5600,7 +5808,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr += ref_nr;
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
return 0;
}
@@ -5610,12 +5821,12 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->dev = adj_dev;
adj->master = master;
- adj->ref_nr = ref_nr;
+ adj->ref_nr = 1;
adj->private = private;
dev_hold(adj_dev);
- pr_debug("dev_hold for %s, because of link added from %s to %s\n",
- adj_dev->name, dev->name, adj_dev->name);
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5654,17 +5865,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
{
struct netdev_adjacent *adj;
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
- pr_err("tried to remove device %s from %s\n",
+ pr_err("Adjacency does not exist for device %s from %s\n",
dev->name, adj_dev->name);
- BUG();
+ WARN_ON(1);
+ return;
}
if (adj->ref_nr > ref_nr) {
- pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
- ref_nr, adj->ref_nr-ref_nr);
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
adj->ref_nr -= ref_nr;
return;
}
@@ -5676,7 +5892,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
- pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
@@ -5684,38 +5900,27 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
- u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
return 0;
}
-static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev,
- u16 ref_nr)
-{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower,
- NULL, false);
-}
-
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr,
@@ -5726,40 +5931,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
-static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev,
- u16 ref_nr)
-{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower);
-}
-
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
-
- if (ret)
- return ret;
-
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
- &dev->adj_list.upper,
- &upper_dev->adj_list.lower,
- private, master);
- if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
- return ret;
- }
-
- return 0;
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
@@ -5770,7 +5954,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
ASSERT_RTNL();
@@ -5779,10 +5962,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
+ if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
- if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
+ if (netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -5804,80 +5987,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- /* Now that we linked these devs, make all the upper_dev's
- * all_adj_list.upper visible to every dev's all_adj_list.lower an
- * versa, and don't forget the devices itself. All of these
- * links are non-neighbours.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- pr_debug("Interlinking %s with %s, non-neighbour\n",
- i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
- if (ret)
- goto rollback_mesh;
- }
- }
-
- /* add dev to every upper_dev's upper device */
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- pr_debug("linking %s's upper device %s with %s\n",
- upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
- if (ret)
- goto rollback_upper_mesh;
- }
-
- /* add upper_dev to every dev's lower device */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- pr_debug("linking %s's lower device %s with %s\n", dev->name,
- i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
- if (ret)
- goto rollback_lower_mesh;
- }
-
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
- goto rollback_lower_mesh;
+ goto rollback;
return 0;
-rollback_lower_mesh:
- to_i = i;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
- }
-
- i = NULL;
-
-rollback_upper_mesh:
- to_i = i;
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
- }
-
- i = j = NULL;
-
-rollback_mesh:
- to_i = i;
- to_j = j;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i && j == to_j)
- break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
- }
- if (i == to_i)
- break;
- }
-
+rollback:
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
@@ -5934,7 +6052,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j;
+
ASSERT_RTNL();
changeupper_info.upper_dev = upper_dev;
@@ -5946,23 +6064,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
- /* Here is the tricky part. We must remove all dev's lower
- * devices from all upper_dev's upper devices and vice
- * versa, to maintain the graph relationship.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
-
- /* remove also the devices itself from lower/upper device
- * list
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
-
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
-
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
@@ -6118,50 +6219,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
}
EXPORT_SYMBOL(netdev_lower_state_changed);
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
- struct neighbour *n)
-{
- struct net_device *lower_dev, *stop_dev;
- struct list_head *iter;
- int err;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (!lower_dev->netdev_ops->ndo_neigh_construct)
- continue;
- err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
- if (err) {
- stop_dev = lower_dev;
- goto rollback;
- }
- }
- return 0;
-
-rollback:
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (lower_dev == stop_dev)
- break;
- if (!lower_dev->netdev_ops->ndo_neigh_destroy)
- continue;
- lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
- }
- return err;
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
-
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
- struct neighbour *n)
-{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (!lower_dev->netdev_ops->ndo_neigh_destroy)
- continue;
- lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
- }
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
-
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6414,8 +6471,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
- is important. Some (broken) drivers set IFF_PROMISC, when
- IFF_ALLMULTI is requested not asking us and not reporting.
+ * is important. Some (broken) drivers set IFF_PROMISC, when
+ * IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6500,9 +6557,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive. */
- if (new_mtu < 0)
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
+ dev->name, new_mtu, dev->min_mtu);
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
+ dev->name, new_mtu, dev->max_mtu);
return -EINVAL;
+ }
if (!netif_device_present(dev))
return -ENODEV;
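A short sketch of the driver-side counterpart, assuming a driver that advertises its supported range through the net_device min_mtu/max_mtu fields checked above; mydrv_setup() and the 9000-byte limit are example values only:

static void mydrv_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->min_mtu = 68;      /* historic IPv4 minimum */
        dev->max_mtu = 9000;    /* hardware jumbo-frame limit */
}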
@@ -6649,33 +6715,48 @@ EXPORT_SYMBOL(dev_change_proto_down);
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp = {};
+ struct netdev_xdp xdp;
int err;
+ ASSERT_RTNL();
+
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
+ if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0)
+ return err;
+ if (xdp.prog_attached)
+ return -EBUSY;
+ }
+
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
+
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
return err;
}
-EXPORT_SYMBOL(dev_change_xdp_fd);
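A sketch of the ndo_xdp contract that dev_change_xdp_fd() drives, assuming a hypothetical driver; mydrv_xdp_setup(), struct mydrv_priv and its xdp_prog field are illustrative. The XDP_QUERY_PROG branch is what makes the XDP_FLAGS_UPDATE_IF_NOEXIST check above possible:

static int mydrv_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
        struct mydrv_priv *priv = netdev_priv(dev);

        switch (xdp->command) {
        case XDP_SETUP_PROG:
                /* Install (or, with a NULL prog, remove) the XDP program. */
                return mydrv_xdp_setup(priv, xdp->prog);
        case XDP_QUERY_PROG:
                xdp->prog_attached = !!priv->xdp_prog;
                return 0;
        default:
                return -EINVAL;
        }
}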
/**
* dev_new_index - allocate an ifindex
@@ -6688,6 +6769,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
static int dev_new_index(struct net *net)
{
int ifindex = net->ifindex;
+
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
@@ -6754,8 +6836,8 @@ static void rollback_registered_many(struct list_head *head)
/* Notify protocols, that we are about to destroy
- this device. They should clean all the things.
- */
+ * this device. They should clean all the things.
+ */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
if (!dev->rtnl_link_ops ||
@@ -6777,6 +6859,7 @@ static void rollback_registered_many(struct list_head *head)
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
@@ -6912,13 +6995,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~dev->gso_partial_features;
}
-#ifdef CONFIG_NET_RX_BUSY_POLL
- if (dev->netdev_ops->ndo_busy_poll)
- features |= NETIF_F_BUSY_POLL;
- else
-#endif
- features &= ~NETIF_F_BUSY_POLL;
-
return features;
}
@@ -7107,6 +7183,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
netif_tx_stop_queue(txq);
}
}
@@ -7581,17 +7658,17 @@ void netdev_freemem(struct net_device *dev)
}
/**
- * alloc_netdev_mqs - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @name_assign_type: origin of device name
- * @setup: callback to initialize device
- * @txqs: the number of TX subqueues to allocate
- * @rxqs: the number of RX subqueues to allocate
- *
- * Allocates a struct net_device with private data area for driver use
- * and performs basic initialization. Also allocates subqueue structs
- * for each queue on the device.
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
@@ -7655,8 +7732,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
- INIT_LIST_HEAD(&dev->all_adj_list.upper);
- INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
@@ -7667,7 +7742,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
- dev->tx_queue_len = 1;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
@@ -7705,13 +7780,13 @@ free_dev:
EXPORT_SYMBOL(alloc_netdev_mqs);
/**
- * free_netdev - free network device
- * @dev: device
+ * free_netdev - free network device
+ * @dev: device
*
- * This function does the last stage of destroying an allocated device
- * interface. The reference to the device object is released.
- * If this is the last reference then it will be freed.
- * Must be called in process context.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed. Must be called in process
+ * context.
*/
void free_netdev(struct net_device *dev)
{
@@ -7893,12 +7968,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
- this device. They should clean all the things.
-
- Note that dev->reg_state stays at NETREG_REGISTERED.
- This is wanted because this way 8021q and macvlan know
- the device is just moving and can keep their slaves up.
- */
+ * this device. They should clean all the things.
+ *
+ * Note that dev->reg_state stays at NETREG_REGISTERED.
+ * This is wanted because this way 8021q and macvlan know
+ * the device is just moving and can keep their slaves up.
+ */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
@@ -7948,18 +8023,13 @@ out:
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
-static int dev_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *ocpu)
+static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
- unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ unsigned int cpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
@@ -8009,10 +8079,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
input_queue_head_incr(oldsd);
}
- return NOTIFY_OK;
+ return 0;
}
-
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
@@ -8346,7 +8415,9 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
- hotcpu_notifier(dev_cpu_callback, 0);
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
out:
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5063088f1a..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
}
-static struct genl_family devlink_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = DEVLINK_GENL_NAME,
- .version = DEVLINK_GENL_VERSION,
- .maxattr = DEVLINK_ATTR_MAX,
- .netnsok = true,
- .pre_doit = devlink_nl_pre_doit,
- .post_doit = devlink_nl_post_doit,
-};
+static struct genl_family devlink_nl_family;
enum devlink_multicast_groups {
DEVLINK_MCGRP_CONFIG,
@@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink,
if (devlink->ops && devlink->ops->port_type_set) {
if (port_type == DEVLINK_PORT_TYPE_NOTSET)
return -EINVAL;
+ if (port_type == devlink_port->type)
+ return 0;
err = devlink->ops->port_type_set(devlink_port, port_type);
if (err)
return err;
@@ -1398,52 +1392,68 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
-static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags, u16 mode)
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
{
+ const struct devlink_ops *ops = devlink->ops;
void *hdr;
+ int err = 0;
+ u16 mode;
+ u8 inline_mode;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
- goto nla_put_failure;
+ if (ops->eswitch_mode_get) {
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto nla_put_failure;
+ }
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
-static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct sk_buff *msg;
- u16 mode;
int err;
- if (!ops || !ops->eswitch_mode_get)
+ if (!ops)
return -EOPNOTSUPP;
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- return err;
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
- err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
- info->snd_portid, info->snd_seq, 0, mode);
+ err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+ info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
@@ -1453,21 +1463,38 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
-static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
u16 mode;
+ u8 inline_mode;
+ int err = 0;
- if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
- return -EINVAL;
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
- if (ops && ops->eswitch_mode_set)
- return ops->eswitch_mode_set(devlink, mode);
- return -EOPNOTSUPP;
+ return 0;
}
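A driver-side sketch of the ops consumed by the combined eswitch get/set handlers above; the mydrv_* names and the priv fields are illustrative, and only the get callbacks are shown:

static int mydrv_eswitch_mode_get(struct devlink *devlink, u16 *mode)
{
        struct mydrv *priv = devlink_priv(devlink);

        *mode = priv->eswitch_mode;     /* e.g. DEVLINK_ESWITCH_MODE_LEGACY */
        return 0;
}

static int mydrv_eswitch_inline_mode_get(struct devlink *devlink, u8 *inline_mode)
{
        struct mydrv *priv = devlink_priv(devlink);

        *inline_mode = priv->eswitch_inline_mode;
        return 0;
}

static const struct devlink_ops mydrv_devlink_ops = {
        .eswitch_mode_get        = mydrv_eswitch_mode_get,
        .eswitch_inline_mode_get = mydrv_eswitch_inline_mode_get,
};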
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1484,6 +1511,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1603,21 +1631,35 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_LOCK_PORTS,
},
{
- .cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
- .doit = devlink_nl_cmd_eswitch_mode_get_doit,
+ .cmd = DEVLINK_CMD_ESWITCH_GET,
+ .doit = devlink_nl_cmd_eswitch_get_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
- .cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
- .doit = devlink_nl_cmd_eswitch_mode_set_doit,
+ .cmd = DEVLINK_CMD_ESWITCH_SET,
+ .doit = devlink_nl_cmd_eswitch_set_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
};
+static struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+ .module = THIS_MODULE,
+ .ops = devlink_nl_ops,
+ .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
/**
* devlink_alloc - Allocate new devlink instance resources
*
@@ -1840,9 +1882,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
static int __init devlink_module_init(void)
{
- return genl_register_family_with_ops_groups(&devlink_nl_family,
- devlink_nl_ops,
- devlink_nl_mcgrps);
+ return genl_register_family(&devlink_nl_family);
}
static void __exit devlink_module_exit(void)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 72cfb0c61125..fb55327dcfea 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,12 +59,7 @@ struct dm_hw_stat_delta {
unsigned long last_drop_val;
};
-static struct genl_family net_drop_monitor_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "NET_DM",
- .version = 2,
-};
+static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
@@ -80,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
struct nlattr *nla;
struct sk_buff *skb;
unsigned long flags;
+ void *msg_header;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -87,21 +83,41 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
skb = genlmsg_new(al, GFP_KERNEL);
- if (skb) {
- genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
- 0, NET_DM_CMD_ALERT);
- nla = nla_reserve(skb, NLA_UNSPEC,
- sizeof(struct net_dm_alert_msg));
- msg = nla_data(nla);
- memset(msg, 0, al);
- } else {
- mod_timer(&data->send_timer, jiffies + HZ / 10);
+ if (!skb)
+ goto err;
+
+ msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ if (!msg_header) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ if (!nla) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
}
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ goto out;
+err:
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+out:
spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
spin_unlock_irqrestore(&data->lock, flags);
+ if (skb) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
+
+ genlmsg_end(skb, genlmsg_data(gnlh));
+ }
+
return skb;
}
@@ -351,6 +367,17 @@ static const struct genl_ops dropmon_ops[] = {
},
};
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .module = THIS_MODULE,
+ .ops = dropmon_ops,
+ .n_ops = ARRAY_SIZE(dropmon_ops),
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
@@ -367,8 +394,7 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
- dropmon_ops, dropmon_mcgrps);
+ rc = genl_register_family(&net_drop_monitor_family);
if (rc) {
pr_err("Could not create drop monitor netlink family\n");
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
- dst->pending_confirm = 0;
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 047a1752ece1..aecb2c7241b6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -24,7 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/net.h>
/*
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXFCS_BIT] = "rx-fcs",
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
- [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
[NETIF_F_HW_TC_BIT] = "hw-tc-offload",
};
@@ -119,6 +118,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
[ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
};
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -227,6 +232,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
if (sset == ETH_SS_PHY_STATS) {
if (dev->phydev)
return phy_get_sset_count(dev->phydev);
@@ -253,6 +261,8 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS) {
struct phy_device *phydev = dev->phydev;
@@ -1394,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vzalloc(reglen);
- if (reglen && !regbuf)
- return -ENOMEM;
+ regbuf = NULL;
+ if (reglen) {
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+ }
ops->get_regs(dev, &regs, regbuf);
@@ -1701,7 +1714,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels, max;
+ struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
u32 max_rx_in_use = 0;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
@@ -1806,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
ret = __ethtool_get_sset_count(dev, gstrings.string_set);
if (ret < 0)
return ret;
+ if (ret > S32_MAX / ETH_GSTRING_LEN)
+ return -ENOMEM;
+ WARN_ON_ONCE(!ret);
gstrings.len = ret;
-
- data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
- if (!data)
+ data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
+ if (gstrings.len && !data)
return -ENOMEM;
__ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1819,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
goto out;
useraddr += sizeof(gstrings);
- if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ if (gstrings.len &&
+ copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -1901,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
if (n_stats < 0)
return n_stats;
- WARN_ON(n_stats == 0);
-
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
stats.n_stats = n_stats;
- data = kmalloc(n_stats * sizeof(u64), GFP_USER);
- if (!data)
+ data = vzalloc(n_stats * sizeof(u64));
+ if (n_stats && !data)
return -ENOMEM;
ops->get_ethtool_stats(dev, &stats, data);
@@ -1917,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &stats, sizeof(stats)))
goto out;
useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -1937,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
return -EOPNOTSUPP;
n_stats = phy_get_sset_count(phydev);
-
if (n_stats < 0)
return n_stats;
- WARN_ON(n_stats == 0);
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
stats.n_stats = n_stats;
- data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
- if (!data)
+ data = vzalloc(n_stats * sizeof(u64));
+ if (n_stats && !data)
return -ENOMEM;
mutex_lock(&phydev->lock);
@@ -1958,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &stats, sizeof(stats)))
goto out;
useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -2422,6 +2440,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
};
}
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+
+out:
+ kfree(data);
+ return ret;
+}
+
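A PHY-driver-side sketch of the hooks called above, assuming the get_tunable/set_tunable members of struct phy_driver; the myphy_read_downshift()/myphy_write_downshift() helpers are hypothetical:

static int myphy_get_tunable(struct phy_device *phydev,
                             struct ethtool_tunable *tuna, void *data)
{
        switch (tuna->id) {
        case ETHTOOL_PHY_DOWNSHIFT:
                /* Report the currently configured downshift counter. */
                *(u8 *)data = myphy_read_downshift(phydev);
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

static int myphy_set_tunable(struct phy_device *phydev,
                             struct ethtool_tunable *tuna, const void *data)
{
        switch (tuna->id) {
        case ETHTOOL_PHY_DOWNSHIFT:
                return myphy_write_downshift(phydev, *(const u8 *)data);
        default:
                return -EOPNOTSUPP;
        }
}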
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2576,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
case ETHTOOL_GLINKSETTINGS:
break;
default:
@@ -2685,6 +2783,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SLINKSETTINGS:
rc = ethtool_set_link_ksettings(dev, useraddr);
break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range))
+ goto errout;
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index b391209838ef..ebaeaf2e46e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -39,7 +40,7 @@
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
@@ -75,8 +76,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
* allow SOCK_MEMALLOC sockets to use it as this socket is
* helping free memory
*/
- if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
return -ENOMEM;
+ }
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
err = security_sock_rcv_skb(sk, skb);
if (err)
@@ -1411,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
@@ -1442,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_RAW_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
};
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1517,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
{
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+ bool do_mforce = flags & BPF_F_MARK_ENFORCE;
__sum16 *ptr;
- if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
- BPF_F_HDR_FIELD_MASK)))
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
@@ -1528,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
return -EFAULT;
ptr = (__sum16 *)(skb->data + offset);
- if (is_mmzero && !*ptr)
+ if (is_mmzero && !do_mforce && !*ptr)
return 0;
switch (flags & BPF_F_HDR_FIELD_MASK) {
@@ -1596,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
@@ -1684,6 +1691,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
bpf_push_mac_rcsum(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -1692,17 +1705,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- return __bpf_redirect_no_mac(skb, dev, flags);
- default:
+ if (dev_is_mac_header_xmit(dev))
return __bpf_redirect_common(skb, dev, flags);
- }
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
}
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2190,16 +2196,79 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol, network header, etc., stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < xdp->data_hard_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
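A BPF-program-side sketch of the new helper, assuming clang and the bpf_helpers.h conventions used in samples/bpf; the section and program names are illustrative. A negative offset grows headroom in front of the frame, a positive offset pops bytes from it:

#include <linux/bpf.h>
#include <linux/ip.h>
#include "bpf_helpers.h"

SEC("xdp")
int xdp_push_room(struct xdp_md *ctx)
{
        /* Reserve space for an outer IPv4 header in front of the packet. */
        if (bpf_xdp_adjust_head(ctx, 0 - (int)sizeof(struct iphdr)))
                return XDP_DROP;

        /* The packet start moved: re-read and bounds-check ctx->data and
         * ctx->data_end before writing the new Ethernet and IP headers.
         */
        return XDP_TX;
}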
+bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace)
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head)
return true;
return false;
@@ -2239,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2310,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2345,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static struct metadata_dst __percpu *md_dst;
@@ -2416,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2442,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
@@ -2515,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
- bpf_xdp_copy);
+ return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ xdp_size, bpf_xdp_copy);
}
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
@@ -2526,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -2544,6 +2613,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2557,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -2611,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return sk_filter_func_proto(func_id);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2623,20 +2705,106 @@ xdp_func_proto(enum bpf_func_id func_id)
return &bpf_xdp_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
default:
- return sk_filter_func_proto(func_id);
+ return bpf_base_func_proto(func_id);
}
}
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
+}
+
+static bool __is_valid_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
+
/* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
- if (size != sizeof(__u32))
- return false;
+
+ switch (off) {
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ if (off + size >
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+ return false;
+ break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
+ }
return true;
}
@@ -2655,14 +2823,71 @@ static bool sk_filter_is_valid_access(int off, int size,
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
break;
default:
return false;
}
}
- return __is_valid_access(off, size, type);
+ if (off < 0 || off + size > sizeof(struct bpf_sock))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -2714,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case offsetof(struct __sk_buff, tc_index):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
case offsetof(struct __sk_buff, tc_classid):
break;
default:
@@ -2731,11 +2956,10 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
}
-static bool __is_valid_xdp_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
return false;
@@ -2763,7 +2987,7 @@ static bool xdp_is_valid_access(int off, int size,
break;
}
- return __is_valid_xdp_access(off, size, type);
+ return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
@@ -2772,32 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
+ int off;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct __sk_buff, len):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, len));
break;
case offsetof(struct __sk_buff, protocol):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, protocol));
break;
case offsetof(struct __sk_buff, vlan_proto):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, vlan_proto));
break;
@@ -2805,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, priority));
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, priority));
break;
case offsetof(struct __sk_buff, ingress_ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, skb_iif));
break;
@@ -2823,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
- *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct net_device, ifindex));
break;
case offsetof(struct __sk_buff, hash):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, hash));
break;
@@ -2841,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, mark));
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, mark));
break;
case offsetof(struct __sk_buff, pkt_type):
- return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+ return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
+ si->src_reg, insn);
case offsetof(struct __sk_buff, queue_mapping):
- return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+ return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
+ si->src_reg, insn);
case offsetof(struct __sk_buff, vlan_present):
return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
- dst_reg, src_reg, insn);
+ si->dst_reg, si->src_reg, insn);
case offsetof(struct __sk_buff, vlan_tci):
return convert_skb_access(SKF_AD_VLAN_TAG,
- dst_reg, src_reg, insn);
+ si->dst_reg, si->src_reg, insn);
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct qdisc_skb_cb, data)) %
+ sizeof(__u64));
prog->cb_access = 1;
- ctx_off -= offsetof(struct __sk_buff, cb[0]);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct qdisc_skb_cb, data);
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, data);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, tc_classid):
- ctx_off -= offsetof(struct __sk_buff, tc_classid);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, tc_classid);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, tc_classid);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
else
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, data));
break;
case offsetof(struct __sk_buff, data_end):
- ctx_off -= offsetof(struct __sk_buff, data_end);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct bpf_skb_data_end, data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
- ctx_off);
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_end);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2905,65 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, tc_index));
else
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, tc_index));
- break;
#else
if (type == BPF_WRITE)
- *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+ *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
else
- *insn++ = BPF_MOV64_IMM(dst_reg, 0);
- break;
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+ break;
}
return insn - insn_buf;
}
-static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct __sk_buff, ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct net_device, ifindex));
break;
default:
- return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
- ctx_off, insn_buf, prog);
+ return bpf_convert_ctx_access(type, si, insn_buf, prog);
}
return insn - insn_buf;
}
-static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct xdp_md, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_end):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data_end));
break;
}
@@ -2974,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = sk_filter_convert_ctx_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -2990,32 +3271,87 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
-static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops cg_sock_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
};
-static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
.ops = &tc_cls_act_ops,
.type = BPF_PROG_TYPE_SCHED_CLS,
};
-static struct bpf_prog_type_list sched_act_type __read_mostly = {
+static struct bpf_prog_type_list sched_act_type __ro_after_init = {
.ops = &tc_cls_act_ops,
.type = BPF_PROG_TYPE_SCHED_ACT,
};
-static struct bpf_prog_type_list xdp_type __read_mostly = {
+static struct bpf_prog_type_list xdp_type __ro_after_init = {
.ops = &xdp_ops,
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
+static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
+static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
+ .ops = &cg_sock_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SOCK
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&cg_sock_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
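For illustration, a hypothetical BPF-C sketch of what the relaxed __sk_buff.cb[] range check above permits: loads and stores of 1, 2 or 4 bytes anywhere inside cb[0]..cb[4], instead of whole aligned 32-bit words only. Function name is made up and the loader/section plumbing is elided.

#include <linux/bpf.h>

int sk_filter_example(struct __sk_buff *skb)
{
	__u8 *cb = (__u8 *)skb->cb;

	/* single-byte store into the cb[] scratch area, now accepted by
	 * sk_filter_is_valid_access()/__is_valid_access()
	 */
	cb[3] = 0x2a;

	/* socket filter convention: return the number of bytes to keep */
	return cb[3] == 0x2a ? skb->len : 0;
}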
diff --git a/net/core/flow.c b/net/core/flow.c
index 18e8893d4be5..f765c11d8df5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -417,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
return 0;
}
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache,
- hotcpu_notifier);
- int res, cpu = (unsigned long) hcpu;
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
+
+ return flow_cache_cpu_prepare(fc, cpu);
+}
+
+static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- res = flow_cache_cpu_prepare(fc, cpu);
- if (res)
- return notifier_from_errno(res);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- __flow_cache_shrink(fc, fcp, 0);
- break;
- }
- return NOTIFY_OK;
+ __flow_cache_shrink(fc, fcp, 0);
+ return 0;
}
int flow_cache_init(struct net *net)
@@ -465,18 +457,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- if (flow_cache_cpu_prepare(fc, i))
- goto err;
- }
- fc->hotcpu_notifier = (struct notifier_block){
- .notifier_call = flow_cache_cpu,
- };
- __register_hotcpu_notifier(&fc->hotcpu_notifier);
-
- cpu_notifier_register_done();
+ if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
+ goto err;
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -492,8 +474,6 @@ err:
fcp->hash_table = NULL;
}
- cpu_notifier_register_done();
-
free_percpu(fc->percpu);
fc->percpu = NULL;
@@ -507,7 +487,8 @@ void flow_cache_fini(struct net *net)
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
- unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
@@ -519,3 +500,14 @@ void flow_cache_fini(struct net *net)
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
+
+void __init flow_cache_hp_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
+ "net/flow:prepare",
+ flow_cache_cpu_up_prep,
+ flow_cache_cpu_dead);
+ WARN_ON(ret < 0);
+}
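For reference, a minimal sketch (all names hypothetical) of the multi-instance CPU hotplug state pattern the flow cache is converted to above: one cpuhp_setup_state_multi() call registers the callbacks, and each cache instance then attaches itself with cpuhp_state_add_instance().

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/list.h>

struct my_cache {
	struct hlist_node node;		/* links this instance into the hotplug state */
};

static enum cpuhp_state my_hp_state;

static int my_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
	/* allocate per-cpu resources for the instance containing @node */
	return 0;
}

static int my_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	/* free whatever the dead CPU was using */
	return 0;
}

static int __init my_cache_hp_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_BP_PREPARE_DYN, "net/my_cache:prepare",
				      my_cache_cpu_up_prep, my_cache_cpu_dead);
	if (ret < 0)
		return ret;
	my_hp_state = ret;
	return 0;
}

static int my_cache_init(struct my_cache *mc)
{
	/* called for every new cache instance */
	return cpuhp_state_add_instance(my_hp_state, &mc->node);
}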
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..d98d4998213d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);
/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be16 entity at
+ * offset poff.
+ */
+static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
+ void *data, int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
@@ -116,7 +138,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_arp *key_arp;
struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
@@ -356,6 +380,62 @@ mpls:
nhoff += FCOE_HEADER_LEN;
goto out_good;
+
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP): {
+ struct {
+ unsigned char ar_sha[ETH_ALEN];
+ unsigned char ar_sip[4];
+ unsigned char ar_tha[ETH_ALEN];
+ unsigned char ar_tip[4];
+ } *arp_eth, _arp_eth;
+ const struct arphdr *arp;
+ struct arphdr _arp;
+
+ arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+ hlen, &_arp);
+ if (!arp)
+ goto out_bad;
+
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_hln != ETH_ALEN ||
+ arp->ar_pln != 4 ||
+ (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST)))
+ goto out_bad;
+
+ arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+ sizeof(_arp_eth), data,
+ hlen,
+ &_arp_eth);
+ if (!arp_eth)
+ goto out_bad;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP)) {
+
+ key_arp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP,
+ target_container);
+
+ memcpy(&key_arp->sip, arp_eth->ar_sip,
+ sizeof(key_arp->sip));
+ memcpy(&key_arp->tip, arp_eth->ar_tip,
+ sizeof(key_arp->tip));
+
+ /* Only store the lower byte of the opcode;
+ * this covers ARPOP_REPLY and ARPOP_REQUEST.
+ */
+ key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+ ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+ ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+ }
+
+ goto out_good;
+ }
+
default:
goto out_bad;
}
@@ -445,8 +525,9 @@ ip_proto_again:
if (hdr->flags & GRE_ACK)
offset += sizeof(((struct pptp_gre_header *)0)->ack);
- ppp_hdr = skb_header_pointer(skb, nhoff + offset,
- sizeof(_ppp_hdr), _ppp_hdr);
+ ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+ sizeof(_ppp_hdr),
+ data, hlen, _ppp_hdr);
if (!ppp_hdr)
goto out_bad;
@@ -546,6 +627,14 @@ ip_proto_again:
data, hlen);
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
out_good:
ret = true;
@@ -726,7 +815,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
struct flow_keys keys;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..0385dece1f6f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,13 +7,14 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
 * Jamal Hadi Salim - moved it to net/core and reshuffled
* names to make it usable in general net subsystem.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -30,165 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection;
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user-level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator
-{
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (!list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
-
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are:
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
-
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
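A worked example of the new estimator parameters, plus an illustrative read-side sketch (caller name hypothetical): parm->interval = 0 gives intvl_log = 2, i.e. a timer period of (HZ/4) << 2 jiffies = 1 second, matching the comment above; avbps/avpps are kept scaled by 2^8, and gen_estimator_read() reports them shifted back down.

#include <linux/printk.h>
#include <net/gen_stats.h>

static void report_rate(struct net_rate_estimator __rcu **rate_est)
{
	struct gnet_stats_rate_est64 sample;

	/* copies avbps/avpps under est->seq and removes the 2^8 scaling */
	if (!gen_estimator_read(rate_est, &sample))
		return;		/* no estimator currently attached */

	pr_info("rate: %llu bytes/s, %llu packets/s\n",
		sample.bps, sample.pps);
}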
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(rate_est, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+
+struct gro_cell {
+ struct sk_buff_head napi_skbs;
+ struct napi_struct napi;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct gro_cell *cell;
+
+ if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
+ return netif_rx(skb);
+
+ cell = this_cpu_ptr(gcells->cells);
+
+ if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ __skb_queue_tail(&cell->napi_skbs, skb);
+ if (skb_queue_len(&cell->napi_skbs) == 1)
+ napi_schedule(&cell->napi);
+ return NET_RX_SUCCESS;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+ struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+ struct sk_buff *skb;
+ int work_done = 0;
+
+ while (work_done < budget) {
+ skb = __skb_dequeue(&cell->napi_skbs);
+ if (!skb)
+ break;
+ napi_gro_receive(napi, skb);
+ work_done++;
+ }
+
+ if (work_done < budget)
+ napi_complete_done(napi, work_done);
+ return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+ int i;
+
+ gcells->cells = alloc_percpu(struct gro_cell);
+ if (!gcells->cells)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ __skb_queue_head_init(&cell->napi_skbs);
+
+ set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+ netif_napi_add(dev, &cell->napi, gro_cell_poll,
+ NAPI_POLL_WEIGHT);
+ napi_enable(&cell->napi);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+ int i;
+
+ if (!gcells->cells)
+ return;
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ netif_napi_del(&cell->napi);
+ __skb_queue_purge(&cell->napi_skbs);
+ }
+ free_percpu(gcells->cells);
+ gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
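A minimal usage sketch for the new gro_cells helpers (driver and struct names hypothetical): a tunnel/virtual device initialises the per-cpu cells once, feeds received skbs through gro_cells_receive() instead of netif_rx(), and tears the cells down on uninit.

#include <linux/netdevice.h>
#include <net/gro_cells.h>

struct mytun_priv {
	struct gro_cells gro_cells;
};

static int mytun_init(struct net_device *dev)
{
	struct mytun_priv *priv = netdev_priv(dev);

	return gro_cells_init(&priv->gro_cells, dev);
}

static void mytun_uninit(struct net_device *dev)
{
	struct mytun_priv *priv = netdev_priv(dev);

	gro_cells_destroy(&priv->gro_cells);
}

static int mytun_rx(struct net_device *dev, struct sk_buff *skb)
{
	struct mytun_priv *priv = netdev_priv(dev);

	skb->dev = dev;
	/* queues the skb onto this CPU's NAPI cell so it can be GROed */
	return gro_cells_receive(&priv->gro_cells, skb);
}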
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..0cfe7b0216c3
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * accesses to maps strictly require rcu_read_lock() for protection;
+ * mixing this with the BH RCU lock does not work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->in.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for the L2 header still to be added; expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests, which
+ * results in a new bpf_prog instance. Compare names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
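A hypothetical BPF-C sketch of a program attachable via LWT_BPF_IN: the return codes map onto the handling in run_lwt_bpf() above (BPF_OK continues to orig_input(), BPF_DROP frees the skb and returns -EPERM, BPF_REDIRECT is only honoured for xmit programs). Section and loader details are elided.

#include <linux/bpf.h>

int lwt_in_example(struct __sk_buff *skb)
{
	/* drop oversized packets, let everything else continue */
	if (skb->len > 1500)
		return BPF_DROP;

	return BPF_OK;
}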
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index e5f84c26ba1a..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,6 +26,7 @@
#include <net/lwtunnel.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#include <net/nexthop.h>
#ifdef CONFIG_MODULES
@@ -39,6 +40,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "MPLS";
case LWTUNNEL_ENCAP_ILA:
return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -96,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+int lwtunnel_build_state(u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws)
{
@@ -110,25 +115,91 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
+ ret = ops->build_state(encap, family, cfg, lws);
+ if (ret)
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_valid_encap_type(u16 encap_type)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- rcu_read_unlock();
+ __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
+ rtnl_lock();
+
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
}
}
#endif
- if (likely(ops && ops->build_state))
- ret = ops->build_state(dev, encap, family, cfg, lws);
- rcu_read_unlock();
+ return ops ? 0 : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(lwtunnel_valid_encap_type);
- return ret;
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+{
+ struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+ struct nlattr *nla_entype;
+ struct nlattr *attrs;
+ struct nlattr *nla;
+ u16 encap_type;
+ int attrlen;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+ if (nla_entype) {
+ encap_type = nla_get_u16(nla_entype);
+
+ if (lwtunnel_valid_encap_type(encap_type) != 0)
+ return -EOPNOTSUPP;
+ }
+ }
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(lwtunnel_build_state);
+EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
+
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL(lwtstate_free);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
{
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2ae929f9bd06..4526cbd7e28a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
neigh->parms->neigh_cleanup(neigh);
__neigh_notify(neigh, RTM_DELNEIGH, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -859,7 +860,8 @@ static void neigh_probe(struct neighbour *neigh)
if (skb)
skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
+ if (neigh->ops->solicit)
+ neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
kfree_skb(skb);
}
@@ -2291,13 +2293,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
- if (!net_eq(dev_net(n->dev), net))
- continue;
- if (neigh_ifindex_filtered(n->dev, filter_idx))
- continue;
- if (neigh_master_filtered(n->dev, filter_master_idx))
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2332,9 +2331,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (pneigh_net(n) != net)
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2927,7 +2924,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
return;
set_bit(index, p->data_state);
- call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
if (!dev) /* NULL dev means this is default value */
neigh_copy_dflt_parms(net, p, index);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6e4f34721080..65ea0ff4017c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -15,6 +15,7 @@
#include <net/switchdev.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/nsproxy.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -950,10 +951,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
+ struct kobject *kobj = &dev->_rx[i].kobj;
+
+ if (!atomic_read(&dev_net(dev)->count))
+ kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
- sysfs_remove_group(&dev->_rx[i].kobj,
- dev->sysfs_rx_queue_group);
- kobject_put(&dev->_rx[i].kobj);
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ kobject_put(kobj);
}
return error;
@@ -1021,7 +1025,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
-#ifdef CONFIG_XPS
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -1033,6 +1036,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
+static ssize_t show_traffic_class(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int index = get_netdev_queue_index(queue);
+ int tc = netdev_txq_to_tc(dev, index);
+
+ if (tc < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
static ssize_t show_tx_maxrate(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
char *buf)
@@ -1075,6 +1093,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+static struct netdev_queue_attribute queue_traffic_class =
+ __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+
#ifdef CONFIG_BQL
/*
* Byte queue limits sysfs structures and functions.
@@ -1190,29 +1211,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
struct net_device *dev = queue->dev;
+ int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
- int i, len;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
index = get_netdev_queue_index(queue);
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- for_each_possible_cpu(i) {
- struct xps_map *map =
- rcu_dereference(dev_maps->cpu_map[i]);
- if (map) {
- int j;
- for (j = 0; j < map->len; j++) {
- if (map->queues[j] == index) {
- cpumask_set_cpu(i, mask);
- break;
- }
+ for_each_possible_cpu(cpu) {
+ int i, tci = cpu * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ cpumask_set_cpu(cpu, mask);
+ break;
}
}
}
@@ -1260,6 +1290,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
static struct attribute *netdev_queue_default_attrs[] = {
&queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&queue_tx_maxrate.attr,
@@ -1340,6 +1371,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
+ if (!atomic_read(&dev_net(dev)->count))
+ queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
#endif
@@ -1525,6 +1558,9 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &(ndev->dev);
+ if (!atomic_read(&dev_net(ndev)->count))
+ dev_set_uevent_suppress(dev, 1);
+
kobject_get(&dev->kobj);
remove_queue_kobjects(ndev);
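
The show_xps_map() rework above indexes the XPS map by CPU and traffic class at once, flattening the two dimensions into a single array offset (tci = cpu * num_tc + tc). A minimal standalone sketch of that layout follows; the array, sizes and lookup loop are invented for illustration and are not the kernel's xps_dev_maps structures.

/* Sketch only: flattened (cpu, tc) -> queue table, mirroring cpu * num_tc + tc. */
#include <stdio.h>

#define NR_CPUS 4
#define NUM_TC  3

static int queue_of[NR_CPUS * NUM_TC];      /* one slot per (cpu, traffic class) */

static int slot(int cpu, int tc)
{
    return cpu * NUM_TC + tc;               /* same layout as the tci above */
}

int main(void)
{
    int cpu, tc;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        for (tc = 0; tc < NUM_TC; tc++)
            queue_of[slot(cpu, tc)] = cpu * 10 + tc;

    /* Which CPUs steer traffic class 1 to queue 21? Only cpu 2 here. */
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (queue_of[slot(cpu, 1)] == 21)
            printf("cpu %d\n", cpu);
    return 0;
}
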
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7001da910c6b..652468ff65b7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,6 +16,8 @@
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
+#include <linux/sched/task.h>
+
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
@@ -39,6 +41,9 @@ EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -46,27 +51,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
- size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->len = max_gen_ptrs;
+ ng->s.len = max_gen_ptrs;
return ng;
}
-static int net_assign_generic(struct net *net, int id, void *data)
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
- BUG_ON(id == 0);
+ BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
- ng = old_ng;
- if (old_ng->len >= id)
- goto assign;
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
ng = net_alloc_generic();
if (ng == NULL)
@@ -83,12 +89,12 @@ static int net_assign_generic(struct net *net, int id, void *data)
* the old copy for kfree after a grace period.
*/
- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
rcu_assign_pointer(net->gen, ng);
- kfree_rcu(old_ng, rcu);
-assign:
- ng->ptr[id - 1] = data;
+ kfree_rcu(old_ng, s.rcu);
return 0;
}
@@ -122,8 +128,7 @@ out:
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
if (ops->id && ops->size) {
- int id = *ops->id;
- kfree(net_generic(net, id));
+ kfree(net_generic(net, *ops->id));
}
}
@@ -215,16 +220,15 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
*/
int peernet2id_alloc(struct net *net, struct net *peer)
{
- unsigned long flags;
bool alloc;
int id;
if (atomic_read(&net->count) == 0)
return NETNSA_NSID_NOT_ASSIGNED;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
if (alloc && id >= 0)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
@@ -233,12 +237,11 @@ int peernet2id_alloc(struct net *net, struct net *peer)
/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
- unsigned long flags;
int id;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
id = __peernet2id(net, peer);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -253,18 +256,17 @@ bool peernet_has_id(struct net *net, struct net *peer)
struct net *get_net_ns_by_id(struct net *net, int id)
{
- unsigned long flags;
struct net *peer;
if (id < 0)
return NULL;
rcu_read_lock();
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
rcu_read_unlock();
return peer;
@@ -384,7 +386,14 @@ struct net *copy_net_ns(unsigned long flags,
get_user_ns(user_ns);
- mutex_lock(&net_mutex);
+ rv = mutex_lock_killable(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ dec_net_namespaces(ucounts);
+ put_user_ns(user_ns);
+ return ERR_PTR(rv);
+ }
+
net->ucounts = ucounts;
rv = setup_net(net, user_ns);
if (rv == 0) {
@@ -427,17 +436,17 @@ static void cleanup_net(struct work_struct *work)
for_each_net(tmp) {
int id;
- spin_lock_irq(&tmp->nsid_lock);
+ spin_lock_bh(&tmp->nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- spin_unlock_irq(&tmp->nsid_lock);
+ spin_unlock_bh(&tmp->nsid_lock);
if (id >= 0)
rtnl_net_notifyid(tmp, RTM_DELNSID, id);
}
- spin_lock_irq(&net->nsid_lock);
+ spin_lock_bh(&net->nsid_lock);
idr_destroy(&net->netns_ids);
- spin_unlock_irq(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
}
rtnl_unlock();
@@ -566,7 +575,6 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
- unsigned long flags;
struct net *peer;
int nsid, err;
@@ -587,15 +595,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer))
return PTR_ERR(peer);
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
err = -EEXIST;
goto out;
}
err = alloc_netid(net, peer, nsid);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
@@ -717,11 +725,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.idx = 0,
.s_idx = cb->args[0],
};
- unsigned long flags;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
cb->args[0] = net_cb.idx;
return skb->len;
@@ -868,7 +875,7 @@ static int register_pernet_operations(struct list_head *list,
if (ops->id) {
again:
- error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -876,7 +883,7 @@ again:
}
return error;
}
- max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
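
The MIN_PERNET_OPS_ID change above reserves the lowest generic-pointer ids so the bookkeeping header of the per-netns pointer array can occupy those slots, and id allocation now starts there. A rough standalone sketch of the arithmetic; fake_header is invented and only stands in for the real struct net_generic layout.

/* Sketch only: first usable id = ceil(sizeof(header) / sizeof(void *)). */
#include <stdio.h>

struct fake_header {                /* stand-in for the len/rcu bookkeeping */
    unsigned int len;
    void *rcu_placeholder;
};

#define MIN_ID \
    ((sizeof(struct fake_header) + sizeof(void *) - 1) / sizeof(void *))

int main(void)
{
    printf("header %zu bytes, pointer %zu bytes -> first usable id %zu\n",
           sizeof(struct fake_header), sizeof(void *), (size_t)MIN_ID);
    return 0;
}
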
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 11fce17274f6..029a61ac6cdd 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -12,6 +12,8 @@
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
+#include <linux/sched/task.h>
+
#include <net/cls_cgroup.h>
#include <net/sock.h>
@@ -69,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
return 0;
}
-static void update_classid(struct cgroup_subsys_state *css, void *v)
+static void cgrp_attach(struct cgroup_taskset *tset)
{
- struct css_task_iter it;
+ struct cgroup_subsys_state *css;
struct task_struct *p;
- css_task_iter_start(css, &it);
- while ((p = css_task_iter_next(&it))) {
+ cgroup_taskset_for_each(p, css, tset) {
task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock, v);
+ iterate_fd(p->files, 0, update_classid_sock,
+ (void *)(unsigned long)css_cls_state(css)->classid);
task_unlock(p);
}
- css_task_iter_end(&it);
-}
-
-static void cgrp_attach(struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
-
- cgroup_taskset_first(tset, &css);
- update_classid(css,
- (void *)(unsigned long)css_cls_state(css)->classid);
}
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -101,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
u64 value)
{
struct cgroup_cls_state *cs = css_cls_state(css);
+ struct css_task_iter it;
+ struct task_struct *p;
cgroup_sk_alloc_disable();
cs->classid = (u32)value;
- update_classid(css, (void *)(unsigned long)cs->classid);
+ css_task_iter_start(css, &it);
+ while ((p = css_task_iter_next(&it))) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_classid_sock,
+ (void *)(unsigned long)cs->classid);
+ task_unlock(p);
+ }
+ css_task_iter_end(&it);
+
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..29be2466970c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work)
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
+ unsigned int q_index;
if (!netif_device_present(dev) || !netif_running(dev)) {
kfree_skb(skb);
continue;
}
- txq = skb_get_tx_queue(dev, skb);
-
local_irq_save(flags);
+ /* check if skb->queue_mapping is still valid */
+ q_index = skb_get_queue_mapping(skb);
+ if (unlikely(q_index >= dev->real_num_tx_queues)) {
+ q_index = q_index % dev->real_num_tx_queues;
+ skb_set_queue_mapping(skb, q_index);
+ }
+ txq = netdev_get_tx_queue(dev, q_index);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
@@ -171,12 +177,12 @@ static void poll_one_napi(struct napi_struct *napi)
static void poll_napi(struct net_device *dev)
{
struct napi_struct *napi;
+ int cpu = smp_processor_id();
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner != smp_processor_id() &&
- spin_trylock(&napi->poll_lock)) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
- spin_unlock(&napi->poll_lock);
+ smp_store_release(&napi->poll_owner, -1);
}
}
}
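
poll_napi() above drops the per-NAPI spinlock and claims the poll with a single cmpxchg() on poll_owner: ownership is taken only if the field still holds -1 and is handed back with a store-release. A small userspace sketch of that claim/release pattern using C11 atomics; the variable names and printed message are illustrative, and the queue_mapping clamp earlier in the hunk is not modelled here.

/* Sketch only: lock-free "claim if unowned" with compare-and-swap. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_owner = -1;          /* -1 means nobody is polling */

static int try_poll(int cpu)
{
    int expected = -1;

    /* Succeeds only when no other CPU currently owns the poll. */
    if (!atomic_compare_exchange_strong(&poll_owner, &expected, cpu))
        return 0;

    printf("cpu %d polling\n", cpu);        /* ... the actual poll work ... */

    atomic_store_explicit(&poll_owner, -1, memory_order_release);
    return 1;
}

int main(void)
{
    try_poll(0);
    return 0;
}
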
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..0f9275ee5595 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,12 +13,15 @@
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
+#include <linux/sched/task.h>
+
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 306b8f0e03c1..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -413,7 +413,7 @@ struct pktgen_hdr {
};
-static int pg_net_id __read_mostly;
+static unsigned int pg_net_id __read_mostly;
struct pktgen_net {
struct net *net;
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = 0; /* reset reclass/redir ttl */
-#endif
+ skb_reset_tc(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
* and it will increase in proportion to the memory of machine.
* Note : Dont forget somaxconn that may limit backlog too.
*/
-int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
void reqsk_queue_alloc(struct request_sock_queue *queue)
{
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6196cf844f6..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -40,7 +40,7 @@
#include <linux/pci.h>
#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
static inline int rtnl_vfinfo_size(const struct net_device *dev,
u32 ext_filter_mask)
{
- if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
- (ext_filter_mask & RTEXT_FILTER_VF)) {
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
size_t size = nla_total_size(0);
size += num_vfs *
@@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
{
size_t port_size = nla_total_size(4) /* PORT_VF */
+ nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
- + nla_total_size(sizeof(struct ifla_port_vsi))
- /* PORT_VSI_TYPE */
+ nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ nla_total_size(1) /* PROT_VDP_REQUEST */
@@ -1492,19 +1489,25 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_VF] = { .type = NLA_U32 },
[IFLA_PORT_PROFILE] = { .type = NLA_STRING,
.len = PORT_PROFILE_MAX },
- [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
- .len = sizeof(struct ifla_port_vsi)},
[IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
.len = PORT_UUID_MAX },
[IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
.len = PORT_UUID_MAX },
[IFLA_PORT_REQUEST] = { .type = NLA_U8, },
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+
+ /* Unused, but we need to keep it here since user space could
+ * fill it. It's also broken with regard to NLA_BINARY use in
+ * combination with structs.
+ */
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi) },
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -2164,6 +2167,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_XDP]) {
struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
ifla_xdp_policy);
@@ -2174,9 +2178,19 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
if (xdp[IFLA_XDP_FD]) {
err = dev_change_xdp_fd(dev,
- nla_get_s32(xdp[IFLA_XDP_FD]));
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2344,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
const char *ifname, unsigned char name_assign_type,
const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
- int err;
struct net_device *dev;
unsigned int num_tx_queues = 1;
unsigned int num_rx_queues = 1;
@@ -2359,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
- err = -ENOMEM;
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
- goto err;
+ return ERR_PTR(-ENOMEM);
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
@@ -2389,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
return dev;
-
-err:
- return ERR_PTR(err);
}
EXPORT_SYMBOL(rtnl_create_link);
@@ -2559,7 +2568,7 @@ replay:
return -ENODEV;
}
- if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
return -EOPNOTSUPP;
if (!ops) {
@@ -2641,6 +2650,11 @@ replay:
if (err < 0)
goto out_unregister;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto out_unregister;
+ }
out:
if (link_net)
put_net(link_net);
@@ -3165,7 +3179,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
goto out;
- nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
return err;
@@ -3671,7 +3685,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
if (!size)
continue;
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3712,7 +3726,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
size = rtnl_get_offload_stats_attr_size(attr_id);
nla_size += nla_total_size_64bit(size);
@@ -3817,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = 0;
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+ struct rtnl_af_ops *af_ops;
+
+ *idxattr = IFLA_STATS_AF_SPEC;
+ attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_stats_af) {
+ struct nlattr *af;
+ int err;
+
+ af = nla_nest_start(skb, af_ops->family);
+ if (!af)
+ goto nla_put_failure;
+
+ err = af_ops->fill_stats_af(skb, dev);
+
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, af);
+ }
+ }
+
+ nla_nest_end(skb, attr);
+
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3873,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
size += rtnl_get_offload_stats_size(dev);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+ struct rtnl_af_ops *af_ops;
+
+ /* for IFLA_STATS_AF_SPEC */
+ size += nla_total_size(0);
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_stats_af_size) {
+ size += nla_total_size(
+ af_ops->get_stats_af_size(dev));
+
+ /* for AF_* */
+ size += nla_total_size(0);
+ }
+ }
+ }
+
return size;
}
@@ -3886,6 +3950,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
u32 filter_mask;
int err;
+ if (nlmsg_len(nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(nlh);
if (ifsm->ifindex > 0)
dev = __dev_get_by_index(net, ifsm->ifindex);
@@ -3935,6 +4002,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->seq = net->dev_base_seq;
+ if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(cb->nlh);
filter_mask = ifsm->filter_mask;
if (!filter_mask)
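
The IFLA_STATS_AF_SPEC code above opens a nested attribute, lets each address family fill it, and cancels the nest again when the filler returns -ENODATA. A standalone sketch of that open/fill/cancel control flow; the buf type, the nest_* helpers and the sentinel are invented stand-ins for the kernel's nla_nest_start()/nla_nest_cancel() API.

/* Sketch only: optimistic nesting with rollback when nothing was emitted. */
#include <stdio.h>
#include <string.h>

#define NO_DATA (-61)                        /* plays the role of -ENODATA */

struct buf { char data[256]; size_t len; };

static size_t nest_start(struct buf *b)  { return b->len; }       /* remember mark */
static void nest_cancel(struct buf *b, size_t mark) { b->len = mark; }

static int fill_af(struct buf *b, int have_stats)
{
    if (!have_stats)
        return NO_DATA;
    memcpy(b->data + b->len, "af-stats", 8);
    b->len += 8;
    return 0;
}

int main(void)
{
    struct buf b = { .len = 0 };
    size_t mark = nest_start(&b);

    if (fill_af(&b, 0) == NO_DATA)
        nest_cancel(&b, mark);               /* nothing emitted, drop the nest */

    printf("buffer length: %zu\n", b.len);
    return 0;
}
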
diff --git a/net/core/scm.c b/net/core/scm.c
index 2696aefdc148..b1ff8a441748 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -14,6 +14,7 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/user.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/stat.h>
@@ -29,7 +30,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -71,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
struct file **fpp;
int i, num;
- num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+ num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
if (num <= 0)
return 0;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..d28da7d363f1 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cryptohash.h>
@@ -8,17 +12,20 @@
#include <linux/ktime.h>
#include <linux/string.h>
#include <linux/net.h>
-
+#include <linux/siphash.h>
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
-#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
+#include <linux/in6.h>
+#include <net/tcp.h>
-static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+static siphash_key_t net_secret __read_mostly;
+static siphash_key_t ts_secret __read_mostly;
static __always_inline void net_secret_init(void)
{
- net_get_random_once(net_secret, sizeof(net_secret));
+ net_get_random_once(&ts_secret, sizeof(ts_secret));
+ net_get_random_once(&net_secret, sizeof(net_secret));
}
#endif
@@ -40,81 +47,98 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
- u32 i;
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ };
+
+ if (sysctl_tcp_timestamps != 1)
+ return 0;
+
+ return siphash(&combined, offsetofend(typeof(combined), daddr),
+ &ts_secret);
+}
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
+ u64 hash;
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
- secret[4] = net_secret[4] +
- (((__force u16)sport << 16) + (__force u16)dport);
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- return seq_scale(hash[0]);
+ hash = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+ *tsoff = secure_tcpv6_ts_off(saddr, daddr);
+ return seq_scale(hash);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
- u32 i;
-
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .dport = dport
+ };
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32) daddr[i];
- secret[4] = net_secret[4] + (__force u32)dport;
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- return hash[0];
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
{
- u32 hash[MD5_DIGEST_WORDS];
+ if (sysctl_tcp_timestamps != 1)
+ return 0;
- net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
- hash[3] = net_secret[15];
+ return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+ &ts_secret);
+}
- md5_transform(hash, net_secret);
+/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
- return seq_scale(hash[0]);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
+{
+ u64 hash;
+ net_secret_init();
+ hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
+ *tsoff = secure_tcp_ts_off(saddr, daddr);
+ return seq_scale(hash);
}
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
- u32 hash[MD5_DIGEST_WORDS];
-
net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = (__force u32)dport ^ net_secret[14];
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- return hash[0];
+ return siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport, &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
@@ -123,21 +147,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
- u32 hash[MD5_DIGEST_WORDS];
u64 seq;
-
net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- seq = hash[0] | (((u64)hash[1]) << 32);
+ seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
-
return seq;
}
EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -146,26 +162,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
u64 seq;
- u32 i;
-
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
- secret[4] = net_secret[4] +
- (((__force u16)sport << 16) + (__force u16)dport);
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- seq = hash[0] | (((u64)hash[1]) << 32);
+ seq = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
-
return seq;
}
EXPORT_SYMBOL(secure_dccpv6_sequence_number);
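
The conversion above packs each connection tuple into one SIPHASH_ALIGNMENT-aligned struct and hashes exactly the bytes up to its last member via offsetofend(), so compiler padding never reaches the hash input. A hedged standalone sketch of that packing idea follows; toy_keyed_hash() is a deliberately weak stand-in and is not siphash, and the struct, key and addresses are invented.

/* Sketch only: hash a packed tuple up to and including its last member. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct flow_key {
    uint8_t  saddr[16];
    uint8_t  daddr[16];
    uint16_t sport;
    uint16_t dport;
};

static uint64_t toy_keyed_hash(const void *data, size_t len, uint64_t key)
{
    const uint8_t *p = data;
    uint64_t h = key ^ 0x9e3779b97f4a7c15ULL;

    for (size_t i = 0; i < len; i++)
        h = (h ^ p[i]) * 0x100000001b3ULL;   /* FNV-style mixing, NOT siphash */
    return h;
}

int main(void)
{
    struct flow_key k;

    memset(&k, 0, sizeof(k));
    memcpy(k.saddr, "\x20\x01\x0d\xb8", 4);  /* example address bytes */
    k.sport = 12345;
    k.dport = 443;

    /* Hash exactly up to dport, mirroring offsetofend(typeof(combined), dport). */
    printf("%llx\n", (unsigned long long)
           toy_keyed_hash(&k, offsetofend(struct flow_key, dport), 42));
    return 0;
}
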
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e3e0087245b..f86bf69cfb8d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,7 +72,7 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(&fclones->fclone_ref, 1);
fclones->skb2.fclone = SKB_FCLONE_CLONE;
- fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -354,7 +353,7 @@ EXPORT_SYMBOL(build_skb);
struct napi_alloc_cache {
struct page_frag_cache page;
- size_t skb_count;
+ unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, fragsz, gfp_mask);
+ data = page_frag_alloc(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
@@ -391,7 +390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
local_irq_restore(flags);
@@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(&nc->page, len, gfp_mask);
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb->nfct);
+ nf_conntrack_put(skb_nfct(skb));
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
#ifdef CONFIG_NET_SCHED
CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
- CHECK_SKB_FIELD(tc_verd);
-#endif
#endif
}
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
- int i;
- u8 *data;
- int size = nhead + skb_end_offset(skb) + ntail;
+ int i, osize = skb_end_offset(skb);
+ int size = osize + nhead + ntail;
long off;
+ u8 *data;
BUG_ON(nhead < 0);
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ /* It is not generally safe to change skb->truesize.
+ * For the moment, we really care of rx path, or
+ * when skb is orphaned (not attached to a socket).
+ */
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb->truesize += size - osize;
+
return 0;
nofrags:
@@ -2656,7 +2660,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
struct skb_frag_struct *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
- BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ if (skb_headlen(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3076,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (sg && csum && (mss != GSO_BY_FRAGS)) {
if (!(features & NETIF_F_GSO_PARTIAL)) {
struct sk_buff *iter;
+ unsigned int frag_len;
if (!list_skb ||
!net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
goto normal;
- /* Split the buffer at the frag_list pointer.
- * This is based on the assumption that all
- * buffers in the chain excluding the last
- * containing the same amount of data.
+ /* If we get here then all the required
+ * GSO features except frag_list are supported.
+ * Try to split the SKB to multiple GSO SKBs
+ * with no frag_list.
+ * Currently we can do that only when the buffers don't
+ * have a linear part and all the buffers except
+ * the last are of the same length.
*/
+ frag_len = list_skb->len;
skb_walk_frags(head_skb, iter) {
+ if (frag_len != iter->len && iter->next)
+ goto normal;
if (skb_headlen(iter))
goto normal;
len -= iter->len;
}
+
+ if (len != frag_len)
+ goto normal;
}
/* GSO partial only requires that we trim off any excess that
@@ -3688,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb)
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}
+static void skb_set_err_queue(struct sk_buff *skb)
+{
+ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
+ * So, it is safe to (mis)use it to mark skbs on the error queue.
+ */
+ skb->pkt_type = PACKET_OUTGOING;
+ BUILD_BUG_ON(PACKET_OUTGOING == 0);
+}
+
/*
* Note: We dont mem charge error packets (no sk_forward_alloc changes)
*/
@@ -3701,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_rmem_free;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ skb_set_err_queue(skb);
/* before exiting rcu section, make sure dst is refcounted */
skb_dst_force(skb);
@@ -3712,21 +3738,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
- struct sk_buff *skb, *skb_next;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
unsigned long flags;
- int err = 0;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q)))
- err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ icmp_next = is_icmp_err_skb(skb_next);
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_err = err;
- if (err)
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
sk->sk_error_report(sk);
return skb;
@@ -3769,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk);
static void __skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock *sk,
- int tstype)
+ int tstype,
+ bool opt_stats)
{
struct sock_exterr_skb *serr;
int err;
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
serr->ee.ee_info = tstype;
+ serr->opt_stats = opt_stats;
+ serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk->sk_protocol == IPPROTO_TCP &&
@@ -3814,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
if (!skb_may_tx_timestamp(sk, false))
return;
- /* take a reference to prevent skb_orphan() from freeing the socket */
- sock_hold(sk);
-
- *skb_hwtstamps(skb) = *hwtstamps;
- __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
-
- sock_put(sk);
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+ sock_put(sk);
+ }
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
@@ -3829,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
struct sock *sk, int tstype)
{
struct sk_buff *skb;
- bool tsonly;
+ bool tsonly, opt_stats = false;
if (!sk)
return;
@@ -3838,10 +3878,19 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
- if (tsonly)
- skb = alloc_skb(0, GFP_ATOMIC);
- else
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
+ skb = tcp_get_timestamping_opt_stats(sk);
+ opt_stats = true;
+ } else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
if (!skb)
return;
@@ -3855,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
else
skb->tstamp = ktime_get_real();
- __skb_complete_tx_timestamp(skb, sk, tstype);
+ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
@@ -3871,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
struct sock *sk = skb->sk;
struct sock_exterr_skb *serr;
- int err;
+ int err = 1;
skb->wifi_acked_valid = 1;
skb->wifi_acked = acked;
@@ -3881,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
- /* take a reference to prevent skb_orphan() from freeing the socket */
- sock_hold(sk);
-
- err = sock_queue_err_skb(sk, skb);
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ err = sock_queue_err_skb(sk, skb);
+ sock_put(sk);
+ }
if (err)
kfree_skb(skb);
-
- sock_put(sk);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
@@ -4350,7 +4400,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
@@ -4913,3 +4963,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
return clone;
}
EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be over estimated,
+ * because skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
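
skb_condense() above pulls fragment data into the linear head when the tail room allows and then re-derives truesize from what is actually kept. The same idea in a self-contained toy form; toy_buf and its fields are invented, only echo the sk_buff bookkeeping, and are not the real API.

/* Sketch only: fold scattered payload into the main buffer, then re-account. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_buf {
    char   head[256];
    size_t linear_len;          /* bytes already in head[] */
    char  *frag;                /* scattered remainder, may be NULL */
    size_t frag_len;
    size_t truesize;            /* memory charged for this buffer */
};

static void toy_condense(struct toy_buf *b)
{
    size_t tailroom = sizeof(b->head) - b->linear_len;

    if (b->frag && b->frag_len <= tailroom) {
        memcpy(b->head + b->linear_len, b->frag, b->frag_len);
        b->linear_len += b->frag_len;
        free(b->frag);
        b->frag = NULL;
        b->frag_len = 0;
    }
    /* Re-estimate the charge from what is actually kept. */
    b->truesize = sizeof(*b);
}

int main(void)
{
    struct toy_buf b = { .linear_len = 10, .truesize = 4096 };

    b.frag = strdup("payload");
    b.frag_len = 7;
    toy_condense(&b);
    printf("linear %zu, truesize %zu\n", b.linear_len, b.truesize);
    return 0;
}
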
diff --git a/net/core/sock.c b/net/core/sock.c
index 00a074dbfe9b..2c4f574168fb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,7 +118,7 @@
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
@@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable);
/*
* Each address family might have different locking rules, so we have
- * one slock key per address family:
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
*/
static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
* locks is fast):
*/
+
+#define _sock_locks(x) \
+ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
+ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
+ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
+ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
+ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
+ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
+ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
+ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
+ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
+ x "27" , x "28" , x "AF_CAN" , \
+ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
+ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
+ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
+ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+
static const char *const af_family_key_strings[AF_MAX+1] = {
- "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
- "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
- "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
- "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
- "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
- "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
- "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
- "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
- "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
- "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
- "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
- "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
- "sk_lock-AF_MAX"
+ _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
- "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
- "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
- "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
- "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
- "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
- "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
- "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
- "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
- "slock-27" , "slock-28" , "slock-AF_CAN" ,
- "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
- "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
- "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
- "slock-AF_MAX"
+ _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
- "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
- "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
- "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
- "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
- "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
- "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
- "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
- "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
- "clock-27" , "clock-28" , "clock-AF_CAN" ,
- "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
- "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
- "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
- "clock-AF_MAX"
+ _sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+ _sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-clock-")
};
/*
@@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
* so split the lock classes by using a per-AF key:
*/
static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values, since that is non-constant across
@@ -367,7 +357,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
if (tv.tv_sec == 0 && tv.tv_usec == 0)
return 0;
if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+ *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
return 0;
}
@@ -502,6 +492,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -762,11 +753,8 @@ set_rcvbuf:
goto set_rcvbuf;
case SO_KEEPALIVE:
-#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- tcp_set_keepalive(sk, valbool);
-#endif
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
break;
@@ -854,6 +842,13 @@ set_rcvbuf:
sk->sk_tskey = 0;
}
}
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -1141,7 +1136,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1152,7 +1147,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1288,7 +1283,16 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
- sock_lock_init_class_and_name(sk,
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_kern_slock_key_strings[sk->sk_family],
+ af_family_kern_slock_keys + sk->sk_family,
+ af_family_kern_key_strings[sk->sk_family],
+ af_family_kern_keys + sk->sk_family);
+ else
+ sock_lock_init_class_and_name(
+ sk,
af_family_slock_key_strings[sk->sk_family],
af_family_slock_keys + sk->sk_family,
af_family_key_strings[sk->sk_family],
@@ -1394,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
+ sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt))
@@ -1437,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head)
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
@@ -1515,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
af_family_clock_key_strings[newsk->sk_family]);
newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
atomic_set(&newsk->sk_drops, 0);
@@ -1533,11 +1544,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
is_charged = sk_filter_charge(newsk, filter);
if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- bh_unlock_sock(newsk);
- sk_free(newsk);
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
newsk = NULL;
goto out;
}
@@ -1586,6 +1599,16 @@ out:
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
+void sk_free_unlock_clone(struct sock *sk)
+{
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ sk->sk_destruct = NULL;
+ bh_unlock_sock(sk);
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
@@ -2080,37 +2103,31 @@ void __sk_flush_backlog(struct sock *sk)
*/
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc;
- DEFINE_WAIT(wait);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
EXPORT_SYMBOL(sk_wait_data);
/**
- * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * __sk_mem_raise_allocated - increase memory_allocated
* @sk: socket
* @size: memory size to allocate
+ * @amt: pages to allocate
* @kind: allocation type
*
- * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
- * rmem allocation. This function assumes that protocols which have
- * memory_pressure use sk_wmem_queued as write buffer accounting.
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
*/
-int __sk_mem_schedule(struct sock *sk, int size, int kind)
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
- int amt = sk_mem_pages(size);
- long allocated;
-
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-
- allocated = sk_memory_allocated_add(sk, amt);
+ long allocated = sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
!mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2171,9 +2188,6 @@ suppress_allocation:
trace_sock_exceed_buf_limit(sk, prot, allocated);
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-
sk_memory_allocated_sub(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2181,18 +2195,40 @@ suppress_allocation:
return 0;
}
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ return ret;
+}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_mem_reclaim - reclaim memory_allocated
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
*/
-void __sk_mem_reclaim(struct sock *sk, int amount)
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
sk_memory_allocated_sub(sk, amount);
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2201,6 +2237,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
@@ -2239,7 +2288,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
return -EOPNOTSUPP;
}
@@ -2436,11 +2486,21 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
@@ -2738,11 +2798,6 @@ void sk_common_release(struct sock *sk)
sk_refcnt_debug_release(sk);
- if (sk->sk_frag.page) {
- put_page(sk->sk_frag.page);
- sk->sk_frag.page = NULL;
- }
-
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
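
The memory-accounting split above lets __sk_mem_schedule() charge the per-socket forward_alloc quota optimistically and undo that charge only if __sk_mem_raise_allocated() fails to raise the global counter. A standalone sketch of that charge-then-rollback shape; the quota variables, page quantum and global limit are invented for illustration and do not model memcg or pressure handling.

/* Sketch only: optimistic local charge, global charge, rollback on failure. */
#include <stdbool.h>
#include <stdio.h>

#define QUANTUM      4096
#define GLOBAL_LIMIT 8                   /* pages, arbitrarily small */

static long global_pages;
static long forward_alloc;               /* per-socket byte quota */

static bool raise_allocated(int pages)   /* global accounting only */
{
    if (global_pages + pages > GLOBAL_LIMIT)
        return false;
    global_pages += pages;
    return true;
}

static bool mem_schedule(int bytes)
{
    int pages = (bytes + QUANTUM - 1) / QUANTUM;

    forward_alloc += (long)pages * QUANTUM;          /* optimistic charge */
    if (!raise_allocated(pages)) {
        forward_alloc -= (long)pages * QUANTUM;      /* undo on failure */
        return false;
    }
    return true;
}

int main(void)
{
    bool ok;

    ok = mem_schedule(10000);
    printf("small: %d, forward_alloc=%ld\n", ok, forward_alloc);
    ok = mem_schedule(1 << 20);
    printf("huge:  %d, forward_alloc=%ld\n", ok, forward_alloc);
    return 0;
}
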
diff --git a/net/core/stream.c b/net/core/stream.c
index 1086c8b280a8..20231dbb1da0 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -13,6 +13,7 @@
*/
#include <linux/module.h>
+#include <linux/sched/signal.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
@@ -53,8 +54,8 @@ void sk_stream_write_space(struct sock *sk)
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
int done;
do {
@@ -68,13 +69,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk_sleep(sk), &wait);
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
@@ -94,16 +95,16 @@ static inline int sk_stream_closing(struct sock *sk)
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
@@ -119,16 +120,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
long vm_wait = 0;
long current_timeo = *timeo_p;
bool noblock = (*timeo_p ? false : true);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ add_wait_queue(sk_sleep(sk), &wait);
+
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p) {
@@ -147,7 +148,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
- !vm_wait));
+ !vm_wait), &wait);
sk->sk_write_pending--;
if (vm_wait) {
@@ -161,7 +162,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
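
The DEFINE_WAIT_FUNC()/woken_wake_function conversions in sock.c and stream.c keep the same overall shape: register the waiter once, sleep inside a loop that re-checks the condition, and deregister once on exit. Purely as an analogy, and not the kernel wait-queue API, the same shape expressed with POSIX condition variables; the flag and names are invented.

/* Sketch only: register once, re-check in a loop, deregister once. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool ready = true;                  /* pretend the event already fired */

static void wait_for_ready(void)
{
    pthread_mutex_lock(&lock);             /* rough add_wait_queue() analogue */
    while (!ready)
        pthread_cond_wait(&cond, &lock);   /* sleep, re-check on every wakeup */
    pthread_mutex_unlock(&lock);           /* rough remove_wait_queue() analogue */
}

int main(void)
{
    wait_for_ready();
    puts("condition met");
    return 0;
}
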
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..7f9cc400eca0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- if (sock_table)
+ if (sock_table) {
static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
@@ -219,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret != 0)
+ return ret;
+
+ dev_rx_weight = weight_p * dev_weight_rx_bias;
+ dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+ return ret;
+}
+
static int proc_do_rss_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -270,7 +288,21 @@ static struct ctl_table net_core_table[] = {
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
},
{
.procname = "netdev_max_backlog",
@@ -302,6 +334,13 @@ static struct ctl_table net_core_table[] = {
.mode = 0600,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "bpf_jit_kallsyms",
+ .data = &bpf_jit_kallsyms,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec,
+ },
# endif
#endif
{
@@ -369,14 +408,16 @@ static struct ctl_table net_core_table[] = {
.data = &sysctl_net_busy_poll,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "busy_read",
.data = &sysctl_net_busy_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
#endif
#ifdef CONFIG_NET_SCHED
diff --git a/net/core/utils.c b/net/core/utils.c
index cf5622b9ccc4..6592d7bbed39 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -31,7 +31,7 @@
#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*