From be8f8284cd897af2482d4e54fbc2bdfc15557259 Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Mon, 20 Nov 2017 19:26:02 +0900
Subject: net: xfrm: allow clearing socket xfrm policies.

Currently it is possible to add or update socket policies, but
not clear them. Therefore, once a socket policy has been applied,
the socket cannot be used for unencrypted traffic.

This patch allows (privileged) users to clear socket policies by
passing in a NULL pointer and zero length argument to the
{IP,IPV6}_{IPSEC,XFRM}_POLICY setsockopts. This results in both
the incoming and outgoing policies being cleared.

The simple approach taken in this patch cannot clear socket
policies in only one direction. If desired this could be added
in the future, for example by continuing to pass in a length of
zero (which currently is guaranteed to return EMSGSIZE) and
making the policy be a pointer to an integer that contains one
of the XFRM_POLICY_{IN,OUT} enum values.

An alternative would have been to interpret the length as a
signed integer and use XFRM_POLICY_IN (i.e., 0) to clear the
input policy and -XFRM_POLICY_OUT (i.e., -1) to clear the output
policy.

Tested: https://android-review.googlesource.com/539816
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 +-
 net/xfrm/xfrm_state.c  | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..3263662fb20a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete);
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 {
-	struct net *net = xp_net(pol);
+	struct net *net = sock_net(sk);
 	struct xfrm_policy *old_pol;
 
 #ifdef CONFIG_XFRM_SUB_POLICY
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 065d89606888..1b7856be3eeb 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2048,6 +2048,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
 	struct xfrm_mgr *km;
 	struct xfrm_policy *pol = NULL;
 
+	if (!optval && !optlen) {
+		xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
+		xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
+		__sk_dst_reset(sk);
+		return 0;
+	}
+
 	if (optlen <= 0 || optlen > PAGE_SIZE)
 		return -EMSGSIZE;
 
-- 
cgit v1.2.3


From 311af51dcb5629f04976a8e451673f77e3301041 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 27 Nov 2017 12:41:38 +0100
Subject: openvswitch: use ktime_get_ts64() instead of ktime_get_ts()

timespec is deprecated because of the y2038 overflow, so let's convert
this one to ktime_get_ts64(). The code is already safe even on 32-bit
architectures, since it uses monotonic times. On 64-bit architectures,
nothing changes, while on 32-bit architectures this avoids one
type conversion.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index dbe2379329c5..76d050aba7a4 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -56,12 +56,12 @@
 
 u64 ovs_flow_used_time(unsigned long flow_jiffies)
 {
-	struct timespec cur_ts;
+	struct timespec64 cur_ts;
 	u64 cur_ms, idle_ms;
 
-	ktime_get_ts(&cur_ts);
+	ktime_get_ts64(&cur_ts);
 	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
-	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+	cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC +
 		 cur_ts.tv_nsec / NSEC_PER_MSEC;
 
 	return cur_ms - idle_ms;
-- 
cgit v1.2.3


From d750dbdc071bf863112ea83c64cd7c44d4bad261 Mon Sep 17 00:00:00 2001
From: Tina Ruchandani <ruchandani.tina@gmail.com>
Date: Mon, 27 Nov 2017 15:02:17 +0100
Subject: atm: mpoa: remove 32-bit timekeeping

net/atm/mpoa_* files use 'struct timeval' to store event
timestamps. struct timeval uses a 32-bit seconds field which will
overflow in the year 2038 and beyond. Moreover, the timestamps are being
compared only to get seconds elapsed, so struct timeval which stores
a seconds and microseconds field is overkill. This patch replaces
the use of struct timeval with time64_t to store a 64-bit seconds field.

Signed-off-by: Tina Ruchandani <ruchandani.tina@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/common.c      |  2 +-
 net/atm/mpc.c         |  9 +++++----
 net/atm/mpoa_caches.c | 48 +++++++++++++++++++++++-------------------------
 net/atm/mpoa_caches.h |  9 +++++----
 net/atm/mpoa_proc.c   | 15 +++++++++------
 5 files changed, 43 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/atm/common.c b/net/atm/common.c
index 8a4f99114cd2..5763fd241dc3 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -14,7 +14,7 @@
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
-#include <linux/time.h>		/* struct timeval */
+#include <linux/time64.h>	/* 64-bit time for seconds */
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
 #include <linux/init.h>
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 7c6a1cc760a2..31e0dcb970f8 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1089,7 +1089,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 		msg->type = SND_MPOA_RES_RQST;
 		msg->content.in_info = entry->ctrl_info;
 		msg_to_mpoad(msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 		mpc->in_ops->put(entry);
 		return;
 	}
@@ -1099,7 +1099,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 		msg->type = SND_MPOA_RES_RQST;
 		msg->content.in_info = entry->ctrl_info;
 		msg_to_mpoad(msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 		mpc->in_ops->put(entry);
 		return;
 	}
@@ -1175,8 +1175,9 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 	}
 
 	entry->ctrl_info = msg->content.in_info;
-	do_gettimeofday(&(entry->tv));
-	do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+	entry->time = ktime_get_seconds();
+	/* Used in refreshing func from now on */
+	entry->reply_wait = ktime_get_seconds();
 	entry->refresh_time = 0;
 	ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
 
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index e01450bb32d6..4bb418313720 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -117,7 +117,7 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
 
 	memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
 	entry->ctrl_info.in_dst_ip = dst_ip;
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->retry_time = client->parameters.mpc_p4;
 	entry->count = 1;
 	entry->entry_state = INGRESS_INVALID;
@@ -148,7 +148,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
 			if (qos != NULL)
 				msg.qos = qos->qos;
 			msg_to_mpoad(&msg, mpc);
-			do_gettimeofday(&(entry->reply_wait));
+			entry->reply_wait = ktime_get_seconds();
 			entry->entry_state = INGRESS_RESOLVING;
 		}
 		if (entry->shortcut != NULL)
@@ -171,7 +171,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
 		if (qos != NULL)
 			msg.qos = qos->qos;
 		msg_to_mpoad(&msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 	}
 
 	return CLOSED;
@@ -227,17 +227,16 @@ static void in_cache_remove_entry(in_cache_entry *entry,
 static void clear_count_and_expired(struct mpoa_client *client)
 {
 	in_cache_entry *entry, *next_entry;
-	struct timeval now;
+	time64_t now;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	write_lock_bh(&client->ingress_lock);
 	entry = client->in_cache;
 	while (entry != NULL) {
 		entry->count = 0;
 		next_entry = entry->next;
-		if ((now.tv_sec - entry->tv.tv_sec)
-		   > entry->ctrl_info.holding_time) {
+		if ((now - entry->time) > entry->ctrl_info.holding_time) {
 			dprintk("holding time expired, ip = %pI4\n",
 				&entry->ctrl_info.in_dst_ip);
 			client->in_ops->remove_entry(entry, client);
@@ -253,35 +252,35 @@ static void check_resolving_entries(struct mpoa_client *client)
 
 	struct atm_mpoa_qos *qos;
 	in_cache_entry *entry;
-	struct timeval now;
+	time64_t now;
 	struct k_message msg;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	read_lock_bh(&client->ingress_lock);
 	entry = client->in_cache;
 	while (entry != NULL) {
 		if (entry->entry_state == INGRESS_RESOLVING) {
-			if ((now.tv_sec - entry->hold_down.tv_sec) <
-			    client->parameters.mpc_p6) {
+
+			if ((now - entry->hold_down)
+					< client->parameters.mpc_p6) {
 				entry = entry->next;	/* Entry in hold down */
 				continue;
 			}
-			if ((now.tv_sec - entry->reply_wait.tv_sec) >
-			    entry->retry_time) {
+			if ((now - entry->reply_wait) > entry->retry_time) {
 				entry->retry_time = MPC_C1 * (entry->retry_time);
 				/*
 				 * Retry time maximum exceeded,
 				 * put entry in hold down.
 				 */
 				if (entry->retry_time > client->parameters.mpc_p5) {
-					do_gettimeofday(&(entry->hold_down));
+					entry->hold_down = ktime_get_seconds();
 					entry->retry_time = client->parameters.mpc_p4;
 					entry = entry->next;
 					continue;
 				}
 				/* Ask daemon to send a resolution request. */
-				memset(&(entry->hold_down), 0, sizeof(struct timeval));
+				memset(&entry->hold_down, 0, sizeof(time64_t));
 				msg.type = SND_MPOA_RES_RTRY;
 				memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
 				msg.content.in_info = entry->ctrl_info;
@@ -289,7 +288,7 @@ static void check_resolving_entries(struct mpoa_client *client)
 				if (qos != NULL)
 					msg.qos = qos->qos;
 				msg_to_mpoad(&msg, client);
-				do_gettimeofday(&(entry->reply_wait));
+				entry->reply_wait = ktime_get_seconds();
 			}
 		}
 		entry = entry->next;
@@ -300,18 +299,18 @@ static void check_resolving_entries(struct mpoa_client *client)
 /* Call this every MPC-p5 seconds. */
 static void refresh_entries(struct mpoa_client *client)
 {
-	struct timeval now;
+	time64_t now;
 	struct in_cache_entry *entry = client->in_cache;
 
 	ddprintk("refresh_entries\n");
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	read_lock_bh(&client->ingress_lock);
 	while (entry != NULL) {
 		if (entry->entry_state == INGRESS_RESOLVED) {
 			if (!(entry->refresh_time))
 				entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
-			if ((now.tv_sec - entry->reply_wait.tv_sec) >
+			if ((now - entry->reply_wait) >
 			    entry->refresh_time) {
 				dprintk("refreshing an entry.\n");
 				entry->entry_state = INGRESS_REFRESHING;
@@ -480,7 +479,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
 
 	memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
 	entry->ctrl_info = msg->content.eg_info;
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->entry_state = EGRESS_RESOLVED;
 	dprintk("new_eg_cache_entry cache_id %u\n",
 		ntohl(entry->ctrl_info.cache_id));
@@ -495,7 +494,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
 
 static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
 {
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->entry_state = EGRESS_RESOLVED;
 	entry->ctrl_info.holding_time = holding_time;
 }
@@ -503,17 +502,16 @@ static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
 static void clear_expired(struct mpoa_client *client)
 {
 	eg_cache_entry *entry, *next_entry;
-	struct timeval now;
+	time64_t now;
 	struct k_message msg;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	write_lock_irq(&client->egress_lock);
 	entry = client->eg_cache;
 	while (entry != NULL) {
 		next_entry = entry->next;
-		if ((now.tv_sec - entry->tv.tv_sec)
-		   > entry->ctrl_info.holding_time) {
+		if ((now - entry->time) > entry->ctrl_info.holding_time) {
 			msg.type = SND_EGRESS_PURGE;
 			msg.content.eg_info = entry->ctrl_info;
 			dprintk("egress_cache: holding time expired, cache_id = %u.\n",
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
index 6a266669ebf4..464c4c7f8d1f 100644
--- a/net/atm/mpoa_caches.h
+++ b/net/atm/mpoa_caches.h
@@ -2,6 +2,7 @@
 #ifndef MPOA_CACHES_H
 #define MPOA_CACHES_H
 
+#include <linux/time64.h>
 #include <linux/netdevice.h>
 #include <linux/types.h>
 #include <linux/atm.h>
@@ -16,9 +17,9 @@ void atm_mpoa_init_cache(struct mpoa_client *mpc);
 typedef struct in_cache_entry {
 	struct in_cache_entry *next;
 	struct in_cache_entry *prev;
-	struct timeval  tv;
-	struct timeval  reply_wait;
-	struct timeval  hold_down;
+	time64_t  time;
+	time64_t  reply_wait;
+	time64_t  hold_down;
 	uint32_t  packets_fwded;
 	uint16_t  entry_state;
 	uint32_t retry_time;
@@ -53,7 +54,7 @@ struct in_cache_ops{
 typedef struct eg_cache_entry{
 	struct               eg_cache_entry *next;
 	struct               eg_cache_entry *prev;
-	struct               timeval  tv;
+	time64_t	     time;
 	uint8_t              MPS_ctrl_ATM_addr[ATM_ESA_LEN];
 	struct atm_vcc       *shortcut;
 	uint32_t             packets_rcvd;
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 8a0c17e1c203..2212da9c2da2 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/atmmpc.h>
@@ -138,7 +138,7 @@ static int mpc_show(struct seq_file *m, void *v)
 	int i;
 	in_cache_entry *in_entry;
 	eg_cache_entry *eg_entry;
-	struct timeval now;
+	time64_t now;
 	unsigned char ip_string[16];
 
 	if (v == SEQ_START_TOKEN) {
@@ -148,15 +148,17 @@ static int mpc_show(struct seq_file *m, void *v)
 
 	seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
 	seq_printf(m, "Ingress Entries:\nIP address      State      Holding time  Packets fwded  VPI  VCI\n");
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
+		unsigned long seconds_delta = now - in_entry->time;
+
 		sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
 		seq_printf(m, "%-16s%s%-14lu%-12u",
 			   ip_string,
 			   ingress_state_string(in_entry->entry_state),
 			   in_entry->ctrl_info.holding_time -
-			   (now.tv_sec-in_entry->tv.tv_sec),
+			   seconds_delta,
 			   in_entry->packets_fwded);
 		if (in_entry->shortcut)
 			seq_printf(m, "   %-3d  %-3d",
@@ -169,13 +171,14 @@ static int mpc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id        State      Holding time  Packets recvd  Latest IP addr   VPI VCI\n");
 	for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
 		unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
+		unsigned long seconds_delta = now - eg_entry->time;
+
 		for (i = 0; i < ATM_ESA_LEN; i++)
 			seq_printf(m, "%02x", p[i]);
 		seq_printf(m, "\n%-16lu%s%-14lu%-15u",
 			   (unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
 			   egress_state_string(eg_entry->entry_state),
-			   (eg_entry->ctrl_info.holding_time -
-			    (now.tv_sec-eg_entry->tv.tv_sec)),
+			   (eg_entry->ctrl_info.holding_time - seconds_delta),
 			   eg_entry->packets_rcvd);
 
 		/* latest IP address */
-- 
cgit v1.2.3


From fe736e778c604dee8f02d3381dfadba8d621605e Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:08 -0500
Subject: decnet: Move dn_next into decnet route structure.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/dn_route.h |  1 +
 include/net/dst.h      |  1 -
 net/decnet/dn_route.c  | 34 ++++++++++++++++++----------------
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/net/dn_route.h b/include/net/dn_route.h
index 55df9939bca2..342d2503cba5 100644
--- a/include/net/dn_route.h
+++ b/include/net/dn_route.h
@@ -69,6 +69,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev,
  */
 struct dn_route {
 	struct dst_entry dst;
+	struct dn_route __rcu *dn_next;
 
 	struct neighbour *n;
 
diff --git a/include/net/dst.h b/include/net/dst.h
index 349374750ee7..cc7e8166be0d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -102,7 +102,6 @@ struct dst_entry {
 	union {
 		struct dst_entry	*next;
 		struct rt6_info __rcu	*rt6_next;
-		struct dn_route __rcu	*dn_next;
 	};
 };
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 324cb9f2f551..4b3ca70be723 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -199,11 +199,11 @@ static void dn_dst_check_expire(struct timer_list *unused)
 						lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
 			if (atomic_read(&rt->dst.__refcnt) > 1 ||
 			    (now - rt->dst.lastuse) < expire) {
-				rtp = &rt->dst.dn_next;
+				rtp = &rt->dn_next;
 				continue;
 			}
-			*rtp = rt->dst.dn_next;
-			rt->dst.dn_next = NULL;
+			*rtp = rt->dn_next;
+			rt->dn_next = NULL;
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 		}
@@ -233,11 +233,11 @@ static int dn_dst_gc(struct dst_ops *ops)
 						lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
 			if (atomic_read(&rt->dst.__refcnt) > 1 ||
 			    (now - rt->dst.lastuse) < expire) {
-				rtp = &rt->dst.dn_next;
+				rtp = &rt->dn_next;
 				continue;
 			}
-			*rtp = rt->dst.dn_next;
-			rt->dst.dn_next = NULL;
+			*rtp = rt->dn_next;
+			rt->dn_next = NULL;
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 			break;
@@ -333,8 +333,8 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 						lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
 		if (compare_keys(&rth->fld, &rt->fld)) {
 			/* Put it first */
-			*rthp = rth->dst.dn_next;
-			rcu_assign_pointer(rth->dst.dn_next,
+			*rthp = rth->dn_next;
+			rcu_assign_pointer(rth->dn_next,
 					   dn_rt_hash_table[hash].chain);
 			rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
 
@@ -345,10 +345,10 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 			*rp = rth;
 			return 0;
 		}
-		rthp = &rth->dst.dn_next;
+		rthp = &rth->dn_next;
 	}
 
-	rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain);
+	rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain);
 	rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
 
 	dst_hold_and_use(&rt->dst, now);
@@ -369,8 +369,8 @@ static void dn_run_flush(struct timer_list *unused)
 			goto nothing_to_declare;
 
 		for(; rt; rt = next) {
-			next = rcu_dereference_raw(rt->dst.dn_next);
-			RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+			next = rcu_dereference_raw(rt->dn_next);
+			RCU_INIT_POINTER(rt->dn_next, NULL);
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 		}
@@ -1183,6 +1183,7 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
+	rt->dn_next = NULL;
 	memset(&rt->fld, 0, sizeof(rt->fld));
 	rt->fld.saddr        = oldflp->saddr;
 	rt->fld.daddr        = oldflp->daddr;
@@ -1252,7 +1253,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *
 	if (!(flags & MSG_TRYHARD)) {
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
-			rt = rcu_dereference_bh(rt->dst.dn_next)) {
+			rt = rcu_dereference_bh(rt->dn_next)) {
 			if ((flp->daddr == rt->fld.daddr) &&
 			    (flp->saddr == rt->fld.saddr) &&
 			    (flp->flowidn_mark == rt->fld.flowidn_mark) &&
@@ -1448,6 +1449,7 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
+	rt->dn_next = NULL;
 	memset(&rt->fld, 0, sizeof(rt->fld));
 	rt->rt_saddr      = fld.saddr;
 	rt->rt_daddr      = fld.daddr;
@@ -1529,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb)
 
 	rcu_read_lock();
 	for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
-	    rt = rcu_dereference(rt->dst.dn_next)) {
+	    rt = rcu_dereference(rt->dn_next)) {
 		if ((rt->fld.saddr == cb->src) &&
 		    (rt->fld.daddr == cb->dst) &&
 		    (rt->fld.flowidn_oif == 0) &&
@@ -1749,7 +1751,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		rcu_read_lock_bh();
 		for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
 			rt;
-			rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
+			rt = rcu_dereference_bh(rt->dn_next), idx++) {
 			if (idx < s_idx)
 				continue;
 			skb_dst_set(skb, dst_clone(&rt->dst));
@@ -1795,7 +1797,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou
 {
 	struct dn_rt_cache_iter_state *s = seq->private;
 
-	rt = rcu_dereference_bh(rt->dst.dn_next);
+	rt = rcu_dereference_bh(rt->dn_next);
 	while (!rt) {
 		rcu_read_unlock_bh();
 		if (--s->bucket < 0)
-- 
cgit v1.2.3


From 071fb37ec43dcd88937a669c5f97bd37f7d29dea Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:15 -0500
Subject: ipv6: Move rt6_next from dst_entry into ipv6 route structure.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/dst.h     |  1 -
 include/net/ip6_fib.h |  5 +++--
 net/ipv6/ip6_fib.c    | 26 +++++++++++++-------------
 net/ipv6/route.c      | 10 +++++-----
 4 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index cc7e8166be0d..acbb3fb89c4d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -101,7 +101,6 @@ struct dst_entry {
 	struct lwtunnel_state   *lwtstate;
 	union {
 		struct dst_entry	*next;
-		struct rt6_info __rcu	*rt6_next;
 	};
 };
 
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 10c913816032..281a922f0c62 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -129,6 +129,7 @@ struct rt6_exception {
 
 struct rt6_info {
 	struct dst_entry		dst;
+	struct rt6_info __rcu		*rt6_next;
 
 	/*
 	 * Tail elements of dst_entry (__refcnt etc.)
@@ -176,11 +177,11 @@ struct rt6_info {
 
 #define for_each_fib6_node_rt_rcu(fn)					\
 	for (rt = rcu_dereference((fn)->leaf); rt;			\
-	     rt = rcu_dereference(rt->dst.rt6_next))
+	     rt = rcu_dereference(rt->rt6_next))
 
 #define for_each_fib6_walker_rt(w)					\
 	for (rt = (w)->leaf; rt;					\
-	     rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
+	     rt = rcu_dereference_protected(rt->rt6_next, 1))
 
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f5285f4e1d08..c43cbaedfa35 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -893,7 +893,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	ins = &fn->leaf;
 
 	for (iter = leaf; iter;
-	     iter = rcu_dereference_protected(iter->dst.rt6_next,
+	     iter = rcu_dereference_protected(iter->rt6_next,
 				lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
 		/*
 		 *	Search for duplicates
@@ -950,7 +950,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			break;
 
 next_iter:
-		ins = &iter->dst.rt6_next;
+		ins = &iter->rt6_next;
 	}
 
 	if (fallback_ins && !found) {
@@ -979,7 +979,7 @@ next_iter:
 					      &sibling->rt6i_siblings);
 				break;
 			}
-			sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+			sibling = rcu_dereference_protected(sibling->rt6_next,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
 		}
 		/* For each sibling in the list, increment the counter of
@@ -1009,7 +1009,7 @@ add:
 		if (err)
 			return err;
 
-		rcu_assign_pointer(rt->dst.rt6_next, iter);
+		rcu_assign_pointer(rt->rt6_next, iter);
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
 		rcu_assign_pointer(*ins, rt);
@@ -1040,7 +1040,7 @@ add:
 
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
-		rt->dst.rt6_next = iter->dst.rt6_next;
+		rt->rt6_next = iter->rt6_next;
 		rcu_assign_pointer(*ins, rt);
 		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
 					  rt, extack);
@@ -1059,14 +1059,14 @@ add:
 
 		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
-			ins = &rt->dst.rt6_next;
+			ins = &rt->rt6_next;
 			iter = rcu_dereference_protected(*ins,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
 			while (iter) {
 				if (iter->rt6i_metric > rt->rt6i_metric)
 					break;
 				if (rt6_qualify_for_ecmp(iter)) {
-					*ins = iter->dst.rt6_next;
+					*ins = iter->rt6_next;
 					iter->rt6i_node = NULL;
 					fib6_purge_rt(iter, fn, info->nl_net);
 					if (rcu_access_pointer(fn->rr_ptr) == iter)
@@ -1075,7 +1075,7 @@ add:
 					nsiblings--;
 					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
 				} else {
-					ins = &iter->dst.rt6_next;
+					ins = &iter->rt6_next;
 				}
 				iter = rcu_dereference_protected(*ins,
 					lockdep_is_held(&rt->rt6i_table->tb6_lock));
@@ -1644,7 +1644,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
 
 	/* Unlink it */
-	*rtp = rt->dst.rt6_next;
+	*rtp = rt->rt6_next;
 	rt->rt6i_node = NULL;
 	net->ipv6.rt6_stats->fib_rt_entries--;
 	net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1672,7 +1672,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
-			w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+			w->leaf = rcu_dereference_protected(rt->rt6_next,
 					    lockdep_is_held(&table->tb6_lock));
 			if (!w->leaf)
 				w->state = FWS_U;
@@ -1731,7 +1731,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 			fib6_del_route(table, fn, rtp, info);
 			return 0;
 		}
-		rtp_next = &cur->dst.rt6_next;
+		rtp_next = &cur->rt6_next;
 	}
 	return -ENOENT;
 }
@@ -2208,7 +2208,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
 
 	do {
 		iter->w.leaf = rcu_dereference_protected(
-				iter->w.leaf->dst.rt6_next,
+				iter->w.leaf->rt6_next,
 				lockdep_is_held(&iter->tbl->tb6_lock));
 		iter->skip--;
 		if (!iter->skip && iter->w.leaf)
@@ -2274,7 +2274,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (!v)
 		goto iter_table;
 
-	n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next);
+	n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
 	if (n) {
 		++*pos;
 		return n;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7a8d1500d374..22c5e70361d6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -502,7 +502,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 	if (!oif && ipv6_addr_any(saddr))
 		goto out;
 
-	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
+	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
 		struct net_device *dev = sprt->dst.dev;
 
 		if (oif) {
@@ -721,7 +721,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
 	match = NULL;
 	cont = NULL;
-	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
+	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
 		if (rt->rt6i_metric != metric) {
 			cont = rt;
 			break;
@@ -731,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	}
 
 	for (rt = leaf; rt && rt != rr_head;
-	     rt = rcu_dereference(rt->dst.rt6_next)) {
+	     rt = rcu_dereference(rt->rt6_next)) {
 		if (rt->rt6i_metric != metric) {
 			cont = rt;
 			break;
@@ -743,7 +743,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	if (match || !cont)
 		return match;
 
-	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
+	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
 	return match;
@@ -781,7 +781,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 			     &do_rr);
 
 	if (do_rr) {
-		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
+		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
 
 		/* no entries matched; do round-robin */
 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
-- 
cgit v1.2.3


From b92cf4aab8e688b1bd501ac2ac4f1b5c99601e3b Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:22 -0500
Subject: net: Create and use new helper xfrm_dst_child().

Only IPSEC routes have a non-NULL dst->child pointer.  And IPSEC
routes are identified by a non-NULL dst->xfrm pointer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h           |  9 +++++++++
 net/core/dst.c               |  8 +++++---
 net/ipv4/xfrm4_mode_tunnel.c |  2 +-
 net/ipv6/xfrm6_mode_tunnel.c |  2 +-
 net/ipv6/xfrm6_policy.c      |  2 +-
 net/xfrm/xfrm_output.c       |  2 +-
 net/xfrm/xfrm_policy.c       | 14 +++++++-------
 security/selinux/xfrm.c      |  2 +-
 8 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dc28a98ce97c..4021b49a6ce3 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -994,6 +994,15 @@ struct xfrm_dst {
 	u32 path_cookie;
 };
 
+static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
+{
+#ifdef CONFIG_XFRM
+	if (dst->xfrm)
+		return dst->child;
+#endif
+	return NULL;
+}
+
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
diff --git a/net/core/dst.c b/net/core/dst.c
index 662a2d4a3d19..6a3c21b8fc8d 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -116,12 +116,14 @@ EXPORT_SYMBOL(dst_alloc);
 
 struct dst_entry *dst_destroy(struct dst_entry * dst)
 {
-	struct dst_entry *child;
+	struct dst_entry *child = NULL;
 
 	smp_rmb();
 
-	child = dst->child;
-
+#ifdef CONFIG_XFRM
+	if (dst->xfrm)
+		child = dst->child;
+#endif
 	if (!(dst->flags & DST_NOCOUNT))
 		dst_entries_add(dst->ops, -1);
 
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e6265e2c274e..7d885a44dc9d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
 		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
 
-	top_iph->ttl = ip4_dst_hoplimit(dst->child);
+	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
 
 	top_iph->saddr = x->props.saddr.a4;
 	top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 02556e356f87..e66b94f46532 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	if (x->props.flags & XFRM_STATE_NOECN)
 		dsfield &= ~INET_ECN_MASK;
 	ipv6_change_dsfield(top_iph, 0, dsfield);
-	top_iph->hop_limit = ip6_dst_hoplimit(dst->child);
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
 	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
 	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
 	return 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 885ade234a49..09fb44ee3b45 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			in6_dev_put(xdst->u.rt6.rt6i_idev);
 			xdst->u.rt6.rt6i_idev = loopback_idev;
 			in6_dev_hold(loopback_idev);
-			xdst = (struct xfrm_dst *)xdst->u.dst.child;
+			xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst);
 		} while (xdst->u.dst.xfrm);
 
 		__in6_dev_put(loopback_idev);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 73ad8c8ef344..23468672a767 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb)
 
 static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
 {
-	struct dst_entry *child = dst_clone(skb_dst(skb)->child);
+	struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb)));
 
 	skb_dst_drop(skb);
 	return child;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..6c21a458bc8b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1636,7 +1636,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
 	xfrm_init_pmtu(dst_prev);
 
-	for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
+	for (dst_prev = dst0; dst_prev != dst; dst_prev = xfrm_dst_child(dst_prev)) {
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
 
 		err = xfrm_fill_dst(xdst, dev, fl);
@@ -1800,7 +1800,7 @@ static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
 	for (i = 0; i < num; i++) {
 		if (!dst || dst->xfrm != xfrm[i])
 			return false;
-		dst = dst->child;
+		dst = xfrm_dst_child(dst);
 	}
 
 	return xfrm_bundle_ok(xdst);
@@ -2576,7 +2576,7 @@ static int stale_bundle(struct dst_entry *dst)
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 {
-	while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
+	while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
 		dst->dev = dev_net(dev)->loopback_dev;
 		dev_hold(dst->dev);
 		dev_put(dev);
@@ -2606,7 +2606,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 		u32 pmtu, route_mtu_cached;
 
-		pmtu = dst_mtu(dst->child);
+		pmtu = dst_mtu(xfrm_dst_child(dst));
 		xdst->child_mtu_cached = pmtu;
 
 		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
@@ -2651,7 +2651,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
-		mtu = dst_mtu(dst->child);
+		mtu = dst_mtu(xfrm_dst_child(dst));
 		if (xdst->child_mtu_cached != mtu) {
 			last = xdst;
 			xdst->child_mtu_cached = mtu;
@@ -2665,7 +2665,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 			xdst->route_mtu_cached = mtu;
 		}
 
-		dst = dst->child;
+		dst = xfrm_dst_child(dst);
 	} while (dst->xfrm);
 
 	if (likely(!last))
@@ -2707,7 +2707,7 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
 {
 	const struct dst_entry *path = dst->path;
 
-	for (; dst != path; dst = dst->child) {
+	for (; dst != path; dst = xfrm_dst_child(dst)) {
 		const struct xfrm_state *xfrm = dst->xfrm;
 
 		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index 56e354fcdfc6..928188902901 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -452,7 +452,7 @@ int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb,
 	if (dst) {
 		struct dst_entry *iter;
 
-		for (iter = dst; iter != NULL; iter = iter->child) {
+		for (iter = dst; iter != NULL; iter = xfrm_dst_child(iter)) {
 			struct xfrm_state *x = iter->xfrm;
 
 			if (x && selinux_authorizable_xfrm(x))
-- 
cgit v1.2.3


From 45b018beddb631fb9a0ecbc3ba103521b03c4c80 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:28 -0500
Subject: ipsec: Create and use new helpers for dst child access.

This will make a future change moving the dst->child pointer less
invasive.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/xfrm.h     |  5 +++++
 net/xfrm/xfrm_policy.c | 47 +++++++++++++++++++++++------------------------
 2 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4021b49a6ce3..4c08eb0d46ce 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1004,6 +1004,11 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
 }
 
 #ifdef CONFIG_XFRM
+static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child)
+{
+	xdst->u.dst.child = child;
+}
+
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
 	xfrm_pols_put(xdst->pols, xdst->num_pols);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6c21a458bc8b..58b3ecba2e41 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1546,8 +1546,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	unsigned long now = jiffies;
 	struct net_device *dev;
 	struct xfrm_mode *inner_mode;
-	struct dst_entry *dst_prev = NULL;
-	struct dst_entry *dst0 = NULL;
+	struct xfrm_dst *xdst_prev = NULL;
+	struct xfrm_dst *xdst0 = NULL;
 	int i = 0;
 	int err;
 	int header_len = 0;
@@ -1573,13 +1573,13 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			goto put_states;
 		}
 
-		if (!dst_prev)
-			dst0 = dst1;
+		if (!xdst_prev)
+			xdst0 = xdst;
 		else
 			/* Ref count is taken during xfrm_alloc_dst()
 			 * No need to do dst_clone() on dst1
 			 */
-			dst_prev->child = dst1;
+			xfrm_dst_set_child(xdst_prev, &xdst->u.dst);
 
 		if (xfrm[i]->sel.family == AF_UNSPEC) {
 			inner_mode = xfrm_ip2inner_mode(xfrm[i],
@@ -1616,8 +1616,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst1->input = dst_discard;
 		dst1->output = inner_mode->afinfo->output;
 
-		dst1->next = dst_prev;
-		dst_prev = dst1;
+		dst1->next = &xdst_prev->u.dst;
+		xdst_prev = xdst;
 
 		header_len += xfrm[i]->props.header_len;
 		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
@@ -1625,40 +1625,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		trailer_len += xfrm[i]->props.trailer_len;
 	}
 
-	dst_prev->child = dst;
-	dst0->path = dst;
+	xfrm_dst_set_child(xdst_prev, dst);
+	xdst0->u.dst.path = dst;
 
 	err = -ENODEV;
 	dev = dst->dev;
 	if (!dev)
 		goto free_dst;
 
-	xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
-	xfrm_init_pmtu(dst_prev);
+	xfrm_init_path(xdst0, dst, nfheader_len);
+	xfrm_init_pmtu(&xdst_prev->u.dst);
 
-	for (dst_prev = dst0; dst_prev != dst; dst_prev = xfrm_dst_child(dst_prev)) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
-
-		err = xfrm_fill_dst(xdst, dev, fl);
+	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
+	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
+		err = xfrm_fill_dst(xdst_prev, dev, fl);
 		if (err)
 			goto free_dst;
 
-		dst_prev->header_len = header_len;
-		dst_prev->trailer_len = trailer_len;
-		header_len -= xdst->u.dst.xfrm->props.header_len;
-		trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
+		xdst_prev->u.dst.header_len = header_len;
+		xdst_prev->u.dst.trailer_len = trailer_len;
+		header_len -= xdst_prev->u.dst.xfrm->props.header_len;
+		trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
 	}
 
 out:
-	return dst0;
+	return &xdst0->u.dst;
 
 put_states:
 	for (; i < nx; i++)
 		xfrm_state_put(xfrm[i]);
 free_dst:
-	if (dst0)
-		dst_release_immediate(dst0);
-	dst0 = ERR_PTR(err);
+	if (xdst0)
+		dst_release_immediate(&xdst0->u.dst);
+	xdst0 = ERR_PTR(err);
 	goto out;
 }
 
@@ -2012,7 +2011,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
 	dst1->output = xdst_queue_output;
 
 	dst_hold(dst);
-	dst1->child = dst;
+	xfrm_dst_set_child(xdst, dst);
 	dst1->path = dst;
 
 	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
-- 
cgit v1.2.3


From b6ca8bd5a9198c70c48297390723e4e56bd6e879 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:45:44 -0500
Subject: xfrm: Move child route linkage into xfrm_dst.

XFRM bundle child chains look like this:

	xdst1 --> xdst2 --> xdst3 --> path_dst

All of xdstN are xfrm_dst objects and xdst->u.dst.xfrm is non-NULL.
The final child pointer in the chain, here called 'path_dst', is some
other kind of route such as an ipv4 or ipv6 one.

The xfrm output path pops routes, one at a time, via the child
pointer, until we hit one which has a dst->xfrm pointer which
is NULL.

We can easily preserve the above mechanisms with child sitting
only in the xfrm_dst structure.  All children in the chain
before we break out of the xfrm_output() loop have dst->xfrm
non-NULL and are therefore xfrm_dst objects.

Since we break out of the loop when we find dst->xfrm NULL, we
will not try to dereference 'dst' as if it were an xfrm_dst.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h         |  3 +--
 include/net/xfrm.h        | 15 ++++++++++-----
 net/core/dst.c            |  9 ++++++---
 net/core/pktgen.c         | 12 ++++++------
 net/netfilter/xt_policy.c |  3 ++-
 net/xfrm/xfrm_device.c    |  2 +-
 6 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index acbb3fb89c4d..cef46207408c 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -35,7 +35,6 @@ struct sk_buff;
 struct dst_entry {
 	struct net_device       *dev;
 	struct rcu_head		rcu_head;
-	struct dst_entry	*child;
 	struct  dst_ops	        *ops;
 	unsigned long		_metrics;
 	unsigned long           expires;
@@ -89,7 +88,7 @@ struct dst_entry {
 	 * Align __refcnt to a 64 bytes alignment
 	 * (L1_CACHE_SIZE would be too much)
 	 */
-	long			__pad_to_align_refcnt[2];
+	long			__pad_to_align_refcnt[3];
 #endif
 	/*
 	 * __refcnt wants to be on a different cache line from
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4c08eb0d46ce..0009dab61528 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -968,7 +968,7 @@ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_c
 
 /* A struct encoding bundle of transformations to apply to some set of flow.
  *
- * dst->child points to the next element of bundle.
+ * xdst->child points to the next element of bundle.
  * dst->xfrm  points to an instanse of transformer.
  *
  * Due to unfortunate limitations of current routing cache, which we
@@ -984,6 +984,7 @@ struct xfrm_dst {
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	struct dst_entry *child;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 	int num_pols, num_xfrms;
 	u32 xfrm_genid;
@@ -997,8 +998,10 @@ struct xfrm_dst {
 static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
 {
 #ifdef CONFIG_XFRM
-	if (dst->xfrm)
-		return dst->child;
+	if (dst->xfrm) {
+		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+		return xdst->child;
+	}
 #endif
 	return NULL;
 }
@@ -1006,7 +1009,7 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child)
 {
-	xdst->u.dst.child = child;
+	xdst->child = child;
 }
 
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
@@ -1880,12 +1883,14 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 {
 	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_dst *xdst;
 
 	if (!x || !x->type_offload)
 		return false;
 
+	xdst = (struct xfrm_dst *) dst;
 	if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) &&
-	    !dst->child->xfrm)
+	    !xdst->child->xfrm)
 		return true;
 
 	return false;
diff --git a/net/core/dst.c b/net/core/dst.c
index 6a3c21b8fc8d..5cf96179e8e0 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/prefetch.h>
 #include <net/lwtunnel.h>
+#include <net/xfrm.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -62,7 +63,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      struct net_device *dev, int initial_ref, int initial_obsolete,
 	      unsigned short flags)
 {
-	dst->child = NULL;
 	dst->dev = dev;
 	if (dev)
 		dev_hold(dev);
@@ -121,8 +121,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 	smp_rmb();
 
 #ifdef CONFIG_XFRM
-	if (dst->xfrm)
-		child = dst->child;
+	if (dst->xfrm) {
+		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+
+		child = xdst->child;
+	}
 #endif
 	if (!(dst->flags & DST_NOCOUNT))
 		dst_entries_add(dst->ops, -1);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f95a15086225..b9ce241cd28c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -399,7 +399,7 @@ struct pktgen_dev {
 	__u8	ipsmode;		/* IPSEC mode (config) */
 	__u8	ipsproto;		/* IPSEC type (config) */
 	__u32	spi;
-	struct dst_entry dst;
+	struct xfrm_dst xdst;
 	struct dst_ops dstops;
 #endif
 	char result[512];
@@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
 	 * supports both transport/tunnel mode + ESP/AH type.
 	 */
 	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
-		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+		skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
 
 	rcu_read_lock_bh();
 	err = x->outer_mode->output(x, skb);
@@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	 * performance under such circumstance.
 	 */
 	pkt_dev->dstops.family = AF_INET;
-	pkt_dev->dst.dev = pkt_dev->odev;
-	dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
-	pkt_dev->dst.child = &pkt_dev->dst;
-	pkt_dev->dst.ops = &pkt_dev->dstops;
+	pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+	dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+	pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+	pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
 #endif
 
 	return add_dev_to_thread(t, pkt_dev);
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 2b4ab189bba7..5639fb03bdd9 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
 	if (dst->xfrm == NULL)
 		return -1;
 
-	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+	for (i = 0; dst && dst->xfrm;
+	     dst = ((struct xfrm_dst *)dst)->child, i++) {
 		pos = strict ? i : 0;
 		if (pos >= info->len)
 			return 0;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 30e5746085b8..c5851ddddd2a 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -121,7 +121,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 		return false;
 
 	if ((x->xso.offload_handle && (dev == dst->path->dev)) &&
-	     !dst->child->xfrm && x->type->get_mtu) {
+	     !xdst->child->xfrm && x->type->get_mtu) {
 		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
 		if (skb->len <= mtu)
-- 
cgit v1.2.3


From 3a2232e92e87166a8a5113e918b8c7b7bdce4d83 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:40 -0500
Subject: ipv6: Move dst->from into struct rt6_info.

The dst->from value is only used by ipv6 routes to track where
a route "came from".

Any time we clone or copy a core ipv6 route in the ipv6 routing
tables, we have the copy/clone's ->from point to the base route.

This is used to handle route expiration properly.

Only ipv6 uses this mechanism, and only ipv6 code references
it.  So it is safe to move it into rt6_info.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/dst.h     |  3 +--
 include/net/ip6_fib.h |  9 ++++-----
 net/core/dst.c        |  1 -
 net/ipv6/route.c      | 34 +++++++++++++++++-----------------
 4 files changed, 22 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index cef46207408c..13c839d8235a 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -39,7 +39,6 @@ struct dst_entry {
 	unsigned long		_metrics;
 	unsigned long           expires;
 	struct dst_entry	*path;
-	struct dst_entry	*from;
 #ifdef CONFIG_XFRM
 	struct xfrm_state	*xfrm;
 #else
@@ -88,7 +87,7 @@ struct dst_entry {
 	 * Align __refcnt to a 64 bytes alignment
 	 * (L1_CACHE_SIZE would be too much)
 	 */
-	long			__pad_to_align_refcnt[3];
+	long			__pad_to_align_refcnt[4];
 #endif
 	/*
 	 * __refcnt wants to be on a different cache line from
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 281a922f0c62..44d96a91e745 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -130,6 +130,7 @@ struct rt6_exception {
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
+	struct rt6_info			*from;
 
 	/*
 	 * Tail elements of dst_entry (__refcnt etc.)
@@ -204,11 +205,9 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 {
 	struct rt6_info *rt;
 
-	for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES);
-	     rt = (struct rt6_info *)rt->dst.from);
+	for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from);
 	if (rt && rt != rt0)
 		rt0->dst.expires = rt->dst.expires;
-
 	dst_set_expires(&rt0->dst, timeout);
 	rt0->rt6i_flags |= RTF_EXPIRES;
 }
@@ -243,8 +242,8 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 	u32 cookie = 0;
 
 	if (rt->rt6i_flags & RTF_PCPU ||
-	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
-		rt = (struct rt6_info *)(rt->dst.from);
+	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
+		rt = rt->from;
 
 	rt6_get_cookie_safe(rt, &cookie);
 
diff --git a/net/core/dst.c b/net/core/dst.c
index 5cf96179e8e0..cf2076c0eb22 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -70,7 +70,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
 	dst->path = dst;
-	dst->from = NULL;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 22c5e70361d6..1f1ef1e071c2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 
 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
 {
-	return dst_metrics_write_ptr(rt->dst.from);
+	return dst_metrics_write_ptr(&rt->from->dst);
 }
 
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
@@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct rt6_exception_bucket *bucket;
-	struct dst_entry *from = dst->from;
+	struct rt6_info *from = rt->from;
 	struct inet6_dev *idev;
 
 	dst_destroy_metrics_generic(dst);
@@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 		kfree(bucket);
 	}
 
-	dst->from = NULL;
-	dst_release(from);
+	rt->from = NULL;
+	dst_release(&from->dst);
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	if (rt->rt6i_flags & RTF_EXPIRES) {
 		if (time_after(jiffies, rt->dst.expires))
 			return true;
-	} else if (rt->dst.from) {
+	} else if (rt->from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-		       rt6_check_expired((struct rt6_info *)rt->dst.from);
+			rt6_check_expired(rt->from);
 	}
 	return false;
 }
@@ -1054,7 +1054,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	 */
 
 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = (struct rt6_info *)ort->dst.from;
+		ort = ort->from;
 
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(ort);
@@ -1274,7 +1274,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 
 	/* ort can't be a cache or pcpu route */
 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = (struct rt6_info *)ort->dst.from;
+		ort = ort->from;
 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
 
 	spin_lock_bh(&rt6_exception_lock);
@@ -1415,8 +1415,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
 /* Remove the passed in cached rt from the hash table that contains it */
 int rt6_remove_exception_rt(struct rt6_info *rt)
 {
-	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
 	struct rt6_exception_bucket *bucket;
+	struct rt6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 	int err;
@@ -1460,8 +1460,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
  */
 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 {
-	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
 	struct rt6_exception_bucket *bucket;
+	struct rt6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 
@@ -1929,9 +1929,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 
 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
 {
-	if (rt->dst.from &&
-	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
-		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+	if (rt->from &&
+	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
+		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
 }
 
 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
@@ -1951,7 +1951,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
 {
 	if (!__rt6_check_expired(rt) &&
 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
-	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+	    rt6_check(rt->from, cookie))
 		return &rt->dst;
 	else
 		return NULL;
@@ -1971,7 +1971,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	rt6_dst_from_metrics_check(rt);
 
 	if (rt->rt6i_flags & RTF_PCPU ||
-	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
+	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
 		return rt6_dst_from_check(rt, cookie);
 	else
 		return rt6_check(rt, cookie);
@@ -3055,11 +3055,11 @@ out:
 
 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
-	BUG_ON(from->dst.from);
+	BUG_ON(from->from);
 
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	dst_hold(&from->dst);
-	rt->dst.from = &from->dst;
+	rt->from = from;
 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
 }
 
-- 
cgit v1.2.3


From 0f6c480f23f49b53644b383c5554e579498347f3 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:46 -0500
Subject: xfrm: Move dst->path into struct xfrm_dst

The first member of an IPSEC route bundle chain sets its dst->path to
the underlying ipv4/ipv6 route that carries the bundle.

Stated another way, if one were to follow the xfrm_dst->child chain of
the bundle, the final non-NULL pointer would be the path and point to
either an ipv4 or an ipv6 route.

This is largely used to make sure that PMTU events propagate down to
the correct ipv4 or ipv6 route.

When we don't have the top of an IPSEC bundle, 'dst->path == dst'.

Move it down into xfrm_dst and key off of dst->xfrm.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/dst.h       |  3 +--
 include/net/xfrm.h      | 15 ++++++++++++++-
 net/bridge/br_nf_core.c |  1 -
 net/core/dst.c          |  1 -
 net/ipv4/route.c        |  2 +-
 net/ipv6/ip6_output.c   |  4 ++--
 net/ipv6/route.c        |  6 ------
 net/xfrm/xfrm_device.c  |  2 +-
 net/xfrm/xfrm_policy.c  | 28 ++++++++++++++--------------
 9 files changed, 33 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 13c839d8235a..424bff66f16d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -38,7 +38,6 @@ struct dst_entry {
 	struct  dst_ops	        *ops;
 	unsigned long		_metrics;
 	unsigned long           expires;
-	struct dst_entry	*path;
 #ifdef CONFIG_XFRM
 	struct xfrm_state	*xfrm;
 #else
@@ -87,7 +86,7 @@ struct dst_entry {
 	 * Align __refcnt to a 64 bytes alignment
 	 * (L1_CACHE_SIZE would be too much)
 	 */
-	long			__pad_to_align_refcnt[4];
+	long			__pad_to_align_refcnt[5];
 #endif
 	/*
 	 * __refcnt wants to be on a different cache line from
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0009dab61528..1ec0c4760646 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -985,6 +985,7 @@ struct xfrm_dst {
 	} u;
 	struct dst_entry *route;
 	struct dst_entry *child;
+	struct dst_entry *path;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 	int num_pols, num_xfrms;
 	u32 xfrm_genid;
@@ -995,6 +996,18 @@ struct xfrm_dst {
 	u32 path_cookie;
 };
 
+static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
+{
+#ifdef CONFIG_XFRM
+	if (dst->xfrm) {
+		const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst;
+
+		return xdst->path;
+	}
+#endif
+	return (struct dst_entry *) dst;
+}
+
 static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
 {
 #ifdef CONFIG_XFRM
@@ -1889,7 +1902,7 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 		return false;
 
 	xdst = (struct xfrm_dst *) dst;
-	if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) &&
+	if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) &&
 	    !xdst->child->xfrm)
 		return true;
 
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 20cbb727df4d..8e2d7cfa4e16 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -78,7 +78,6 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 
 	atomic_set(&rt->dst.__refcnt, 1);
 	rt->dst.dev = br->dev;
-	rt->dst.path = &rt->dst;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
 	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
 	rt->dst.ops = &fake_dst_ops;
diff --git a/net/core/dst.c b/net/core/dst.c
index cf2076c0eb22..9bc3bb6e94ef 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -69,7 +69,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->ops = ops;
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
-	dst->path = dst;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 43b69af242e1..f0ed031f3594 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1106,7 +1106,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 		new = true;
 	}
 
-	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
 
 	if (!dst_check(&rt->dst, 0)) {
 		if (new)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5110a418cc4d..176d74fb3b4d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1201,13 +1201,13 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
 	else
 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
-		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+		      rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst));
 	if (np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
 	}
 	cork->base.fragsize = mtu;
-	if (dst_allfrag(rt->dst.path))
+	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1f1ef1e071c2..46fd53b268da 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4596,8 +4596,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					   GFP_KERNEL);
 	if (!net->ipv6.ip6_null_entry)
 		goto out_ip6_dst_entries;
-	net->ipv6.ip6_null_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_null_entry;
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
 			 ip6_template_metrics, true);
@@ -4609,8 +4607,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_prohibit_entry)
 		goto out_ip6_null_entry;
-	net->ipv6.ip6_prohibit_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
 			 ip6_template_metrics, true);
@@ -4620,8 +4616,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_blk_hole_entry)
 		goto out_ip6_prohibit_entry;
-	net->ipv6.ip6_blk_hole_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
 			 ip6_template_metrics, true);
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index c5851ddddd2a..c61a7d46b412 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -120,7 +120,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 	if (!x->type_offload || x->encap)
 		return false;
 
-	if ((x->xso.offload_handle && (dev == dst->path->dev)) &&
+	if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) &&
 	     !xdst->child->xfrm && x->type->get_mtu) {
 		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 58b3ecba2e41..da1b41ee4686 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1626,7 +1626,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	}
 
 	xfrm_dst_set_child(xdst_prev, dst);
-	xdst0->u.dst.path = dst;
+	xdst0->path = dst;
 
 	err = -ENODEV;
 	dev = dst->dev;
@@ -1879,8 +1879,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
 	xfrm_decode_session(skb, &fl, dst->ops->family);
 	spin_unlock(&pq->hold_queue.lock);
 
-	dst_hold(dst->path);
-	dst = xfrm_lookup(net, dst->path, &fl, sk, 0);
+	dst_hold(xfrm_dst_path(dst));
+	dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0);
 	if (IS_ERR(dst))
 		goto purge_queue;
 
@@ -1909,8 +1909,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
 		skb = __skb_dequeue(&list);
 
 		xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
-		dst_hold(skb_dst(skb)->path);
-		dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0);
+		dst_hold(xfrm_dst_path(skb_dst(skb)));
+		dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
 		if (IS_ERR(dst)) {
 			kfree_skb(skb);
 			continue;
@@ -2012,7 +2012,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
 
 	dst_hold(dst);
 	xfrm_dst_set_child(xdst, dst);
-	dst1->path = dst;
+	xdst->path = dst;
 
 	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
 
@@ -2630,7 +2630,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 	struct xfrm_dst *last;
 	u32 mtu;
 
-	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
+	if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
@@ -2691,22 +2691,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 
 static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
 {
-	return dst_metric_advmss(dst->path);
+	return dst_metric_advmss(xfrm_dst_path(dst));
 }
 
 static unsigned int xfrm_mtu(const struct dst_entry *dst)
 {
 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
-	return mtu ? : dst_mtu(dst->path);
+	return mtu ? : dst_mtu(xfrm_dst_path(dst));
 }
 
 static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
 					const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
-
-	for (; dst != path; dst = xfrm_dst_child(dst)) {
+	while (dst->xfrm) {
 		const struct xfrm_state *xfrm = dst->xfrm;
 
 		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
@@ -2715,6 +2713,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
 			daddr = xfrm->coaddr;
 		else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
 			daddr = &xfrm->id.daddr;
+
+		dst = xfrm_dst_child(dst);
 	}
 	return daddr;
 }
@@ -2723,7 +2723,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
+	const struct dst_entry *path = xfrm_dst_path(dst);
 
 	if (!skb)
 		daddr = xfrm_get_dst_nexthop(dst, daddr);
@@ -2732,7 +2732,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 
 static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
+	const struct dst_entry *path = xfrm_dst_path(dst);
 
 	daddr = xfrm_get_dst_nexthop(dst, daddr);
 	path->ops->confirm_neigh(path, daddr);
-- 
cgit v1.2.3


From 5492093dc4160d150890bc848c26ba7f8fff3094 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:41:01 -0500
Subject: xfrm: Stop using dst->next in bundle construction.

While building ipsec bundles, blocks of xfrm dsts are linked together
using dst->next from bottom to the top.

The only thing this is used for is initializing the pmtu values of the
xfrm stack, and for updating the mtu values at xfrm_bundle_ok() time.

The bundle pmtu entries must be processed in this order so that pmtu
values lower in the stack of routes can propagate up to the higher
ones.

Avoid using dst->next by simply maintaining an array of dst pointers
as we already do for the xfrm_state objects when building the bundle.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/xfrm/xfrm_policy.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index da1b41ee4686..22e3350013b4 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -54,7 +54,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 static __read_mostly seqcount_t xfrm_policy_hash_generation;
 
-static void xfrm_init_pmtu(struct dst_entry *dst);
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
 static int stale_bundle(struct dst_entry *dst);
 static int xfrm_bundle_ok(struct xfrm_dst *xdst);
 static void xfrm_policy_queue_process(struct timer_list *t);
@@ -1538,7 +1538,9 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
  */
 
 static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
-					    struct xfrm_state **xfrm, int nx,
+					    struct xfrm_state **xfrm,
+					    struct xfrm_dst **bundle,
+					    int nx,
 					    const struct flowi *fl,
 					    struct dst_entry *dst)
 {
@@ -1573,6 +1575,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			goto put_states;
 		}
 
+		bundle[i] = xdst;
 		if (!xdst_prev)
 			xdst0 = xdst;
 		else
@@ -1616,7 +1619,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst1->input = dst_discard;
 		dst1->output = inner_mode->afinfo->output;
 
-		dst1->next = &xdst_prev->u.dst;
 		xdst_prev = xdst;
 
 		header_len += xfrm[i]->props.header_len;
@@ -1634,7 +1636,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		goto free_dst;
 
 	xfrm_init_path(xdst0, dst, nfheader_len);
-	xfrm_init_pmtu(&xdst_prev->u.dst);
+	xfrm_init_pmtu(bundle, nx);
 
 	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
 	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
@@ -1812,6 +1814,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 {
 	struct net *net = xp_net(pols[0]);
 	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
 	struct xfrm_dst *xdst, *old;
 	struct dst_entry *dst;
 	int err;
@@ -1839,7 +1842,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 
 	old = xdst;
 
-	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
 	if (IS_ERR(dst)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
 		return ERR_CAST(dst);
@@ -2599,12 +2602,14 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void xfrm_init_pmtu(struct dst_entry *dst)
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
 {
-	do {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	while (nr--) {
+		struct xfrm_dst *xdst = bundle[nr];
 		u32 pmtu, route_mtu_cached;
+		struct dst_entry *dst;
 
+		dst = &xdst->u.dst;
 		pmtu = dst_mtu(xfrm_dst_child(dst));
 		xdst->child_mtu_cached = pmtu;
 
@@ -2617,7 +2622,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
 			pmtu = route_mtu_cached;
 
 		dst_metric_set(dst, RTAX_MTU, pmtu);
-	} while ((dst = dst->next));
+	}
 }
 
 /* Check that the bundle accepts the flow and its components are
@@ -2626,8 +2631,10 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
 
 static int xfrm_bundle_ok(struct xfrm_dst *first)
 {
+	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
 	struct dst_entry *dst = &first->u.dst;
-	struct xfrm_dst *last;
+	struct xfrm_dst *xdst;
+	int start_from, nr;
 	u32 mtu;
 
 	if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
@@ -2637,8 +2644,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 	if (dst->flags & DST_XFRM_QUEUE)
 		return 1;
 
-	last = NULL;
-
+	start_from = nr = 0;
 	do {
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
@@ -2650,9 +2656,11 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
+		bundle[nr++] = xdst;
+
 		mtu = dst_mtu(xfrm_dst_child(dst));
 		if (xdst->child_mtu_cached != mtu) {
-			last = xdst;
+			start_from = nr;
 			xdst->child_mtu_cached = mtu;
 		}
 
@@ -2660,30 +2668,30 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 			return 0;
 		mtu = dst_mtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
-			last = xdst;
+			start_from = nr;
 			xdst->route_mtu_cached = mtu;
 		}
 
 		dst = xfrm_dst_child(dst);
 	} while (dst->xfrm);
 
-	if (likely(!last))
+	if (likely(!start_from))
 		return 1;
 
-	mtu = last->child_mtu_cached;
-	for (;;) {
-		dst = &last->u.dst;
+	xdst = bundle[start_from - 1];
+	mtu = xdst->child_mtu_cached;
+	while (start_from--) {
+		dst = &xdst->u.dst;
 
 		mtu = xfrm_state_mtu(dst->xfrm, mtu);
-		if (mtu > last->route_mtu_cached)
-			mtu = last->route_mtu_cached;
+		if (mtu > xdst->route_mtu_cached)
+			mtu = xdst->route_mtu_cached;
 		dst_metric_set(dst, RTAX_MTU, mtu);
-
-		if (last == first)
+		if (!start_from)
 			break;
 
-		last = (struct xfrm_dst *)last->u.dst.next;
-		last->child_mtu_cached = mtu;
+		xdst = bundle[start_from - 1];
+		xdst->child_mtu_cached = mtu;
 	}
 
 	return 1;
-- 
cgit v1.2.3


From 7149f813d12d92ba9abcf49026f7cebc3d55c426 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:41:07 -0500
Subject: net: Remove dst->next

There are no more users.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 include/net/dst.h | 4 ----
 net/core/dst.c    | 1 -
 2 files changed, 5 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 2270513d0790..33d2a5433924 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -88,10 +88,6 @@ struct dst_entry {
 #ifndef CONFIG_64BIT
 	atomic_t		__refcnt;	/* 32-bit offset 64 */
 #endif
-
-	union {
-		struct dst_entry	*next;
-	};
 };
 
 struct dst_metrics {
diff --git a/net/core/dst.c b/net/core/dst.c
index 9bc3bb6e94ef..007aa0b08291 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -86,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
-	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
 }
-- 
cgit v1.2.3


From e94a62f507f9498e5f4b2c69ef181b3402934c2a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 30 Nov 2017 15:39:34 +0100
Subject: net/reuseport: drop legacy code

Since commit e32ea7e74727 ("soreuseport: fast reuseport UDP socket
selection") and commit c125e80b8868 ("soreuseport: fast reuseport
TCP socket selection") the relevant reuseport socket matching the current
packet is selected by the reuseport_select_sock() call. The only
exceptions are invalid BPF filters/filters returning out-of-range
indices.
In the latter case the code implicitly falls back to using the hash
demultiplexing, but instead of selecting the socket inside the
reuseport_select_sock() function, it relies on the hash selection
logic introduced with the early soreuseport implementation.

With this patch, in case of a BPF filter returning a bad socket
index value, we fall back to hash-based selection inside the
reuseport_select_sock() body, so that we can drop some duplicate
code in the ipv4 and ipv6 stack.

This also allows faster lookup in the above scenario and will allow
us to avoid computing the hash value for successful, BPF based
demultiplexing - in a later patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_reuseport.c   |  4 +++-
 net/ipv4/inet_hashtables.c  | 11 ++---------
 net/ipv4/udp.c              | 22 ++++------------------
 net/ipv6/inet6_hashtables.c | 11 ++---------
 net/ipv6/udp.c              | 22 ++++------------------
 5 files changed, 15 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5eeb1d20cc38..c5bb52bc73a1 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk,
 
 		if (prog && skb)
 			sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
-		else
+
+		/* no bpf or invalid bpf result: fall back to hash usage */
+		if (!sk2)
 			sk2 = reuse->socks[reciprocal_scale(hash, socks)];
 	}
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e7d15fb0d94d..427b705d7c64 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -216,32 +216,25 @@ struct sock *__inet_lookup_listener(struct net *net,
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
-	int score, hiscore = 0, matches = 0, reuseport = 0;
 	bool exact_dif = inet_exact_dif_match(net, skb);
 	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
 	u32 phash = 0;
 
 	sk_for_each_rcu(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr,
 				      dif, sdif, exact_dif);
 		if (score > hiscore) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			hiscore = score;
-		} else if (score == hiscore && reuseport) {
-			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
-				result = sk;
-			phash = next_pseudo_random32(phash);
 		}
 	}
 	return result;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e4ff25c947c5..36f857c87fe2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -445,7 +445,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 				     struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	result = NULL;
@@ -454,23 +454,16 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			badness = score;
 			result = sk;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -488,7 +481,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	bool exact_dif = udp_lib_exact_dif_match(net, skb);
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
@@ -526,23 +519,16 @@ begin:
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b01858f5deb1..0d1451381f5c 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -134,31 +134,24 @@ struct sock *inet6_lookup_listener(struct net *net,
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
-	int score, hiscore = 0, matches = 0, reuseport = 0;
 	bool exact_dif = inet6_exact_dif_match(net, skb);
 	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
 	u32 phash = 0;
 
 	sk_for_each(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
 		if (score > hiscore) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				phash = inet6_ehashfn(net, daddr, hnum,
 						      saddr, sport);
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			hiscore = score;
-		} else if (score == hiscore && reuseport) {
-			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
-				result = sk;
-			phash = next_pseudo_random32(phash);
 		}
 	}
 	return result;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3f30fa313bf2..c9f91c28b81d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -184,7 +184,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		struct udp_hslot *hslot2, struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	result = NULL;
@@ -193,8 +193,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
 
@@ -202,15 +201,9 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -228,7 +221,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	bool exact_dif = udp6_lib_exact_dif_match(net, skb);
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
@@ -267,23 +260,16 @@ begin:
 		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
 				      sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
-- 
cgit v1.2.3


From 0ba23a211360af7b6658e4fcfc571970bbbacc55 Mon Sep 17 00:00:00 2001
From: Yossef Efraim <yossefe@mellanox.com>
Date: Tue, 28 Nov 2017 11:49:28 +0200
Subject: xfrm: Fix xfrm_replay_overflow_offload_esn

In case of wrap around, replay_esn->oseq_hi is not updated
before it is tested for its actual value, leading the function
to fail with an overflow indication and packets being dropped.

This patch updates replay_esn->oseq_hi in the right place.

Fixes: d7dbefc45cf5 ("xfrm: Add xfrm_replay_overflow functions for offloading")
Signed-off-by: Yossef Efraim <yossefe@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_replay.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 8b23c5bcf8e8..02501817227b 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -666,7 +666,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
 		if (unlikely(oseq < replay_esn->oseq)) {
 			XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi;
 			xo->seq.hi = oseq_hi;
-
+			replay_esn->oseq_hi = oseq_hi;
 			if (replay_esn->oseq_hi == 0) {
 				replay_esn->oseq--;
 				replay_esn->oseq_hi--;
@@ -678,7 +678,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
 		}
 
 		replay_esn->oseq = oseq;
-		replay_esn->oseq_hi = oseq_hi;
 
 		if (xfrm_aevent_is_on(net))
 			x->repl->notify(x, XFRM_REPLAY_UPDATE);
-- 
cgit v1.2.3


From 43024b9ccd524ea1ad3e441edf08cb893bcd6d62 Mon Sep 17 00:00:00 2001
From: Yossef Efraim <yossefe@mellanox.com>
Date: Tue, 28 Nov 2017 11:49:29 +0200
Subject: xfrm: Fix xfrm_dev_state_add to fail for unsupported HW SA option

The xfrm_dev_state_add function returns success for unsupported HW SA
options. This results in the calling function creating a SW SA without a
correlating HW SA, despite the IPSec device offloading option having been
chosen.
These unsupported HW SA options are hard coded within the
xfrm_dev_state_add function.
SW backward compatibility will break if we add any of these options, as old
HW will fail with new SW.

This patch changes the behaviour to return -EINVAL in case an unsupported
option is chosen, notifying the user application of the failure and not
breaking backward compatibility for newly added HW SA options.

Signed-off-by: Yossef Efraim <yossefe@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 30e5746085b8..dc68d9c1fc8f 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -67,7 +67,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 
 	/* We don't yet support UDP encapsulation, TFC padding and ESN. */
 	if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN))
-		return 0;
+		return -EINVAL;
 
 	dev = dev_get_by_index(net, xuo->ifindex);
 	if (!dev) {
-- 
cgit v1.2.3


From 9b7e14dba0c087e950fc024b486e8f729c1ee672 Mon Sep 17 00:00:00 2001
From: Aviv Heller <avivh@mellanox.com>
Date: Tue, 28 Nov 2017 19:55:42 +0200
Subject: xfrm: Remove redundant state assignment in xfrm_input()

x is already initialized to the same value, above.

Signed-off-by: Aviv Heller <avivh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 347ab31574d5..ac277b97e0d7 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -231,7 +231,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		if (xo && (xo->flags & CRYPTO_DONE)) {
 			crypto_done = true;
-			x = xfrm_input_state(skb);
 			family = XFRM_SPI_SKB_CB(skb)->family;
 
 			if (!(xo->status & CRYPTO_SUCCESS)) {
-- 
cgit v1.2.3


From 4c94cc2d3d57a2e843ab10887f67faa82c2337f9 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 30 Nov 2017 16:47:25 +0100
Subject: tipc: fall back to smaller MTU if allocation of local send skb fails

When sending node local messages the code is using an 'mtu' of 66060
bytes to avoid unnecessary fragmentation. During situations of low
memory tipc_msg_build() may sometimes fail to allocate such large
buffers, resulting in unnecessary send failures. This can easily be
remedied by falling back to a smaller MTU, and then reassembling the
buffer chain as if the message were arriving from a remote node.

At the same time, we change the initial MTU setting of the broadcast
link to a lower value, so that large messages always are fragmented
into smaller buffers even when we run in single node mode. Apart from
obtaining the same advantage as for the 'fallback' solution above, this
turns out to give a significant performance improvement. This can
probably be explained with the __pskb_copy() operation performed on the
buffer for each recipient during reception. We found the optimal value
for this, considering the most relevant skb pool, to be 3744 bytes.

Acked-by: Ying Xue <ying.xue@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c | 12 ++++++++----
 net/tipc/link.c  |  2 +-
 net/tipc/msg.c   | 51 ++++++++++++++++++++++++++++++++++++++++++++-------
 net/tipc/msg.h   |  3 ++-
 4 files changed, 55 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 329325bd553e..37892b3909af 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/bcast.c: TIPC broadcast code
  *
- * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2017, Ericsson AB
  * Copyright (c) 2004, Intel Corporation.
  * Copyright (c) 2005, 2010-2011, Wind River Systems
  * All rights reserved.
@@ -42,8 +42,8 @@
 #include "link.h"
 #include "name_table.h"
 
-#define	BCLINK_WIN_DEFAULT	50	/* bcast link window size (default) */
-#define	BCLINK_WIN_MIN	        32	/* bcast minimum link window size */
+#define BCLINK_WIN_DEFAULT  50	/* bcast link window size (default) */
+#define BCLINK_WIN_MIN      32	/* bcast minimum link window size */
 
 const char tipc_bclink_name[] = "broadcast-link";
 
@@ -74,6 +74,10 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
 	return tipc_net(net)->bcbase;
 }
 
+/* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link
+ * Note: the MTU is decremented to give room for a tunnel header, in
+ * case the message needs to be sent as replicast
+ */
 int tipc_bcast_get_mtu(struct net *net)
 {
 	return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
@@ -515,7 +519,7 @@ int tipc_bcast_init(struct net *net)
 	spin_lock_init(&tipc_net(net)->bclock);
 
 	if (!tipc_link_bc_create(net, 0, 0,
-				 U16_MAX,
+				 FB_MTU,
 				 BCLINK_WIN_DEFAULT,
 				 0,
 				 &bb->inputq,
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 6bce0b1117bd..2d6b2aed30e0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -483,7 +483,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 /**
  * tipc_link_bc_create - create new link to be used for broadcast
  * @n: pointer to associated node
- * @mtu: mtu to be used
+ * @mtu: mtu to be used initially if no peers
  * @window: send window to be used
  * @inputq: queue to put messages ready for delivery
  * @namedq: queue to put binding table update messages ready for delivery
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b0d07b35909d..55d8ba92291d 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -251,20 +251,23 @@ bool tipc_msg_validate(struct sk_buff **_skb)
  * @pktmax: Max packet size that can be used
  * @list: Buffer or chain of buffers to be returned to caller
  *
+ * Note that the recursive call we are making here is safe, since it can
+ * logically go only one further level down.
+ *
  * Returns message data size or errno: -ENOMEM, -EFAULT
  */
-int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
-		   int offset, int dsz, int pktmax, struct sk_buff_head *list)
+int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
+		   int dsz, int pktmax, struct sk_buff_head *list)
 {
 	int mhsz = msg_hdr_sz(mhdr);
+	struct tipc_msg pkthdr;
 	int msz = mhsz + dsz;
-	int pktno = 1;
-	int pktsz;
 	int pktrem = pktmax;
-	int drem = dsz;
-	struct tipc_msg pkthdr;
 	struct sk_buff *skb;
+	int drem = dsz;
+	int pktno = 1;
 	char *pktpos;
+	int pktsz;
 	int rc;
 
 	msg_set_size(mhdr, msz);
@@ -272,8 +275,18 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 	/* No fragmentation needed? */
 	if (likely(msz <= pktmax)) {
 		skb = tipc_buf_acquire(msz, GFP_KERNEL);
-		if (unlikely(!skb))
+
+		/* Fall back to smaller MTU if node local message */
+		if (unlikely(!skb)) {
+			if (pktmax != MAX_MSG_SIZE)
+				return -ENOMEM;
+			rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list);
+			if (rc != dsz)
+				return rc;
+			if (tipc_msg_assemble(list))
+				return dsz;
 			return -ENOMEM;
+		}
 		skb_orphan(skb);
 		__skb_queue_tail(list, skb);
 		skb_copy_to_linear_data(skb, mhdr, mhsz);
@@ -589,6 +602,30 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
 	return true;
 }
 
+/* tipc_msg_assemble() - assemble chain of fragments into one message
+ */
+bool tipc_msg_assemble(struct sk_buff_head *list)
+{
+	struct sk_buff *skb, *tmp = NULL;
+
+	if (skb_queue_len(list) == 1)
+		return true;
+
+	while ((skb = __skb_dequeue(list))) {
+		skb->next = NULL;
+		if (tipc_buf_append(&tmp, &skb)) {
+			__skb_queue_tail(list, skb);
+			return true;
+		}
+		if (!tmp)
+			break;
+	}
+	__skb_queue_purge(list);
+	__skb_queue_head_init(list);
+	pr_warn("Failed do assemble buffer\n");
+	return false;
+}
+
 /* tipc_msg_reassemble() - clone a buffer chain of fragments and
  *                         reassemble the clones into one message
  */
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 3e4384c222f7..b4ba1b4f9ae7 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,7 +98,7 @@ struct plist;
 #define MAX_H_SIZE                60	/* Largest possible TIPC header size */
 
 #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
-
+#define FB_MTU                  3744
 #define TIPC_MEDIA_INFO_OFFSET	5
 
 struct tipc_skb_cb {
@@ -943,6 +943,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
 int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 		   int offset, int dsz, int mtu, struct sk_buff_head *list);
 bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
+bool tipc_msg_assemble(struct sk_buff_head *list);
 bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
 bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
 			struct sk_buff_head *cpy);
-- 
cgit v1.2.3


From 2d746c93b6e55d34a98c8983b30d991707a2059b Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 30 Nov 2017 11:11:27 -0800
Subject: rds: tcp: remove redundant function rds_tcp_conn_paths_destroy()

A side-effect of Commit c14b0366813a ("rds: tcp: set linger to 1
when unloading a rds-tcp") is that we always send a RST on the tcp
connection for rds_conn_destroy(), so rds_tcp_conn_paths_destroy()
is not needed any more and is removed in this patch.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 6b7ee71f40c6..222cc530e5b5 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -495,27 +495,6 @@ static struct pernet_operations rds_tcp_net_ops = {
 	.size = sizeof(struct rds_tcp_net),
 };
 
-/* explicitly send a RST on each socket, thereby releasing any socket refcnts
- * that may otherwise hold up netns deletion.
- */
-static void rds_tcp_conn_paths_destroy(struct rds_connection *conn)
-{
-	struct rds_conn_path *cp;
-	struct rds_tcp_connection *tc;
-	int i;
-	struct sock *sk;
-
-	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
-		cp = &conn->c_path[i];
-		tc = cp->cp_transport_data;
-		if (!tc->t_sock)
-			continue;
-		sk = tc->t_sock->sk;
-		sk->sk_prot->disconnect(sk, 0);
-		tcp_done(sk);
-	}
-}
-
 static void rds_tcp_kill_sock(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
@@ -535,10 +514,8 @@ static void rds_tcp_kill_sock(struct net *net)
 			list_move_tail(&tc->t_tcp_node, &tmp_list);
 	}
 	spin_unlock_irq(&rds_tcp_conn_lock);
-	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
-		rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
+	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
-	}
 }
 
 void *rds_tcp_listen_sock_def_readable(struct net *net)
-- 
cgit v1.2.3


From 681648e67d43cf269c5590ecf021ed481f4551fc Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 30 Nov 2017 11:11:28 -0800
Subject: rds: tcp: correctly sequence cleanup on netns deletion.

Commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net")
introduces a regression in rds-tcp netns cleanup. The cleanup_net(),
(and thus rds_tcp_dev_event notification) is only called from put_net()
when all netns refcounts go to 0, but this cannot happen if the
rds_connection itself is holding a c_net ref that it expects to
release in rds_tcp_kill_sock.

Instead, the rds_tcp_kill_sock callback should make sure to
tear down state carefully, ensuring that the socket teardown
is only done after all data-structures and workqs that depend
on it are quiesced.

The original motivation for commit 8edc3affc077 ("rds: tcp: Take explicit
refcounts on struct net") was to resolve a race condition reported by
syzkaller where workqs for tx/rx/connect were triggered after the
namespace was deleted. Those worker threads should have been
cancelled/flushed before socket tear-down and indeed,
rds_conn_path_destroy() does try to sequence this by doing
     /* cancel cp_send_w */
     /* cancel cp_recv_w */
     /* flush cp_down_w */
     /* free data structures */
Here the "flush cp_down_w" will trigger rds_conn_shutdown and thus
invoke rds_tcp_conn_path_shutdown() to close the tcp socket, so that
we ought to have satisfied the requirement that "socket-close is
done after all other dependent state is quiesced". However,
rds_conn_shutdown has a bug in that it *always* triggers the reconnect
workq (and if connection is successful, we always restart tx/rx
workqs so with the right timing, we risk the race conditions reported
by syzkaller).

Netns deletion is like module teardown - there is no need to restart a
reconnect in this case. We can use the c_destroy_in_prog bit
to avoid restarting the reconnect.

Fixes: 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c | 3 ++-
 net/rds/rds.h        | 6 +++---
 net/rds/tcp.c        | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7ee2d5d68b78..9efc82c665b5 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
 	 * to the conn hash, so we never trigger a reconnect on this
 	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work_sync(&cp->cp_conn_w);
+	if (conn->c_destroy_in_prog)
+		return;
 	rcu_read_lock();
 	if (!hlist_unhashed(&conn->c_hash_node)) {
 		rcu_read_unlock();
@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn)
 	 */
 	rds_cong_remove_conn(conn);
 
-	put_net(conn->c_net);
 	kfree(conn->c_path);
 	kmem_cache_free(rds_conn_slab, conn);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c349c71babff..d09f6c1facb4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -150,7 +150,7 @@ struct rds_connection {
 
 	/* Protocol version */
 	unsigned int		c_version;
-	struct net		*c_net;
+	possible_net_t		c_net;
 
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
@@ -165,13 +165,13 @@ struct rds_connection {
 static inline
 struct net *rds_conn_net(struct rds_connection *conn)
 {
-	return conn->c_net;
+	return read_pnet(&conn->c_net);
 }
 
 static inline
 void rds_conn_net_set(struct rds_connection *conn, struct net *net)
 {
-	conn->c_net = get_net(net);
+	write_pnet(&conn->c_net, net);
 }
 
 #define RDS_FLAG_CONG_BITMAP	0x01
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 222cc530e5b5..f580f72ae69e 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net)
 	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
 	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
-		struct net *c_net = tc->t_cpath->cp_conn->c_net;
+		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
 		if (net != c_net || !tc->t_sock)
 			continue;
@@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 
 	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
-		struct net *c_net = tc->t_cpath->cp_conn->c_net;
+		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
 		if (net != c_net || !tc->t_sock)
 			continue;
-- 
cgit v1.2.3


From f10b4cff98c6977668434fbf5dd58695eeca2897 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 30 Nov 2017 11:11:29 -0800
Subject: rds: tcp: atomically purge entries from rds_tcp_conn_list during
 netns delete

The rds_tcp_kill_sock() function parses the rds_tcp_conn_list
to find the rds_connection entries marked for deletion as part
of the netns deletion under the protection of the rds_tcp_conn_lock.
Since the rds_tcp_conn_list tracks rds_tcp_connections (which
have a 1:1 mapping with rds_conn_path), multiple tc entries in
the rds_tcp_conn_list will map to a single rds_connection, and will
be deleted as part of the rds_conn_destroy() operation that is
done outside the rds_tcp_conn_lock.

The rds_tcp_conn_list traversal done under the protection of
rds_tcp_conn_lock should not leave any doomed tc entries in
the list after the rds_tcp_conn_lock is released, else another
concurrently executing netns delete thread (for a different netns)
may trip on these entries.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 9 +++++++--
 net/rds/tcp.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index f580f72ae69e..39f502d47969 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -306,7 +306,8 @@ static void rds_tcp_conn_free(void *arg)
 	rdsdebug("freeing tc %p\n", tc);
 
 	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
-	list_del(&tc->t_tcp_node);
+	if (!tc->t_tcp_node_detached)
+		list_del(&tc->t_tcp_node);
 	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
 
 	kmem_cache_free(rds_tcp_conn_slab, tc);
@@ -510,8 +511,12 @@ static void rds_tcp_kill_sock(struct net *net)
 
 		if (net != c_net || !tc->t_sock)
 			continue;
-		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) {
 			list_move_tail(&tc->t_tcp_node, &tmp_list);
+		} else {
+			list_del(&tc->t_tcp_node);
+			tc->t_tcp_node_detached = true;
+		}
 	}
 	spin_unlock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 1aafbf7c3011..e7858ee8ed8b 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,7 @@ struct rds_tcp_incoming {
 struct rds_tcp_connection {
 
 	struct list_head	t_tcp_node;
+	bool			t_tcp_node_detached;
 	struct rds_conn_path	*t_cpath;
 	/* t_conn_path_lock synchronizes the connection establishment between
 	 * rds_tcp_accept_one and rds_tcp_conn_path_connect
-- 
cgit v1.2.3


From a3222dc95ca751cdc5f6ac3c9b092b160b73ed9f Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 30 Nov 2017 11:51:27 -0800
Subject: ip_gre: Refactor the erspan tunnel code.

Move two erspan functions to the header file, erspan.h, so the ipv6
erspan implementation can use them.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_gre.c    | 54 +++++-----------------------------------------------
 2 files changed, 56 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index ca94fc86865e..6e758d08c9ee 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -58,4 +58,55 @@ struct erspanhdr {
 	struct erspan_metadata md;
 };
 
+static inline u8 tos_to_cos(u8 tos)
+{
+	u8 dscp, cos;
+
+	dscp = tos >> 2;
+	cos = dscp >> 3;
+	return cos;
+}
+
+static inline void erspan_build_header(struct sk_buff *skb,
+				__be32 id, u32 index,
+				bool truncate, bool is_ipv4)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	enum erspan_encap_type enc_type;
+	struct erspanhdr *ershdr;
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+	u8 tos;
+
+	tos = is_ipv4 ? ip_hdr(skb)->tos :
+			(ipv6_hdr(skb)->priority << 4) +
+			(ipv6_hdr(skb)->flow_lbl[0] >> 4);
+
+	enc_type = ERSPAN_ENCAP_NOVLAN;
+
+	/* If mirrored packet has vlan tag, extract tci and
+	 *  perserve vlan header in the mirrored frame.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+		enc_type = ERSPAN_ENCAP_INFRAME;
+	}
+
+	skb_push(skb, sizeof(*ershdr));
+	ershdr = (struct erspanhdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr));
+
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION << VER_OFFSET));
+	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
+			   ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
+			   (enc_type << EN_OFFSET & EN_MASK) |
+			   ((truncate << T_OFFSET) & T_MASK));
+	ershdr->md.index = htonl(index & INDEX_MASK);
+}
+
 #endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index bb6239169b1a..d828821d88d7 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index, bool truncate);
+				__be32 id, u32 index,
+				bool truncate, bool is_ipv4);
 
 static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
@@ -589,7 +590,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_rt;
 
 	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-			    ntohl(md->index), truncate);
+			    ntohl(md->index), truncate, true);
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -668,52 +669,6 @@ free_skb:
 	return NETDEV_TX_OK;
 }
 
-static inline u8 tos_to_cos(u8 tos)
-{
-	u8 dscp, cos;
-
-	dscp = tos >> 2;
-	cos = dscp >> 3;
-	return cos;
-}
-
-static void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index, bool truncate)
-{
-	struct iphdr *iphdr = ip_hdr(skb);
-	struct ethhdr *eth = eth_hdr(skb);
-	enum erspan_encap_type enc_type;
-	struct erspanhdr *ershdr;
-	struct qtag_prefix {
-		__be16 eth_type;
-		__be16 tci;
-	} *qp;
-	u16 vlan_tci = 0;
-
-	enc_type = ERSPAN_ENCAP_NOVLAN;
-
-	/* If mirrored packet has vlan tag, extract tci and
-	 *  perserve vlan header in the mirrored frame.
-	 */
-	if (eth->h_proto == htons(ETH_P_8021Q)) {
-		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
-		vlan_tci = ntohs(qp->tci);
-		enc_type = ERSPAN_ENCAP_INFRAME;
-	}
-
-	skb_push(skb, sizeof(*ershdr));
-	ershdr = (struct erspanhdr *)skb->data;
-	memset(ershdr, 0, sizeof(*ershdr));
-
-	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
-				 (ERSPAN_VERSION << VER_OFFSET));
-	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
-			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
-			   (enc_type << EN_OFFSET & EN_MASK) |
-			   ((truncate << T_OFFSET) & T_MASK));
-	ershdr->md.index = htonl(index & INDEX_MASK);
-}
-
 static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 			       struct net_device *dev)
 {
@@ -737,7 +692,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 	}
 
 	/* Push ERSPAN header */
-	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+			    truncate, true);
 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
 	return NETDEV_TX_OK;
-- 
cgit v1.2.3


From 898b29798e36019966839187fd58dacec16d7db6 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 30 Nov 2017 11:51:28 -0800
Subject: ip6_gre: Refactor ip6gre xmit codes

This patch refactors ip6gre_xmit_{ipv4, ipv6}.
It is prep work for adding the ip6erspan tunnel.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 123 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 75 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4cfd8e0696fe..907d2e8405e2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -496,6 +496,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
+static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct flowi6 *fl6, __u8 *dsfield,
+				     int *encap_limit)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		*encap_limit = t->parms.encap_limit;
+
+	memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+		*dsfield = ipv4_get_dsfield(iph);
+	else
+		*dsfield = ip6_tclass(t->parms.flowinfo);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+		fl6->flowi6_mark = skb->mark;
+	else
+		fl6->flowi6_mark = t->parms.fwmark;
+
+	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+}
+
+static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
+				    struct net_device *dev,
+				    struct flowi6 *fl6, __u8 *dsfield,
+				    int *encap_limit)
+{
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+	__u16 offset;
+
+	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+
+	if (offset > 0) {
+		struct ipv6_tlv_tnl_enc_lim *tel;
+
+		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+		if (tel->encap_limit == 0) {
+			icmpv6_send(skb, ICMPV6_PARAMPROB,
+				    ICMPV6_HDR_FIELD, offset + 2);
+			return -1;
+		}
+		*encap_limit = tel->encap_limit - 1;
+	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+		*encap_limit = t->parms.encap_limit;
+	}
+
+	memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+		*dsfield = ipv6_get_dsfield(ipv6h);
+	else
+		*dsfield = ip6_tclass(t->parms.flowinfo);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+		fl6->flowlabel |= ip6_flowlabel(ipv6h);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+		fl6->flowi6_mark = skb->mark;
+	else
+		fl6->flowi6_mark = t->parms.fwmark;
+
+	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+	return 0;
+}
+
 static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 			       struct net_device *dev, __u8 dsfield,
 			       struct flowi6 *fl6, int encap_limit,
@@ -527,7 +599,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
-	const struct iphdr  *iph = ip_hdr(skb);
 	int encap_limit = -1;
 	struct flowi6 fl6;
 	__u8 dsfield;
@@ -536,21 +607,7 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
-
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		dsfield = ipv4_get_dsfield(iph);
-	else
-		dsfield = ip6_tclass(t->parms.flowinfo);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
-	else
-		fl6.flowi6_mark = t->parms.fwmark;
-
-	fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+	prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit);
 
 	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
 	if (err)
@@ -574,7 +631,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	int encap_limit = -1;
-	__u16 offset;
 	struct flowi6 fl6;
 	__u8 dsfield;
 	__u32 mtu;
@@ -583,37 +639,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 	if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
 		return -1;
 
-	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
-	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
-	ipv6h = ipv6_hdr(skb);
-
-	if (offset > 0) {
-		struct ipv6_tlv_tnl_enc_lim *tel;
-		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
-		if (tel->encap_limit == 0) {
-			icmpv6_send(skb, ICMPV6_PARAMPROB,
-				    ICMPV6_HDR_FIELD, offset + 2);
-			return -1;
-		}
-		encap_limit = tel->encap_limit - 1;
-	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
-
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		dsfield = ipv6_get_dsfield(ipv6h);
-	else
-		dsfield = ip6_tclass(t->parms.flowinfo);
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
-		fl6.flowlabel |= ip6_flowlabel(ipv6h);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
-	else
-		fl6.flowi6_mark = t->parms.fwmark;
-
-	fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+	if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
+		return -1;
 
 	if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
 		return -1;
-- 
cgit v1.2.3


From 5a963eb61b7c39e6c422b6e48619d19d04719358 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 30 Nov 2017 11:51:29 -0800
Subject: ip6_gre: Add ERSPAN native tunnel support

The patch adds support for ERSPAN tunnel over ipv6.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |   1 +
 net/ipv6/ip6_gre.c       | 270 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 267 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index d66f70f63734..109a5a8877ef 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -36,6 +36,7 @@ struct __ip6_tnl_parm {
 	__be32			o_key;
 
 	__u32			fwmark;
+	__u32			index;	/* ERSPAN type II index */
 };
 
 /* IPv6 tunnel */
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 907d2e8405e2..76379f01bcd2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -55,6 +55,7 @@
 #include <net/ip6_route.h>
 #include <net/ip6_tunnel.h>
 #include <net/gre.h>
+#include <net/erspan.h>
 
 
 static bool log_ecn_error = true;
@@ -73,6 +74,7 @@ struct ip6gre_net {
 
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly;
 static struct rtnl_link_ops ip6gre_tap_ops __read_mostly;
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly;
 static int ip6gre_tunnel_init(struct net_device *dev);
 static void ip6gre_tunnel_setup(struct net_device *dev);
 static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
@@ -121,7 +123,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	unsigned int h1 = HASH_KEY(key);
 	struct ip6_tnl *t, *cand = NULL;
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
-	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+	int dev_type = (gre_proto == htons(ETH_P_TEB) ||
+			gre_proto == htons(ETH_P_ERSPAN)) ?
 		       ARPHRD_ETHER : ARPHRD_IP6GRE;
 	int score, cand_score = 4;
 
@@ -468,6 +471,41 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 	return PACKET_REJECT;
 }
 
+static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
+			 struct tnl_ptk_info *tpi)
+{
+	const struct ipv6hdr *ipv6h;
+	struct erspanhdr *ershdr;
+	struct ip6_tnl *tunnel;
+	__be32 index;
+
+	ipv6h = ipv6_hdr(skb);
+	ershdr = (struct erspanhdr *)skb->data;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
+		return PACKET_REJECT;
+
+	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+	index = ershdr->md.index;
+
+	tunnel = ip6gre_tunnel_lookup(skb->dev,
+				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+				      tpi->proto);
+	if (tunnel) {
+		if (__iptunnel_pull_header(skb, sizeof(*ershdr),
+					   htons(ETH_P_TEB),
+					   false, false) < 0)
+			return PACKET_REJECT;
+
+		tunnel->parms.index = ntohl(index);
+		ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+
+		return PACKET_RCVD;
+	}
+
+	return PACKET_REJECT;
+}
+
 static int gre_rcv(struct sk_buff *skb)
 {
 	struct tnl_ptk_info tpi;
@@ -481,6 +519,12 @@ static int gre_rcv(struct sk_buff *skb)
 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
 		goto drop;
 
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+		if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
+			return 0;
+		goto drop;
+	}
+
 	if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
 		return 0;
 
@@ -732,6 +776,88 @@ tx_err:
 	return NETDEV_TX_OK;
 }
 
+static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device_stats *stats;
+	bool truncate = false;
+	int encap_limit = -1;
+	__u8 dsfield = false;
+	struct flowi6 fl6;
+	int err = -EINVAL;
+	__u32 mtu;
+
+	if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+		goto tx_err;
+
+	if (gre_handle_offloads(skb, false))
+		goto tx_err;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+		prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+					 &dsfield, &encap_limit);
+		break;
+	case htons(ETH_P_IPV6):
+		if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+			goto tx_err;
+		if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
+					     &dsfield, &encap_limit))
+			goto tx_err;
+		break;
+	default:
+		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+		break;
+	}
+
+	if (skb->len > dev->mtu + dev->hard_header_len) {
+		pskb_trim(skb, dev->mtu + dev->hard_header_len);
+		truncate = true;
+	}
+
+	erspan_build_header(skb, t->parms.o_key, t->parms.index,
+			    truncate, false);
+	t->parms.o_flags &= ~TUNNEL_KEY;
+
+	IPCB(skb)->flags = 0;
+	fl6.daddr = t->parms.raddr;
+
+	/* Push GRE header. */
+	gre_build_header(skb, 8, TUNNEL_SEQ,
+			 htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++));
+
+	/* TooBig packet may have updated dst->dev's mtu */
+	if (dst && dst_mtu(dst) > dst->dev->mtu)
+		dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
+
+	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+			   NEXTHDR_GRE);
+	if (err != 0) {
+		/* XXX: send ICMP error even if DF is not set. */
+		if (err == -EMSGSIZE) {
+			if (skb->protocol == htons(ETH_P_IP))
+				icmp_send(skb, ICMP_DEST_UNREACH,
+					  ICMP_FRAG_NEEDED, htonl(mtu));
+			else
+				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		}
+
+		goto tx_err;
+	}
+	return NETDEV_TX_OK;
+
+tx_err:
+	stats = &t->dev->stats;
+	stats->tx_errors++;
+	stats->tx_dropped++;
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
 static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 {
 	struct net_device *dev = t->dev;
@@ -1111,7 +1237,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
 	dev_hold(dev);
 }
 
-
 static struct inet6_protocol ip6gre_protocol __read_mostly = {
 	.handler     = gre_rcv,
 	.err_handler = ip6gre_err,
@@ -1126,7 +1251,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
 
 	for_each_netdev_safe(net, dev, aux)
 		if (dev->rtnl_link_ops == &ip6gre_link_ops ||
-		    dev->rtnl_link_ops == &ip6gre_tap_ops)
+		    dev->rtnl_link_ops == &ip6gre_tap_ops ||
+		    dev->rtnl_link_ops == &ip6erspan_tap_ops)
 			unregister_netdevice_queue(dev, head);
 
 	for (prio = 0; prio < 4; prio++) {
@@ -1248,6 +1374,47 @@ out:
 	return ip6gre_tunnel_validate(tb, data, extack);
 }
 
+static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+				  struct netlink_ext_ack *extack)
+{
+	__be16 flags = 0;
+	int ret;
+
+	if (!data)
+		return 0;
+
+	ret = ip6gre_tap_validate(tb, data, extack);
+	if (ret)
+		return ret;
+
+	/* ERSPAN should only have GRE sequence and key flag */
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (!data[IFLA_GRE_COLLECT_METADATA] &&
+	    flags != (GRE_SEQ | GRE_KEY))
+		return -EINVAL;
+
+	/* ERSPAN Session ID only has 10-bit. Since we reuse
+	 * 32-bit key field as ID, check it's range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_ERSPAN_INDEX]) {
+		u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+		if (index & ~INDEX_MASK)
+			return -EINVAL;
+	}
+	return 0;
+}
 
 static void ip6gre_netlink_parms(struct nlattr *data[],
 				struct __ip6_tnl_parm *parms)
@@ -1294,6 +1461,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 
 	if (data[IFLA_GRE_FWMARK])
 		parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
+	if (data[IFLA_GRE_ERSPAN_INDEX])
+		parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
 }
 
 static int ip6gre_tap_init(struct net_device *dev)
@@ -1330,6 +1500,59 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
 		       NETIF_F_HIGHDMA |		\
 		       NETIF_F_HW_CSUM)
 
+static int ip6erspan_tap_init(struct net_device *dev)
+{
+	struct ip6_tnl *tunnel;
+	int t_hlen;
+	int ret;
+
+	tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
+	strcpy(tunnel->parms.name, dev->name);
+
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (ret) {
+		free_percpu(dev->tstats);
+		dev->tstats = NULL;
+		return ret;
+	}
+
+	tunnel->tun_hlen = 8;
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+		       sizeof(struct erspanhdr);
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+
+	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	dev->mtu = ETH_DATA_LEN - t_hlen;
+	if (dev->type == ARPHRD_ETHER)
+		dev->mtu -= ETH_HLEN;
+	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		dev->mtu -= 8;
+
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	tunnel = netdev_priv(dev);
+	ip6gre_tnl_link_config(tunnel, 1);
+
+	return 0;
+}
+
+static const struct net_device_ops ip6erspan_netdev_ops = {
+	.ndo_init =		ip6erspan_tap_init,
+	.ndo_uninit =		ip6gre_tunnel_uninit,
+	.ndo_start_xmit =	ip6erspan_tunnel_xmit,
+	.ndo_set_mac_address =	eth_mac_addr,
+	.ndo_validate_addr =	eth_validate_addr,
+	.ndo_change_mtu =	ip6_tnl_change_mtu,
+	.ndo_get_stats64 =	ip_tunnel_get_stats64,
+	.ndo_get_iflink =	ip6_tnl_get_iflink,
+};
+
 static void ip6gre_tap_setup(struct net_device *dev)
 {
 
@@ -1521,6 +1744,8 @@ static size_t ip6gre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_FWMARK */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
 		0;
 }
 
@@ -1542,7 +1767,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
 	    nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
 	    nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) ||
-	    nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark))
+	    nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) ||
+	    nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
 		goto nla_put_failure;
 
 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
@@ -1578,8 +1804,23 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
 	[IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
 };
 
+static void ip6erspan_tap_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &ip6erspan_netdev_ops;
+	dev->needs_free_netdev = true;
+	dev->priv_destructor = ip6gre_dev_free;
+
+	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	netif_keep_dst(dev);
+}
+
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.kind		= "ip6gre",
 	.maxtype	= IFLA_GRE_MAX,
@@ -1609,6 +1850,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
 	.get_link_net	= ip6_tnl_get_link_net,
 };
 
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
+	.kind		= "ip6erspan",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ip6gre_policy,
+	.priv_size	= sizeof(struct ip6_tnl),
+	.setup		= ip6erspan_tap_setup,
+	.validate	= ip6erspan_tap_validate,
+	.newlink	= ip6gre_newlink,
+	.changelink	= ip6gre_changelink,
+	.get_size	= ip6gre_get_size,
+	.fill_info	= ip6gre_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
+};
+
 /*
  *	And now the modules code and kernel interface.
  */
@@ -1637,9 +1892,15 @@ static int __init ip6gre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	err = rtnl_link_register(&ip6erspan_tap_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
 out:
 	return err;
 
+erspan_link_failed:
+	rtnl_link_unregister(&ip6gre_tap_ops);
 tap_ops_failed:
 	rtnl_link_unregister(&ip6gre_link_ops);
 rtnl_link_failed:
@@ -1653,6 +1914,7 @@ static void __exit ip6gre_fini(void)
 {
 	rtnl_link_unregister(&ip6gre_tap_ops);
 	rtnl_link_unregister(&ip6gre_link_ops);
+	rtnl_link_unregister(&ip6erspan_tap_ops);
 	inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
 	unregister_pernet_device(&ip6gre_net_ops);
 }
-- 
cgit v1.2.3


From 118b4aa25d90d0930611b71dd28a749c67309ccb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 1 Dec 2017 15:08:55 -0800
Subject: net: xdp: avoid output parameters when querying XDP prog

The output parameters will get unwieldy if we want to add more
information about the program.  Simply pass the entire
struct netdev_bpf in.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h |  3 ++-
 net/core/dev.c            | 24 ++++++++++++++----------
 net/core/rtnetlink.c      |  6 +++++-
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef789e1d679e..667bdd3ad33e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3330,7 +3330,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id);
+void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
+		     struct netdev_bpf *xdp);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 07ed21d64f92..3f271c9cb5e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7073,17 +7073,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id)
+void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		     struct netdev_bpf *xdp)
 {
-	struct netdev_bpf xdp;
-
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_QUERY_PROG;
+	memset(xdp, 0, sizeof(*xdp));
+	xdp->command = XDP_QUERY_PROG;
 
 	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, &xdp) < 0);
-	if (prog_id)
-		*prog_id = xdp.prog_id;
+	WARN_ON(bpf_op(dev, xdp) < 0);
+}
+
+static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+{
+	struct netdev_bpf xdp;
+
+	__dev_xdp_query(dev, bpf_op, &xdp);
 
 	return xdp.prog_attached;
 }
@@ -7134,10 +7138,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL))
+		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op, NULL))
+		    __dev_xdp_attached(dev, bpf_op))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..9c4cb584bfb0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1261,6 +1261,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
+	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
@@ -1273,7 +1274,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 	if (!ops->ndo_bpf)
 		return XDP_ATTACHED_NONE;
 
-	return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id);
+	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
+	*prog_id = xdp.prog_id;
+
+	return xdp.prog_attached;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
-- 
cgit v1.2.3


From bd0b2e7fe611953470ec7c533b455fb2abd382cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 1 Dec 2017 15:08:57 -0800
Subject: net: xdp: make the stack take care of the tear down

Since day one of XDP, drivers have had to remember to free the program
on the remove path.  This leads to code duplication and is error
prone.  Make the stack query the installed programs on unregister
and, if something is installed, remove the program.  Freeing of the
program attached to generic XDP is moved from free_netdev() as well.

Because the remove will now be called before notifiers are
invoked, BPF offload state of the program will not get destroyed
before uninstall.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  3 ---
 drivers/net/ethernet/netronome/nfp/bpf/main.c      |  7 ------
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  3 ---
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  4 ---
 drivers/net/tun.c                                  |  4 ---
 net/core/dev.c                                     | 29 ++++++++++++++++------
 7 files changed, 22 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c5c38d4b7d1c..8c1dd60eab6f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7800,8 +7800,6 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	bnxt_dcb_free(bp);
 	kfree(bp->edev);
 	bp->edev = NULL;
-	if (bp->xdp_prog)
-		bpf_prog_put(bp->xdp_prog);
 	bnxt_cleanup_pci(bp);
 	free_netdev(dev);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d2b057a3e512..0f5c012de52e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4308,9 +4308,6 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
 	mlx5e_ipsec_cleanup(priv);
 	mlx5e_vxlan_cleanup(priv);
-
-	if (priv->channels.params.xdp_prog)
-		bpf_prog_put(priv->channels.params.xdp_prog);
 }
 
 static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index e379b78e86ef..54bfd7846f6d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -82,12 +82,6 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn)
 	return nfp_net_ebpf_capable(nn) ? "BPF" : "";
 }
 
-static void nfp_bpf_vnic_free(struct nfp_app *app, struct nfp_net *nn)
-{
-	if (nn->dp.bpf_offload_xdp)
-		nfp_bpf_xdp_offload(app, nn, NULL);
-}
-
 static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 				     void *type_data, void *cb_priv)
 {
@@ -168,7 +162,6 @@ const struct nfp_app_type app_bpf = {
 	.extra_cap	= nfp_bpf_extra_cap,
 
 	.vnic_alloc	= nfp_app_nic_vnic_alloc,
-	.vnic_free	= nfp_bpf_vnic_free,
 
 	.setup_tc	= nfp_bpf_setup_tc,
 	.tc_busy	= nfp_bpf_tc_busy,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index ea6bbf1efefc..ad3e9f6a61e5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3562,9 +3562,6 @@ struct nfp_net *nfp_net_alloc(struct pci_dev *pdev, bool needs_netdev,
  */
 void nfp_net_free(struct nfp_net *nn)
 {
-	if (nn->xdp_prog)
-		bpf_prog_put(nn->xdp_prog);
-
 	if (nn->dp.netdev)
 		free_netdev(nn->dp.netdev);
 	else
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8f9b3eb82137..57332b3e5e64 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1068,10 +1068,6 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode)
 
 	pci_set_drvdata(pdev, NULL);
 
-	/* Release edev's reference to XDP's bpf if such exist */
-	if (edev->xdp_prog)
-		bpf_prog_put(edev->xdp_prog);
-
 	/* Use global ops since we've freed edev */
 	qed_ops->common->slowpath_stop(cdev);
 	if (system_state == SYSTEM_POWER_OFF)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 95749006d687..6746e498dc61 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -673,7 +673,6 @@ static void tun_detach(struct tun_file *tfile, bool clean)
 static void tun_detach_all(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
 	struct tun_file *tfile, *tmp;
 	int i, n = tun->numqueues;
 
@@ -708,9 +707,6 @@ static void tun_detach_all(struct net_device *dev)
 	}
 	BUG_ON(tun->numdisabled != 0);
 
-	if (xdp_prog)
-		bpf_prog_put(xdp_prog);
-
 	if (tun->flags & IFF_PERSIST)
 		module_put(THIS_MODULE);
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 3f271c9cb5e0..6bea8931bb62 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7110,6 +7110,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 	return bpf_op(dev, &xdp);
 }
 
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+	struct netdev_bpf xdp;
+	bpf_op_t ndo_bpf;
+
+	/* Remove generic XDP */
+	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+
+	/* Remove from the driver */
+	ndo_bpf = dev->netdev_ops->ndo_bpf;
+	if (!ndo_bpf)
+		return;
+
+	__dev_xdp_query(dev, ndo_bpf, &xdp);
+	if (xdp.prog_attached == XDP_ATTACHED_NONE)
+		return;
+
+	/* Program removal should always succeed */
+	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -7240,6 +7261,7 @@ static void rollback_registered_many(struct list_head *head)
 		/* Shutdown queueing discipline. */
 		dev_shutdown(dev);
 
+		dev_xdp_uninstall(dev);
 
 		/* Notify protocols, that we are about to destroy
 		 * this device. They should clean all the things.
@@ -8199,7 +8221,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
 void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
-	struct bpf_prog *prog;
 
 	might_sleep();
 	netif_free_tx_queues(dev);
@@ -8218,12 +8239,6 @@ void free_netdev(struct net_device *dev)
 	free_percpu(dev->pcpu_refcnt);
 	dev->pcpu_refcnt = NULL;
 
-	prog = rcu_dereference_protected(dev->xdp_prog, 1);
-	if (prog) {
-		bpf_prog_put(prog);
-		static_key_slow_dec(&generic_xdp_needed);
-	}
-
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		netdev_freemem(dev);
-- 
cgit v1.2.3


From 183dea5818315c0a172d21ecbcd2554894bf01e3 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 30 Nov 2017 15:35:33 +0100
Subject: openvswitch: do not propagate headroom updates to internal port

After commit 3a927bc7cf9d ("ovs: propagate per dp max headroom to
all vports") the needed_headroom for the internal vport is updated
according to the max needed headroom in its datapath.

That avoids the pskb_expand_head() costs when sending/forwarding
packets towards tunnel devices, at least for some scenarios.

We still require such copy when using the ovs-preferred configuration
for vxlan tunnels:

    br_int
  /       \
tap      vxlan
           (remote_ip:X)

br_phy
     \
    NIC

where the route towards the IP 'X' is via 'br_phy'.

When forwarding traffic from the tap towards the vxlan device, we
will call pskb_expand_head() in vxlan_build_skb() because
br_phy->needed_headroom is equal to tun->needed_headroom.

With this change we avoid updating the internal vport needed_headroom,
so that in the above scenario no head copy is needed, giving 5%
performance improvement in UDP throughput test.

As a trade-off, packets sent from the internal port towards a tunnel
device will now experience the head copy overhead. The rationale is
that the latter use-case is less relevant performance-wise.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport-internal_dev.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 04a3128adcf0..3e7747549f90 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -126,18 +126,12 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	}
 }
 
-static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
-{
-	dev->needed_headroom = new_hr < 0 ? 0 : new_hr;
-}
-
 static const struct net_device_ops internal_dev_netdev_ops = {
 	.ndo_open = internal_dev_open,
 	.ndo_stop = internal_dev_stop,
 	.ndo_start_xmit = internal_dev_xmit,
 	.ndo_set_mac_address = eth_mac_addr,
 	.ndo_get_stats64 = internal_get_stats,
-	.ndo_set_rx_headroom = internal_set_rx_headroom,
 };
 
 static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -154,7 +148,7 @@ static void do_setup(struct net_device *netdev)
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
-			      IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
+			      IFF_NO_QUEUE;
 	netdev->needs_free_netdev = true;
 	netdev->priv_destructor = internal_dev_destructor;
 	netdev->ethtool_ops = &internal_dev_ethtool_ops;
@@ -195,7 +189,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
 		err = -ENOMEM;
 		goto error_free_netdev;
 	}
-	vport->dev->needed_headroom = vport->dp->max_headroom;
 
 	dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
 	internal_dev = internal_dev_priv(vport->dev);
-- 
cgit v1.2.3


From 80e023607982faa6245507c45acf93bb0feb0ded Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 30 Nov 2017 11:23:57 -0500
Subject: net: dsa: remove trans argument from vlan ops

The DSA switch VLAN ops pass the switchdev_trans structure down to the
drivers, but no one is using it and they aren't supposed to anyway.

Remove the trans argument from VLAN prepare and add operations.

At the same time, fix the following checkpatch warning:

    WARNING: line over 80 characters
    #74: FILE: drivers/net/dsa/dsa_loop.c:177:
    +				      const struct switchdev_obj_port_vlan *vlan)

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c       |  6 ++----
 drivers/net/dsa/b53/b53_priv.h         |  6 ++----
 drivers/net/dsa/dsa_loop.c             |  9 ++++-----
 drivers/net/dsa/microchip/ksz_common.c |  6 ++----
 drivers/net/dsa/mv88e6xxx/chip.c       |  6 ++----
 include/net/dsa.h                      | 10 ++++------
 net/dsa/switch.c                       |  4 ++--
 7 files changed, 18 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 7f26f5dafca7..561b05089cb6 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1029,8 +1029,7 @@ int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering)
 EXPORT_SYMBOL(b53_vlan_filtering);
 
 int b53_vlan_prepare(struct dsa_switch *ds, int port,
-		     const struct switchdev_obj_port_vlan *vlan,
-		     struct switchdev_trans *trans)
+		     const struct switchdev_obj_port_vlan *vlan)
 {
 	struct b53_device *dev = ds->priv;
 
@@ -1047,8 +1046,7 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port,
 EXPORT_SYMBOL(b53_vlan_prepare);
 
 void b53_vlan_add(struct dsa_switch *ds, int port,
-		  const struct switchdev_obj_port_vlan *vlan,
-		  struct switchdev_trans *trans)
+		  const struct switchdev_obj_port_vlan *vlan)
 {
 	struct b53_device *dev = ds->priv;
 	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 2af0155efce2..d954cf36ecd8 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -295,11 +295,9 @@ void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
 void b53_br_fast_age(struct dsa_switch *ds, int port);
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering);
 int b53_vlan_prepare(struct dsa_switch *ds, int port,
-		     const struct switchdev_obj_port_vlan *vlan,
-		     struct switchdev_trans *trans);
+		     const struct switchdev_obj_port_vlan *vlan);
 void b53_vlan_add(struct dsa_switch *ds, int port,
-		  const struct switchdev_obj_port_vlan *vlan,
-		  struct switchdev_trans *trans);
+		  const struct switchdev_obj_port_vlan *vlan);
 int b53_vlan_del(struct dsa_switch *ds, int port,
 		 const struct switchdev_obj_port_vlan *vlan);
 int b53_fdb_add(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index bb71d3d6f65b..7aa84ee4e771 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -174,9 +174,9 @@ static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port,
 	return 0;
 }
 
-static int dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port,
-				      const struct switchdev_obj_port_vlan *vlan,
-				      struct switchdev_trans *trans)
+static int
+dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port,
+			   const struct switchdev_obj_port_vlan *vlan)
 {
 	struct dsa_loop_priv *ps = ds->priv;
 	struct mii_bus *bus = ps->bus;
@@ -193,8 +193,7 @@ static int dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port,
 }
 
 static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port,
-				   const struct switchdev_obj_port_vlan *vlan,
-				   struct switchdev_trans *trans)
+				   const struct switchdev_obj_port_vlan *vlan)
 {
 	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
 	bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index b5be93a1e0df..25b94edc5526 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -559,8 +559,7 @@ static int ksz_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag)
 }
 
 static int ksz_port_vlan_prepare(struct dsa_switch *ds, int port,
-				 const struct switchdev_obj_port_vlan *vlan,
-				 struct switchdev_trans *trans)
+				 const struct switchdev_obj_port_vlan *vlan)
 {
 	/* nothing needed */
 
@@ -568,8 +567,7 @@ static int ksz_port_vlan_prepare(struct dsa_switch *ds, int port,
 }
 
 static void ksz_port_vlan_add(struct dsa_switch *ds, int port,
-			      const struct switchdev_obj_port_vlan *vlan,
-			      struct switchdev_trans *trans)
+			      const struct switchdev_obj_port_vlan *vlan)
 {
 	struct ksz_device *dev = ds->priv;
 	u32 vlan_table[3];
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 8171055fde7a..eff624fbd220 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1185,8 +1185,7 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 
 static int
 mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port,
-			    const struct switchdev_obj_port_vlan *vlan,
-			    struct switchdev_trans *trans)
+			    const struct switchdev_obj_port_vlan *vlan)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 	int err;
@@ -1295,8 +1294,7 @@ static int _mv88e6xxx_port_vlan_add(struct mv88e6xxx_chip *chip, int port,
 }
 
 static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
-				    const struct switchdev_obj_port_vlan *vlan,
-				    struct switchdev_trans *trans)
+				    const struct switchdev_obj_port_vlan *vlan)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2a05738570d8..0c4fbb34379e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -412,12 +412,10 @@ struct dsa_switch_ops {
 	 */
 	int	(*port_vlan_filtering)(struct dsa_switch *ds, int port,
 				       bool vlan_filtering);
-	int	(*port_vlan_prepare)(struct dsa_switch *ds, int port,
-				     const struct switchdev_obj_port_vlan *vlan,
-				     struct switchdev_trans *trans);
-	void	(*port_vlan_add)(struct dsa_switch *ds, int port,
-				 const struct switchdev_obj_port_vlan *vlan,
-				 struct switchdev_trans *trans);
+	int (*port_vlan_prepare)(struct dsa_switch *ds, int port,
+				 const struct switchdev_obj_port_vlan *vlan);
+	void (*port_vlan_add)(struct dsa_switch *ds, int port,
+			      const struct switchdev_obj_port_vlan *vlan);
 	int	(*port_vlan_del)(struct dsa_switch *ds, int port,
 				 const struct switchdev_obj_port_vlan *vlan);
 	/*
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 29608d087a7c..205f074fa524 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -178,7 +178,7 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
 			return -EOPNOTSUPP;
 
 		for_each_set_bit(port, members, ds->num_ports) {
-			err = ds->ops->port_vlan_prepare(ds, port, vlan, trans);
+			err = ds->ops->port_vlan_prepare(ds, port, vlan);
 			if (err)
 				return err;
 		}
@@ -187,7 +187,7 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
 	}
 
 	for_each_set_bit(port, members, ds->num_ports)
-		ds->ops->port_vlan_add(ds, port, vlan, trans);
+		ds->ops->port_vlan_add(ds, port, vlan);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3709aadc8375a1b0c42da5b12e38eddf8133dd4e Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 30 Nov 2017 11:23:58 -0500
Subject: net: dsa: remove trans argument from mdb ops

The DSA switch MDB ops pass the switchdev_trans structure down to the
drivers, but no one is using it and they aren't supposed to anyway.

Remove the trans argument from MDB prepare and add operations.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/lan9303-core.c         |  6 ++----
 drivers/net/dsa/microchip/ksz_common.c |  6 ++----
 drivers/net/dsa/mv88e6xxx/chip.c       |  6 ++----
 include/net/dsa.h                      | 10 ++++------
 net/dsa/switch.c                       |  4 ++--
 5 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index b24566bb74d2..ea59dadefb33 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -1217,8 +1217,7 @@ static int lan9303_port_fdb_dump(struct dsa_switch *ds, int port,
 }
 
 static int lan9303_port_mdb_prepare(struct dsa_switch *ds, int port,
-				    const struct switchdev_obj_port_mdb *mdb,
-				    struct switchdev_trans *trans)
+				    const struct switchdev_obj_port_mdb *mdb)
 {
 	struct lan9303 *chip = ds->priv;
 
@@ -1235,8 +1234,7 @@ static int lan9303_port_mdb_prepare(struct dsa_switch *ds, int port,
 }
 
 static void lan9303_port_mdb_add(struct dsa_switch *ds, int port,
-				 const struct switchdev_obj_port_mdb *mdb,
-				 struct switchdev_trans *trans)
+				 const struct switchdev_obj_port_mdb *mdb)
 {
 	struct lan9303 *chip = ds->priv;
 
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 25b94edc5526..663b0d5b982b 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -856,16 +856,14 @@ exit:
 }
 
 static int ksz_port_mdb_prepare(struct dsa_switch *ds, int port,
-				const struct switchdev_obj_port_mdb *mdb,
-				struct switchdev_trans *trans)
+				const struct switchdev_obj_port_mdb *mdb)
 {
 	/* nothing to do */
 	return 0;
 }
 
 static void ksz_port_mdb_add(struct dsa_switch *ds, int port,
-			     const struct switchdev_obj_port_mdb *mdb,
-			     struct switchdev_trans *trans)
+			     const struct switchdev_obj_port_mdb *mdb)
 {
 	struct ksz_device *dev = ds->priv;
 	u32 static_table[4];
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index eff624fbd220..b5e0987c88f0 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3786,8 +3786,7 @@ free:
 }
 
 static int mv88e6xxx_port_mdb_prepare(struct dsa_switch *ds, int port,
-				      const struct switchdev_obj_port_mdb *mdb,
-				      struct switchdev_trans *trans)
+				      const struct switchdev_obj_port_mdb *mdb)
 {
 	/* We don't need any dynamic resource from the kernel (yet),
 	 * so skip the prepare phase.
@@ -3797,8 +3796,7 @@ static int mv88e6xxx_port_mdb_prepare(struct dsa_switch *ds, int port,
 }
 
 static void mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port,
-				   const struct switchdev_obj_port_mdb *mdb,
-				   struct switchdev_trans *trans)
+				   const struct switchdev_obj_port_mdb *mdb)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0c4fbb34379e..6700dff46a80 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -431,12 +431,10 @@ struct dsa_switch_ops {
 	/*
 	 * Multicast database
 	 */
-	int	(*port_mdb_prepare)(struct dsa_switch *ds, int port,
-				    const struct switchdev_obj_port_mdb *mdb,
-				    struct switchdev_trans *trans);
-	void	(*port_mdb_add)(struct dsa_switch *ds, int port,
-				const struct switchdev_obj_port_mdb *mdb,
-				struct switchdev_trans *trans);
+	int (*port_mdb_prepare)(struct dsa_switch *ds, int port,
+				const struct switchdev_obj_port_mdb *mdb);
+	void (*port_mdb_add)(struct dsa_switch *ds, int port,
+			     const struct switchdev_obj_port_mdb *mdb);
 	int	(*port_mdb_del)(struct dsa_switch *ds, int port,
 				const struct switchdev_obj_port_mdb *mdb);
 	/*
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 205f074fa524..5ee04e9b5796 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -129,7 +129,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 			return -EOPNOTSUPP;
 
 		for_each_set_bit(port, group, ds->num_ports) {
-			err = ds->ops->port_mdb_prepare(ds, port, mdb, trans);
+			err = ds->ops->port_mdb_prepare(ds, port, mdb);
 			if (err)
 				return err;
 		}
@@ -138,7 +138,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 	}
 
 	for_each_set_bit(port, group, ds->num_ports)
-		ds->ops->port_mdb_add(ds, port, mdb, trans);
+		ds->ops->port_mdb_add(ds, port, mdb);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 9c428c593fb7533595c439b510e5eb5e94aec65e Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 30 Nov 2017 11:23:59 -0500
Subject: net: dsa: add switch vlan bitmap functions

This patch brings no functional changes.
It moves out the VLAN code iterating on a list of VLAN members into new
dsa_switch_vlan_{prepare,add}_bitmap() functions.

This gives us a better isolation of the two switchdev phases.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/switch.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 5ee04e9b5796..17cd03d6bc7d 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -157,13 +157,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds,
 	return 0;
 }
 
+static int
+dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
+			       const struct switchdev_obj_port_vlan *vlan,
+			       const unsigned long *bitmap)
+{
+	int port, err;
+
+	if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
+		return -EOPNOTSUPP;
+
+	for_each_set_bit(port, bitmap, ds->num_ports) {
+		err = ds->ops->port_vlan_prepare(ds, port, vlan);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void
+dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
+			   const struct switchdev_obj_port_vlan *vlan,
+			   const unsigned long *bitmap)
+{
+	int port;
+
+	for_each_set_bit(port, bitmap, ds->num_ports)
+		ds->ops->port_vlan_add(ds, port, vlan);
+}
+
 static int dsa_switch_vlan_add(struct dsa_switch *ds,
 			       struct dsa_notifier_vlan_info *info)
 {
 	const struct switchdev_obj_port_vlan *vlan = info->vlan;
 	struct switchdev_trans *trans = info->trans;
 	DECLARE_BITMAP(members, ds->num_ports);
-	int port, err;
+	int port;
 
 	/* Build a mask of VLAN members */
 	bitmap_zero(members, ds->num_ports);
@@ -173,21 +203,10 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
 		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
 			set_bit(port, members);
 
-	if (switchdev_trans_ph_prepare(trans)) {
-		if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
-			return -EOPNOTSUPP;
+	if (switchdev_trans_ph_prepare(trans))
+		return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
 
-		for_each_set_bit(port, members, ds->num_ports) {
-			err = ds->ops->port_vlan_prepare(ds, port, vlan);
-			if (err)
-				return err;
-		}
-
-		return 0;
-	}
-
-	for_each_set_bit(port, members, ds->num_ports)
-		ds->ops->port_vlan_add(ds, port, vlan);
+	dsa_switch_vlan_add_bitmap(ds, vlan, members);
 
 	return 0;
 }
-- 
cgit v1.2.3


From e6db98db8a9595f8001958e489ff07ed97a15a54 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 30 Nov 2017 11:24:00 -0500
Subject: net: dsa: add switch mdb bitmap functions

This patch brings no functional changes.
It moves out the MDB code iterating on a multicast group into new
dsa_switch_mdb_{prepare,add}_bitmap() functions.

This gives us a better isolation of the two switchdev phases.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/switch.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 17cd03d6bc7d..9a01514ea9f3 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -108,13 +108,42 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds,
 				     info->vid);
 }
 
+static int
+dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
+			      const struct switchdev_obj_port_mdb *mdb,
+			      const unsigned long *bitmap)
+{
+	int port, err;
+
+	if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
+		return -EOPNOTSUPP;
+
+	for_each_set_bit(port, bitmap, ds->num_ports) {
+		err = ds->ops->port_mdb_prepare(ds, port, mdb);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
+				      const struct switchdev_obj_port_mdb *mdb,
+				      const unsigned long *bitmap)
+{
+	int port;
+
+	for_each_set_bit(port, bitmap, ds->num_ports)
+		ds->ops->port_mdb_add(ds, port, mdb);
+}
+
 static int dsa_switch_mdb_add(struct dsa_switch *ds,
 			      struct dsa_notifier_mdb_info *info)
 {
 	const struct switchdev_obj_port_mdb *mdb = info->mdb;
 	struct switchdev_trans *trans = info->trans;
 	DECLARE_BITMAP(group, ds->num_ports);
-	int port, err;
+	int port;
 
 	/* Build a mask of Multicast group members */
 	bitmap_zero(group, ds->num_ports);
@@ -124,21 +153,10 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 		if (dsa_is_dsa_port(ds, port))
 			set_bit(port, group);
 
-	if (switchdev_trans_ph_prepare(trans)) {
-		if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
-			return -EOPNOTSUPP;
-
-		for_each_set_bit(port, group, ds->num_ports) {
-			err = ds->ops->port_mdb_prepare(ds, port, mdb);
-			if (err)
-				return err;
-		}
-
-		return 0;
-	}
+	if (switchdev_trans_ph_prepare(trans))
+		return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
 
-	for_each_set_bit(port, group, ds->num_ports)
-		ds->ops->port_mdb_add(ds, port, mdb);
+	dsa_switch_mdb_add_bitmap(ds, mdb, group);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3169241f55e194278294c7a4c43ef558c75cb0b7 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 30 Nov 2017 12:56:43 -0500
Subject: net: dsa: support cross-chip FDB operations

When a MAC address is added to or removed from a switch port in the
fabric, the target switch must program its port and adjacent switches
must program their local DSA port used to reach the target switch.

For this purpose, use the dsa_towards_port() helper to identify the
local switch port which must be programmed.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/switch.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 9a01514ea9f3..b93511726069 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,29 +83,23 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 static int dsa_switch_fdb_add(struct dsa_switch *ds,
 			      struct dsa_notifier_fdb_info *info)
 {
-	/* Do not care yet about other switch chips of the fabric */
-	if (ds->index != info->sw_index)
-		return 0;
+	int port = dsa_towards_port(ds, info->sw_index, info->port);
 
 	if (!ds->ops->port_fdb_add)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_fdb_add(ds, info->port, info->addr,
-				     info->vid);
+	return ds->ops->port_fdb_add(ds, port, info->addr, info->vid);
 }
 
 static int dsa_switch_fdb_del(struct dsa_switch *ds,
 			      struct dsa_notifier_fdb_info *info)
 {
-	/* Do not care yet about other switch chips of the fabric */
-	if (ds->index != info->sw_index)
-		return 0;
+	int port = dsa_towards_port(ds, info->sw_index, info->port);
 
 	if (!ds->ops->port_fdb_del)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_fdb_del(ds, info->port, info->addr,
-				     info->vid);
+	return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
 }
 
 static int
-- 
cgit v1.2.3


From 76d013b20ba9a5f88eee7c90ac82cbc3ee64be18 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 1 Dec 2017 12:52:29 -0800
Subject: inet: Add a count to struct inet_listen_hashbucket

This patch adds a count to the 'struct inet_listen_hashbucket'.
It counts how many sks are hashed to a bucket.  It will be
used to decide if the (to-be-added) portaddr listener's hashtable
should be used during inet[6]_lookup_listener().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h |  1 +
 net/ipv4/inet_hashtables.c    | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 2dbbbff5e1e3..4cce516c41ac 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -111,6 +111,7 @@ struct inet_bind_hashbucket {
  */
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
+	unsigned int		count;
 	struct hlist_head	head;
 };
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 427b705d7c64..80cfd3fa21ca 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -476,6 +476,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
 	else
 		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+	ilb->count++;
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
@@ -502,6 +503,7 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *ilb;
 	spinlock_t *lock;
 	bool listener = false;
 	int done;
@@ -510,7 +512,8 @@ void inet_unhash(struct sock *sk)
 		return;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &ilb->lock;
 		listener = true;
 	} else {
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -522,8 +525,11 @@ void inet_unhash(struct sock *sk)
 		done = __sk_del_node_init(sk);
 	else
 		done = __sk_nulls_del_node_init_rcu(sk);
-	if (done)
+	if (done) {
+		if (listener)
+			ilb->count--;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
 	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -658,6 +664,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
 		INIT_HLIST_HEAD(&h->listening_hash[i].head);
+		h->listening_hash[i].count = 0;
 	}
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
-- 
cgit v1.2.3


From f0b1e64c1331dd8a2f0c30fcd0838db6cb406098 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 1 Dec 2017 12:52:30 -0800
Subject: udp: Move udp[46]_portaddr_hash() to net/ip[v6].h

This patch moves udp[46]_portaddr_hash() to net/ip[v6].h.
The functions are renamed to ipv[46]_portaddr_hash().

It will be used by a later patch which adds a second listener
hashtable hashed by the address and port.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h   |  9 +++++++++
 include/net/ipv6.h | 17 +++++++++++++++++
 net/ipv4/udp.c     | 22 ++++++++--------------
 net/ipv6/udp.c     | 32 ++++++++------------------------
 4 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 9896f46cbbf1..fc9bf1b1fe2c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -26,12 +26,14 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 #include <linux/skbuff.h>
+#include <linux/jhash.h>
 
 #include <net/inet_sock.h>
 #include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
 #include <net/flow_dissector.h>
+#include <net/netns/hash.h>
 
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 
@@ -521,6 +523,13 @@ static inline unsigned int ipv4_addr_hash(__be32 ip)
 	return (__force unsigned int) ip;
 }
 
+static inline u32 ipv4_portaddr_hash(const struct net *net,
+				     __be32 saddr,
+				     unsigned int port)
+{
+	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
+}
+
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f73797e2fa60..25be4715578c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -22,6 +22,7 @@
 #include <net/flow.h>
 #include <net/flow_dissector.h>
 #include <net/snmp.h>
+#include <net/netns/hash.h>
 
 #define SIN6_LEN_RFC2133	24
 
@@ -673,6 +674,22 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
 					cpu_to_be32(0x0000ffff))) == 0UL;
 }
 
+static inline u32 ipv6_portaddr_hash(const struct net *net,
+				     const struct in6_addr *addr6,
+				     unsigned int port)
+{
+	unsigned int hash, mix = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		hash = jhash_1word(0, mix);
+	else if (ipv6_addr_v4mapped(addr6))
+		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
+	else
+		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
+
+	return hash ^ port;
+}
+
 /*
  * Check for a RFC 4843 ORCHID address
  * (Overlay Routable Cryptographic Hash Identifiers)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 36f857c87fe2..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ fail:
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
-			      unsigned int port)
-{
-	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+		ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
 	unsigned int hash2_partial =
-		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+		ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -485,7 +479,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -496,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 					  exact_dif, hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+			hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -1761,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
 
 static void udp_v4_rehash(struct sock *sk)
 {
-	u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
 					  inet_sk(sk)->inet_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 	udp_lib_rehash(sk, new_hash);
@@ -1952,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
 			    udptable->mask;
-		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2186,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 					    int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c9f91c28b81d..eecf9f0faf29 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net,
 			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-static u32 udp6_portaddr_hash(const struct net *net,
-			      const struct in6_addr *addr6,
-			      unsigned int port)
-{
-	unsigned int hash, mix = net_hash_mix(net);
-
-	if (ipv6_addr_any(addr6))
-		hash = jhash_1word(0, mix);
-	else if (ipv6_addr_v4mapped(addr6))
-		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
-	else
-		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
-
-	return hash ^ port;
-}
-
 int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+		ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
 	unsigned int hash2_partial =
-		udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+		ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 
 static void udp_v6_rehash(struct sock *sk)
 {
-	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
 					  &sk->sk_v6_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 
@@ -225,7 +209,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp6_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -236,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+			hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -705,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+		hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
 			    udptable->mask;
-		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -895,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
 			int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
-- 
cgit v1.2.3


From 61b7c691c7317529375f90f0a81a331990b1ec1b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 1 Dec 2017 12:52:31 -0800
Subject: inet: Add a 2nd listener hashtable (port+addr)

The current listener hashtable is hashed by port only.
When a process is listening on many IP addresses with the same port (e.g.
[IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
performance degrades to that of a linked list, which makes it prone to
SYN attacks.

UDP had a similar issue and a second hashtable was added to resolve it.

This patch adds a second hashtable for the listener's sockets.
The second hashtable is hashed by port and address.

It cannot reuse the existing skc_portaddr_node, which is shared
with skc_bind_node: a TCP listener needs to use skc_bind_node.
Instead, this patch adds an hlist_node 'icsk_listen_portaddr_node' to
the inet_connection_sock, which the listener (like TCP) also belongs to.

The new portaddr hashtable may need two lookups (first by IP:PORT,
then by INADDR_ANY:PORT if IP:PORT is not found).  Hence, it
implements a cutoff similar to UDP's: the new portaddr hashtable is
only consulted if the current port-only hashtable bucket has >10
sk in its linked list.

lhash2 and lhash2_mask are added to 'struct inet_hashinfo'.  I take
this chance to plug a 4-byte hole.  This is done by first moving
the existing bind_bucket_cachep up and then adding the new
(int lhash2_mask, *lhash2) after the existing bhash_size.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |   2 +
 include/net/inet_hashtables.h      |  28 +++++--
 net/ipv4/inet_hashtables.c         | 168 +++++++++++++++++++++++++++++++++++--
 net/ipv6/inet6_hashtables.c        |  66 +++++++++++++++
 4 files changed, 249 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 0358745ea059..8e1bf9ae4a5e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
+ * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -101,6 +102,7 @@ struct inet_connection_sock {
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void			  *icsk_ulp_data;
+	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:6,
 				  icsk_ca_setsockopt:1,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4cce516c41ac..9141e95529e7 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -133,12 +133,13 @@ struct inet_hashinfo {
 	/* Ok, let's try this, I give up, we do need a local binding
 	 * TCP hash as well as the others for fast bind/connect.
 	 */
+	struct kmem_cache		*bind_bucket_cachep;
 	struct inet_bind_hashbucket	*bhash;
-
 	unsigned int			bhash_size;
-	/* 4 bytes hole on 64 bit */
 
-	struct kmem_cache		*bind_bucket_cachep;
+	/* The 2nd listener table hashed by local port and address */
+	unsigned int			lhash2_mask;
+	struct inet_listen_hashbucket	*lhash2;
 
 	/* All the above members are written once at bootup and
 	 * never written again _or_ are predominantly read-access.
@@ -146,14 +147,25 @@ struct inet_hashinfo {
 	 * Now align to a new cache line as all the following members
 	 * might be often dirty.
 	 */
-	/* All sockets in TCP_LISTEN state will be in here.  This is the only
-	 * table where wildcard'd TCP sockets can exist.  Hash function here
-	 * is just local port number.
+	/* All sockets in TCP_LISTEN state will be in listening_hash.
+	 * This is the only table where wildcard'd TCP sockets can
+	 * exist.  listening_hash is only hashed by local port number.
+	 * If lhash2 is initialized, the same socket will also be hashed
+	 * to lhash2 by port and address.
 	 */
 	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
 					____cacheline_aligned_in_smp;
 };
 
+#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
+	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
+
+static inline struct inet_listen_hashbucket *
+inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
+{
+	return &h->lhash2[hash & h->lhash2_mask];
+}
+
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
 	struct inet_hashinfo *hashinfo,
 	unsigned int hash)
@@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
 void inet_put_port(struct sock *sk);
 
 void inet_hashinfo_init(struct inet_hashinfo *h);
+void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+			 unsigned long numentries, int scale,
+			 unsigned long low_limit,
+			 unsigned long high_limit);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 80cfd3fa21ca..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/bootmem.h>
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+	u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		hash = ipv6_portaddr_hash(sock_net(sk),
+					  &sk->sk_v6_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	else
+#endif
+		hash = ipv4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2)
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	else
+		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	ilb2->count++;
+	spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2 ||
+	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+	ilb2->count--;
+	spin_unlock(&ilb2->lock);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum, const __be32 daddr,
 				const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
  */
 
 /* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+				struct inet_listen_hashbucket *ilb2,
+				struct sk_buff *skb, int doff,
+				const __be32 saddr, __be16 sport,
+				const __be32 daddr, const unsigned short hnum,
+				const int dif, const int sdif)
+{
+	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr,
+				      dif, sdif, exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet_ehashfn(net, daddr, hnum,
+						     saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    struct sk_buff *skb, int doff,
@@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
 	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, daddr, hnum,
+				    dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with INADDR_ANY */
+
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet_lhash2_lookup(net, ilb2, skb, doff,
+				  saddr, sport, daddr, hnum,
+				  dif, sdif);
+
+port_lookup:
 	sk_for_each_rcu(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr,
 				      dif, sdif, exact_dif);
@@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
 	else
 		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+	inet_hash2(hashinfo, sk);
 	ilb->count++;
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
 	struct inet_listen_hashbucket *ilb;
 	spinlock_t *lock;
 	bool listener = false;
-	int done;
 
 	if (sk_unhashed(sk))
 		return;
@@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	}
 	spin_lock_bh(lock);
+	if (sk_unhashed(sk))
+		goto unlock;
+
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
 		reuseport_detach_sock(sk);
-	if (listener)
-		done = __sk_del_node_init(sk);
-	else
-		done = __sk_nulls_del_node_init_rcu(sk);
-	if (done) {
-		if (listener)
-			ilb->count--;
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	if (listener) {
+		inet_unhash2(hashinfo, sk);
+		 __sk_del_node_init(sk);
+		 ilb->count--;
+	} else {
+		__sk_nulls_del_node_init_rcu(sk);
 	}
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
 	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 		INIT_HLIST_HEAD(&h->listening_hash[i].head);
 		h->listening_hash[i].count = 0;
 	}
+
+	h->lhash2 = NULL;
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
 
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+				unsigned long numentries, int scale,
+				unsigned long low_limit,
+				unsigned long high_limit)
+{
+	unsigned int i;
+
+	h->lhash2 = alloc_large_system_hash(name,
+					    sizeof(*h->lhash2),
+					    numentries,
+					    scale,
+					    0,
+					    NULL,
+					    &h->lhash2_mask,
+					    low_limit,
+					    high_limit);
+
+	for (i = 0; i <= h->lhash2_mask; i++) {
+		spin_lock_init(&h->lhash2[i].lock);
+		INIT_HLIST_HEAD(&h->lhash2[i].head);
+		h->lhash2[i].count = 0;
+	}
+}
+
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
 {
 	unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 0d1451381f5c..2febe26de6a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 /* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+		struct inet_listen_hashbucket *ilb2,
+		struct sk_buff *skb, int doff,
+		const struct in6_addr *saddr,
+		const __be16 sport, const struct in6_addr *daddr,
+		const unsigned short hnum, const int dif, const int sdif)
+{
+	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr, dif, sdif,
+				      exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet6_ehashfn(net, daddr, hnum,
+						      saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo,
 		struct sk_buff *skb, int doff,
@@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
 	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+				     saddr, sport, daddr, hnum,
+				     dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with in6addr_any */
+
+	hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet6_lhash2_lookup(net, ilb2, skb, doff,
+				   saddr, sport, daddr, hnum,
+				   dif, sdif);
+
+port_lookup:
 	sk_for_each(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
 		if (score > hiscore) {
-- 
cgit v1.2.3


From 27da6d37e207e7ad9c692d3d0924f09e63a98e38 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 1 Dec 2017 12:52:32 -0800
Subject: tcp: Enable 2nd listener hashtable in TCP

Enable the second listener hashtable in TCP.
The scale is the same as UDP which is one slot per 2MB.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..180311636023 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3577,6 +3577,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-- 
cgit v1.2.3


From 02dd9a8afab42d63ca89a0e7dc8c45c5bebeb710 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Mon, 4 Dec 2017 13:14:39 +0100
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index edb2f239d04d..bb8003cf2296 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2017.4"
+#define BATADV_SOURCE_VERSION "2018.0"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3


From 3326afe6bac383a354c947d878e3916a74bbd59a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Sun, 19 Nov 2017 17:59:13 +0100
Subject: batman-adv: Deinline batadv_orig_hash_find, save 7339 bytes

This function compiles to 288 bytes of machine code for Linux 4.14 on
Debian Stretch amd64 with GCC (6.3.0-18) and the default configuration.
It has 27 callsites (25 used in the default config).

   text    data     bss     dec     hex filename
 179291   10317    4416  194024   2f5e8 batman-adv.ko.pre
 171952   10317    4416  186685   2d93d batman-adv.ko.post

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
[sven@narfation.org: Fix includes, correct sizes+counts in commit message]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/originator.c | 32 ++++++++++++++++++++++++++++++++
 net/batman-adv/originator.h | 37 ++-----------------------------------
 2 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 2967b86c13da..0a565d0422bb 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -30,10 +30,12 @@
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <linux/rculist.h>
+#include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 #include <linux/skbuff.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/stddef.h>
 #include <linux/workqueue.h>
 #include <net/sock.h>
 #include <uapi/linux/batman_adv.h>
@@ -55,6 +57,36 @@
 /* hash class keys */
 static struct lock_class_key batadv_orig_hash_lock_class_key;
 
+struct batadv_orig_node *
+batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
+{
+	struct batadv_hashtable *hash = bat_priv->orig_hash;
+	struct hlist_head *head;
+	struct batadv_orig_node *orig_node, *orig_node_tmp = NULL;
+	int index;
+
+	if (!hash)
+		return NULL;
+
+	index = batadv_choose_orig(data, hash->size);
+	head = &hash->table[index];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+		if (!batadv_compare_eth(orig_node, data))
+			continue;
+
+		if (!kref_get_unless_zero(&orig_node->refcount))
+			continue;
+
+		orig_node_tmp = orig_node;
+		break;
+	}
+	rcu_read_unlock();
+
+	return orig_node_tmp;
+}
+
 static void batadv_purge_orig(struct work_struct *work);
 
 /**
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index d94220a6d21a..40c7f039d5d7 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -23,14 +23,8 @@
 #include <linux/compiler.h>
 #include <linux/if_ether.h>
 #include <linux/jhash.h>
-#include <linux/kref.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/stddef.h>
 #include <linux/types.h>
 
-#include "hash.h"
-
 struct netlink_callback;
 struct seq_file;
 struct sk_buff;
@@ -100,34 +94,7 @@ static inline u32 batadv_choose_orig(const void *data, u32 size)
 	return hash % size;
 }
 
-static inline struct batadv_orig_node *
-batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
-{
-	struct batadv_hashtable *hash = bat_priv->orig_hash;
-	struct hlist_head *head;
-	struct batadv_orig_node *orig_node, *orig_node_tmp = NULL;
-	int index;
-
-	if (!hash)
-		return NULL;
-
-	index = batadv_choose_orig(data, hash->size);
-	head = &hash->table[index];
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
-		if (!batadv_compare_eth(orig_node, data))
-			continue;
-
-		if (!kref_get_unless_zero(&orig_node->refcount))
-			continue;
-
-		orig_node_tmp = orig_node;
-		break;
-	}
-	rcu_read_unlock();
-
-	return orig_node_tmp;
-}
+struct batadv_orig_node *
+batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data);
 
 #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
-- 
cgit v1.2.3


From 6712abc168ebac90b46088b89798aa31a1bc79f9 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 1 Dec 2017 15:26:08 -0800
Subject: ip6_gre: add ip6 gre and gretap collect_md mode

Similar to gre, vxlan, geneve, ipip tunnels, allow ip6 gre and gretap
tunnels to operate in collect metadata mode.  bpf_skb_[gs]et_tunnel_key()
helpers can make use of it right away.  OVS can use it as well in the
future.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c    | 105 +++++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv6/ip6_tunnel.c |   5 ++-
 2 files changed, 99 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 76379f01bcd2..1510ce9a4e4e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -56,6 +56,7 @@
 #include <net/ip6_tunnel.h>
 #include <net/gre.h>
 #include <net/erspan.h>
+#include <net/dst_metadata.h>
 
 
 static bool log_ecn_error = true;
@@ -69,6 +70,7 @@ static unsigned int ip6gre_net_id __read_mostly;
 struct ip6gre_net {
 	struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
+	struct ip6_tnl __rcu *collect_md_tun;
 	struct net_device *fb_tunnel_dev;
 };
 
@@ -229,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	if (cand)
 		return cand;
 
+	t = rcu_dereference(ign->collect_md_tun);
+	if (t && t->dev->flags & IFF_UP)
+		return t;
+
 	dev = ign->fb_tunnel_dev;
 	if (dev->flags & IFF_UP)
 		return netdev_priv(dev);
@@ -264,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
 {
 	struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, t);
+
 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
 }
@@ -273,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
 	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *iter;
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, NULL);
+
 	for (tp = ip6gre_bucket(ign, t);
 	     (iter = rtnl_dereference(*tp)) != NULL;
 	     tp = &iter->next) {
@@ -463,7 +475,22 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
 				      tpi->proto);
 	if (tunnel) {
-		ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->parms.collect_md) {
+			struct metadata_dst *tun_dst;
+			__be64 tun_id;
+			__be16 flags;
+
+			flags = tpi->flags;
+			tun_id = key32_to_tunnel_id(tpi->key);
+
+			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
+			if (!tun_dst)
+				return PACKET_REJECT;
+
+			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+		} else {
+			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		}
 
 		return PACKET_RCVD;
 	}
@@ -633,8 +660,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
-	gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
-			 protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+
+	if (tunnel->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
+		__be16 flags;
+
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info ||
+			     !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
+			return -EINVAL;
+
+		key = &tun_info->key;
+		memset(fl6, 0, sizeof(*fl6));
+		fl6->flowi6_proto = IPPROTO_GRE;
+		fl6->daddr = key->u.ipv6.dst;
+		fl6->flowlabel = key->label;
+		fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+		dsfield = key->tos;
+		flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+		tunnel->tun_hlen = gre_calc_hlen(flags);
+
+		gre_build_header(skb, tunnel->tun_hlen,
+				 flags, protocol,
+				 tunnel_id_to_key32(tun_info->key.tun_id), 0);
+
+	} else {
+		gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+				 protocol, tunnel->parms.o_key,
+				 htonl(tunnel->o_seqno));
+	}
 
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
 			    NEXTHDR_GRE);
@@ -645,13 +702,15 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 	int encap_limit = -1;
 	struct flowi6 fl6;
-	__u8 dsfield;
+	__u8 dsfield = 0;
 	__u32 mtu;
 	int err;
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit);
+	if (!t->parms.collect_md)
+		prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+					 &dsfield, &encap_limit);
 
 	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
 	if (err)
@@ -676,14 +735,15 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	int encap_limit = -1;
 	struct flowi6 fl6;
-	__u8 dsfield;
+	__u8 dsfield = 0;
 	__u32 mtu;
 	int err;
 
 	if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
 		return -1;
 
-	if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
+	if (!t->parms.collect_md &&
+	    prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
 		return -1;
 
 	if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
@@ -731,7 +791,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
 	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		encap_limit = t->parms.encap_limit;
 
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+	if (!t->parms.collect_md)
+		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
 
 	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
 	if (err)
@@ -1201,6 +1262,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		dev->mtu -= 8;
 
+	if (tunnel->parms.collect_md) {
+		dev->features |= NETIF_F_NETNS_LOCAL;
+		netif_keep_dst(dev);
+	}
+
 	return 0;
 }
 
@@ -1215,6 +1281,9 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 
 	tunnel = netdev_priv(dev);
 
+	if (tunnel->parms.collect_md)
+		return 0;
+
 	memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
 	memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
 
@@ -1464,6 +1533,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 
 	if (data[IFLA_GRE_ERSPAN_INDEX])
 		parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+	if (data[IFLA_GRE_COLLECT_METADATA])
+		parms->collect_md = true;
 }
 
 static int ip6gre_tap_init(struct net_device *dev)
@@ -1622,8 +1694,13 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 
 	ip6gre_netlink_parms(data, &nt->parms);
 
-	if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
-		return -EEXIST;
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
 
 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
@@ -1742,6 +1819,8 @@ static size_t ip6gre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_GRE_COLLECT_METADATA */
+		nla_total_size(0) +
 		/* IFLA_GRE_FWMARK */
 		nla_total_size(4) +
 		/* IFLA_GRE_ERSPAN_INDEX */
@@ -1781,6 +1860,11 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			t->encap.flags))
 		goto nla_put_failure;
 
+	if (p->collect_md) {
+		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -1803,6 +1887,7 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
+	[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
 	[IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
 	[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
 };
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3d3092adf1d2..6a3b1a54a952 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -861,7 +861,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
 		struct metadata_dst *tun_dst,
 		bool log_ecn_err)
 {
-	return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate,
+	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
 			     log_ecn_err);
 }
 EXPORT_SYMBOL(ip6_tnl_rcv);
@@ -979,6 +979,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
 	int ret = 0;
 	struct net *net = t->net;
 
+	if (t->parms.collect_md)
+		return 1;
+
 	if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
 	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
 	     (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
-- 
cgit v1.2.3


From b8da518c6e87504a6790bfb1539134c41b61a45a Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 1 Dec 2017 15:26:09 -0800
Subject: bpf: allow disabling tunnel csum for ipv6

Before this patch, BPF_F_ZERO_CSUM_TX could be used only for IPv4
tunnels.  With the introduction of ip6gretap collect_md mode, the flag
should also be supported for IPv6.

Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67fafce..8ec5a504eb28 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3026,10 +3026,11 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
-		if (flags & BPF_F_ZERO_CSUM_TX)
-			info->key.tun_flags &= ~TUNNEL_CSUM;
 	}
 
+	if (flags & BPF_F_ZERO_CSUM_TX)
+		info->key.tun_flags &= ~TUNNEL_CSUM;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From addf9b90de22f7aaad0db39bccb5d51ac47dd4e1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:05 +0100
Subject: net: rtnetlink: use rcu to free rtnl message handlers

rtnetlink is littered with READ_ONCE() because we can have read accesses
while another cpu can write to the structure we're reading by
(un)registering doit or dumpit handlers.

This patch changes this so that the (un)registering cpu allocates a new
structure and then publishes it via rcu_assign_pointer(), i.e. once
another cpu can see such a pointer, no further modifications will occur.

Based on an initial patch from Peter Zijlstra.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 154 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 101 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..ff292d3f2c41 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -63,6 +63,7 @@ struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
 	unsigned int		flags;
+	struct rcu_head		rcu;
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
@@ -127,7 +128,7 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
-static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
@@ -144,6 +145,20 @@ static inline int rtm_msgindex(int msgtype)
 	return msgindex;
 }
 
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+	struct rtnl_link **tab;
+
+	if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+		protocol = PF_UNSPEC;
+
+	tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+	if (!tab)
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
+
+	return tab[msgtype];
+}
+
 /**
  * __rtnl_register - Register a rtnetlink message type
  * @protocol: Protocol family or PF_UNSPEC
@@ -166,28 +181,52 @@ int __rtnl_register(int protocol, int msgtype,
 		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 		    unsigned int flags)
 {
-	struct rtnl_link *tab;
+	struct rtnl_link **tab, *link, *old;
 	int msgindex;
+	int ret = -ENOBUFS;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
-	tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]);
+	rtnl_lock();
+	tab = rtnl_msg_handlers[protocol];
 	if (tab == NULL) {
-		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
-		if (tab == NULL)
-			return -ENOBUFS;
+		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+		if (!tab)
+			goto unlock;
 
+		/* ensures we see the 0 stores */
 		rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
 	}
 
+	old = rtnl_dereference(tab[msgindex]);
+	if (old) {
+		link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	} else {
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	}
+
+	WARN_ON(doit && link->doit && link->doit != doit);
 	if (doit)
-		tab[msgindex].doit = doit;
+		link->doit = doit;
+	WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
 	if (dumpit)
-		tab[msgindex].dumpit = dumpit;
-	tab[msgindex].flags |= flags;
+		link->dumpit = dumpit;
 
-	return 0;
+	link->flags |= flags;
+
+	/* publish protocol:msgtype */
+	rcu_assign_pointer(tab[msgindex], link);
+	ret = 0;
+	if (old)
+		kfree_rcu(old, rcu);
+unlock:
+	rtnl_unlock();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__rtnl_register);
 
@@ -220,24 +259,25 @@ EXPORT_SYMBOL_GPL(rtnl_register);
  */
 int rtnl_unregister(int protocol, int msgtype)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
 	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
-	if (!handlers) {
+	tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	if (!tab) {
 		rtnl_unlock();
 		return -ENOENT;
 	}
 
-	handlers[msgindex].doit = NULL;
-	handlers[msgindex].dumpit = NULL;
-	handlers[msgindex].flags = 0;
+	link = tab[msgindex];
+	rcu_assign_pointer(tab[msgindex], NULL);
 	rtnl_unlock();
 
+	kfree_rcu(link, rcu);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister);
@@ -251,20 +291,29 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
  */
 void rtnl_unregister_all(int protocol)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
+	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	tab = rtnl_msg_handlers[protocol];
 	RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+	for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+		link = tab[msgindex];
+		if (!link)
+			continue;
+
+		rcu_assign_pointer(tab[msgindex], NULL);
+		kfree_rcu(link, rcu);
+	}
 	rtnl_unlock();
 
 	synchronize_net();
 
 	while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
 		schedule();
-	kfree(handlers);
+	kfree(tab);
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
 
@@ -2973,18 +3022,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 		s_idx = 1;
 
 	for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+		struct rtnl_link **tab;
 		int type = cb->nlh->nlmsg_type-RTM_BASE;
-		struct rtnl_link *handlers;
+		struct rtnl_link *link;
 		rtnl_dumpit_func dumpit;
 
 		if (idx < s_idx || idx == PF_PACKET)
 			continue;
 
-		handlers = rtnl_dereference(rtnl_msg_handlers[idx]);
-		if (!handlers)
+		if (type < 0 || type >= RTM_NR_MSGTYPES)
 			continue;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+		if (!tab)
+			continue;
+
+		link = tab[type];
+		if (!link)
+			continue;
+
+		dumpit = link->dumpit;
 		if (!dumpit)
 			continue;
 
@@ -4314,7 +4371,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct rtnl_link *handlers;
+	struct rtnl_link *link;
 	int err = -EOPNOTSUPP;
 	rtnl_doit_func doit;
 	unsigned int flags;
@@ -4338,32 +4395,20 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (family >= ARRAY_SIZE(rtnl_msg_handlers))
-		family = PF_UNSPEC;
-
 	rcu_read_lock();
-	handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	if (!handlers) {
-		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	}
-
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
 		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
 		u16 min_dump_alloc = 0;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
-		if (!dumpit) {
+		link = rtnl_get_link(family, type);
+		if (!link || !link->dumpit) {
 			family = PF_UNSPEC;
-			handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]);
-			if (!handlers)
-				goto err_unlock;
-
-			dumpit = READ_ONCE(handlers[type].dumpit);
-			if (!dumpit)
+			link = rtnl_get_link(family, type);
+			if (!link || !link->dumpit)
 				goto err_unlock;
 		}
+		dumpit = link->dumpit;
 
 		refcount_inc(&rtnl_msg_handlers_ref[family]);
 
@@ -4384,33 +4429,36 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return err;
 	}
 
-	doit = READ_ONCE(handlers[type].doit);
-	if (!doit) {
+	link = rtnl_get_link(family, type);
+	if (!link || !link->doit) {
 		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
+		link = rtnl_get_link(PF_UNSPEC, type);
+		if (!link || !link->doit)
+			goto out_unlock;
 	}
 
-	flags = READ_ONCE(handlers[type].flags);
+	flags = link->flags;
 	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
 		refcount_inc(&rtnl_msg_handlers_ref[family]);
-		doit = READ_ONCE(handlers[type].doit);
+		doit = link->doit;
 		rcu_read_unlock();
 		if (doit)
 			err = doit(skb, nlh, extack);
 		refcount_dec(&rtnl_msg_handlers_ref[family]);
 		return err;
 	}
-
 	rcu_read_unlock();
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[family]);
-	if (handlers) {
-		doit = READ_ONCE(handlers[type].doit);
-		if (doit)
-			err = doit(skb, nlh, extack);
-	}
+	link = rtnl_get_link(family, type);
+	if (link && link->doit)
+		err = link->doit(skb, nlh, extack);
 	rtnl_unlock();
+
+	return err;
+
+out_unlock:
+	rcu_read_unlock();
 	return err;
 
 err_unlock:
-- 
cgit v1.2.3


From e4202511480da5f8e6870d8f6ecbb821aeaa8caf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:06 +0100
Subject: rtnetlink: get reference on module before invoking handlers

Add yet another rtnl_register function.  It will be used by modules
that can be removed.

The passed module struct is used to prevent module unload while
a netlink dump is in progress or while a DOIT_UNLOCKED doit callback
is running.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h |   2 +
 net/core/rtnetlink.c    | 113 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 80 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index ead018744ff5..e326b3f9eb5f 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -17,6 +17,8 @@ int __rtnl_register(int protocol, int msgtype,
 		    rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
+int rtnl_register_module(struct module *owner, int protocol, int msgtype,
+			 rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
 int rtnl_unregister(int protocol, int msgtype);
 void rtnl_unregister_all(int protocol);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ff292d3f2c41..de6390365c90 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -62,6 +62,7 @@
 struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
+	struct module		*owner;
 	unsigned int		flags;
 	struct rcu_head		rcu;
 };
@@ -129,7 +130,6 @@ EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
 static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
 {
@@ -159,27 +159,10 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
 	return tab[msgtype];
 }
 
-/**
- * __rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
-		    unsigned int flags)
+static int rtnl_register_internal(struct module *owner,
+				  int protocol, int msgtype,
+				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+				  unsigned int flags)
 {
 	struct rtnl_link **tab, *link, *old;
 	int msgindex;
@@ -210,6 +193,9 @@ int __rtnl_register(int protocol, int msgtype,
 			goto unlock;
 	}
 
+	WARN_ON(link->owner && link->owner != owner);
+	link->owner = owner;
+
 	WARN_ON(doit && link->doit && link->doit != doit);
 	if (doit)
 		link->doit = doit;
@@ -228,6 +214,54 @@ unlock:
 	rtnl_unlock();
 	return ret;
 }
+
+/**
+ * rtnl_register_module - Register a rtnetlink message type
+ *
+ * @owner: module registering the hook (THIS_MODULE)
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Like rtnl_register, but for use by removable modules.
+ */
+int rtnl_register_module(struct module *owner,
+			 int protocol, int msgtype,
+			 rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+			 unsigned int flags)
+{
+	return rtnl_register_internal(owner, protocol, msgtype,
+				      doit, dumpit, flags);
+}
+EXPORT_SYMBOL_GPL(rtnl_register_module);
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_register(int protocol, int msgtype,
+		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+		    unsigned int flags)
+{
+	return rtnl_register_internal(NULL, protocol, msgtype,
+				      doit, dumpit, flags);
+}
 EXPORT_SYMBOL_GPL(__rtnl_register);
 
 /**
@@ -311,8 +345,6 @@ void rtnl_unregister_all(int protocol)
 
 	synchronize_net();
 
-	while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
-		schedule();
 	kfree(tab);
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
@@ -4372,6 +4404,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct rtnl_link *link;
+	struct module *owner;
 	int err = -EOPNOTSUPP;
 	rtnl_doit_func doit;
 	unsigned int flags;
@@ -4408,24 +4441,32 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (!link || !link->dumpit)
 				goto err_unlock;
 		}
+		owner = link->owner;
 		dumpit = link->dumpit;
 
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
-
 		if (type == RTM_GETLINK - RTM_BASE)
 			min_dump_alloc = rtnl_calcit(skb, nlh);
 
+		err = 0;
+		/* need to do this before rcu_read_unlock() */
+		if (!try_module_get(owner))
+			err = -EPROTONOSUPPORT;
+
 		rcu_read_unlock();
 
 		rtnl = net->rtnl;
-		{
+		if (err == 0) {
 			struct netlink_dump_control c = {
 				.dump		= dumpit,
 				.min_dump_alloc	= min_dump_alloc,
+				.module		= owner,
 			};
 			err = netlink_dump_start(rtnl, skb, nlh, &c);
+			/* netlink_dump_start() will keep a reference on
+			 * module if dump is still in progress.
+			 */
+			module_put(owner);
 		}
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
 		return err;
 	}
 
@@ -4437,14 +4478,19 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			goto out_unlock;
 	}
 
+	owner = link->owner;
+	if (!try_module_get(owner)) {
+		err = -EPROTONOSUPPORT;
+		goto out_unlock;
+	}
+
 	flags = link->flags;
 	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
 		doit = link->doit;
 		rcu_read_unlock();
 		if (doit)
 			err = doit(skb, nlh, extack);
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
+		module_put(owner);
 		return err;
 	}
 	rcu_read_unlock();
@@ -4455,6 +4501,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		err = link->doit(skb, nlh, extack);
 	rtnl_unlock();
 
+	module_put(owner);
+
 	return err;
 
 out_unlock:
@@ -4546,11 +4594,6 @@ static struct pernet_operations rtnetlink_net_ops = {
 
 void __init rtnetlink_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++)
-		refcount_set(&rtnl_msg_handlers_ref[i], 1);
-
 	if (register_pernet_subsys(&rtnetlink_net_ops))
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
 
-- 
cgit v1.2.3


From c1c502b511503ee5de55382744859b622411f32b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:07 +0100
Subject: net: use rtnl_register_module where needed

All of these can be compiled as modules, so use the new
rtnl_register_module() variant to make sure the module can no longer
be removed while a callback or dump is in use.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c     |  6 +++---
 net/can/gw.c            | 14 ++++++++++----
 net/decnet/dn_dev.c     |  9 ++++++---
 net/decnet/dn_fib.c     |  6 ++++--
 net/decnet/dn_route.c   |  8 ++++----
 net/mpls/af_mpls.c      | 15 +++++++++------
 net/phonet/pn_netlink.c | 21 +++++++++++++--------
 net/qrtr/qrtr.c         |  8 ++++++--
 8 files changed, 55 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index b0f4c734900b..6d9f48bd374a 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -760,9 +760,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 void br_mdb_init(void)
 {
-	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
-	rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
-	rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
 }
 
 void br_mdb_uninit(void)
diff --git a/net/can/gw.c b/net/can/gw.c
index 73a02af4b5d7..398dd0395ad9 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1014,6 +1014,8 @@ static struct pernet_operations cangw_pernet_ops = {
 
 static __init int cgw_module_init(void)
 {
+	int ret;
+
 	/* sanitize given module parameter */
 	max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
 
@@ -1031,15 +1033,19 @@ static __init int cgw_module_init(void)
 	notifier.notifier_call = cgw_notifier;
 	register_netdevice_notifier(&notifier);
 
-	if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) {
+	ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE,
+				   NULL, cgw_dump_jobs, 0);
+	if (ret) {
 		unregister_netdevice_notifier(&notifier);
 		kmem_cache_destroy(cgw_cache);
 		return -ENOBUFS;
 	}
 
-	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0);
-	__rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0);
+	/* Only the first call to rtnl_register_module can fail */
+	rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE,
+			     cgw_create_job, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE,
+			     cgw_remove_job, NULL, 0);
 
 	return 0;
 }
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 9153247dad28..d1885cf59319 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1418,9 +1418,12 @@ void __init dn_dev_init(void)
 
 	dn_dev_devices_on();
 
-	rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR,
+			     dn_nl_newaddr, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR,
+			     dn_nl_deladdr, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
+			     NULL, dn_nl_dump_ifaddr, 0);
 
 	proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops);
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index b37a1b833c77..fce94cbd4378 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -792,8 +792,10 @@ void __init dn_fib_init(void)
 
 	register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
 
-	rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE,
+			     dn_fib_rtm_newroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
+			     dn_fib_rtm_delroute, NULL, 0);
 }
 
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 4b3ca70be723..73160d4aebbe 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1923,11 +1923,11 @@ void __init dn_route_init(void)
 		    &dn_rt_cache_seq_fops);
 
 #ifdef CONFIG_DECNET_ROUTER
-	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
-		      dn_fib_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+			     dn_cache_getroute, dn_fib_dump, 0);
 #else
-	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
-		      dn_cache_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+			     dn_cache_getroute, dn_cache_dump, 0);
 #endif
 }
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8ca9915befc8..5dce8336d33f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2510,12 +2510,15 @@ static int __init mpls_init(void)
 
 	rtnl_af_register(&mpls_af_ops);
 
-	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0);
-	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0);
-	rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes,
-		      0);
-	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
-		      mpls_netconf_dump_devconf, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE,
+			     mpls_rtm_newroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE,
+			     mpls_rtm_delroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE,
+			     mpls_getroute, mpls_dump_routes, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF,
+			     mpls_netconf_get_devconf,
+			     mpls_netconf_dump_devconf, 0);
 	err = ipgre_tunnel_encap_add_mpls_ops();
 	if (err)
 		pr_err("Can't add mpls over gre tunnel ops\n");
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index da754fc926e7..871eaf2cb85e 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -299,16 +299,21 @@ out:
 
 int __init phonet_netlink_register(void)
 {
-	int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit,
-				  NULL, 0);
+	int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR,
+				       addr_doit, NULL, 0);
 	if (err)
 		return err;
 
-	/* Further __rtnl_register() cannot fail */
-	__rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0);
-	__rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0);
+	/* Further rtnl_register_module() cannot fail */
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR,
+			     addr_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR,
+			     NULL, getaddr_dumpit, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE,
+			     route_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE,
+			     route_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE,
+			     NULL, route_dumpit, 0);
 	return 0;
 }
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 77ab05e23001..5fb3929e3d7d 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1116,9 +1116,13 @@ static int __init qrtr_proto_init(void)
 		return rc;
 	}
 
-	rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+	rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+	if (rc) {
+		sock_unregister(qrtr_family.family);
+		proto_unregister(&qrtr_proto);
+	}
 
-	return 0;
+	return rc;
 }
 postcore_initcall(qrtr_proto_init);
 
-- 
cgit v1.2.3


From 16feebcf2350aa369001a529f50ce33f2472c01c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:08 +0100
Subject: rtnetlink: remove __rtnl_register

This removes __rtnl_register and switches callers to either
rtnl_register or rtnl_register_module.

Also, rtnl_register() will now print an error if memory allocation
failed rather than panic the kernel.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h |  2 --
 net/core/rtnetlink.c    | 33 ++++++++-------------------------
 net/ipv6/addrconf.c     | 44 ++++++++++++++++++++++++++++++--------------
 net/ipv6/addrlabel.c    | 13 ++++++-------
 net/ipv6/ip6_fib.c      |  4 ++--
 net/ipv6/route.c        | 20 +++++++++++++++-----
 6 files changed, 61 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index e326b3f9eb5f..14b6b3af8918 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -13,8 +13,6 @@ enum rtnl_link_flags {
 	RTNL_FLAG_DOIT_UNLOCKED = 1,
 };
 
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
 int rtnl_register_module(struct module *owner, int protocol, int msgtype,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index de6390365c90..fb2d61df1e2f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -238,7 +238,7 @@ int rtnl_register_module(struct module *owner,
 EXPORT_SYMBOL_GPL(rtnl_register_module);
 
 /**
- * __rtnl_register - Register a rtnetlink message type
+ * rtnl_register - Register a rtnetlink message type
  * @protocol: Protocol family or PF_UNSPEC
  * @msgtype: rtnetlink message type
  * @doit: Function pointer called for each request message
@@ -252,35 +252,18 @@ EXPORT_SYMBOL_GPL(rtnl_register_module);
  * The special protocol family PF_UNSPEC may be used to define fallback
  * function pointers for the case when no entry for the specific protocol
  * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
-		    unsigned int flags)
-{
-	return rtnl_register_internal(NULL, protocol, msgtype,
-				      doit, dumpit, flags);
-}
-EXPORT_SYMBOL_GPL(__rtnl_register);
-
-/**
- * rtnl_register - Register a rtnetlink message type
- *
- * Identical to __rtnl_register() but panics on failure. This is useful
- * as failure of this function is very unlikely, it can only happen due
- * to lack of memory when allocating the chain to store all message
- * handlers for a protocol. Meant for use in init functions where lack
- * of memory implies no sense in continuing.
  */
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 		   unsigned int flags)
 {
-	if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0)
-		panic("Unable to register rtnetlink message handler, "
-		      "protocol = %d, message type = %d\n",
-		      protocol, msgtype);
+	int err;
+
+	err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
+				     flags);
+	if (err)
+		pr_err("Unable to register rtnetlink message handler, "
+		       "protocol = %d, message type = %d\n", protocol, msgtype);
 }
 EXPORT_SYMBOL_GPL(rtnl_register);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f49bd7897e95..a5ad8425551a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6595,27 +6595,43 @@ int __init addrconf_init(void)
 
 	rtnl_af_register(&inet6_ops);
 
-	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
-			      0);
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK,
+				   NULL, inet6_dump_ifinfo, 0);
 	if (err < 0)
 		goto errout;
 
-	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
-	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
-	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
-			inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
-			inet6_dump_ifmcaddr, 0);
-	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
-			inet6_dump_ifacaddr, 0);
-	__rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
-			inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED);
-
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR,
+				   inet6_rtm_newaddr, NULL, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR,
+				   inet6_rtm_deladdr, NULL, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR,
+				   inet6_rtm_getaddr, inet6_dump_ifaddr,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST,
+				   NULL, inet6_dump_ifmcaddr, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
+				   NULL, inet6_dump_ifacaddr, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF,
+				   inet6_netconf_get_devconf,
+				   inet6_netconf_dump_devconf,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (err < 0)
+		goto errout;
 	ipv6_addr_label_rtnl_register();
 
 	return 0;
 errout:
+	rtnl_unregister_all(PF_INET6);
 	rtnl_af_unregister(&inet6_ops);
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 errlo:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 00e1f8ee08f8..303fcce5beef 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -549,11 +549,10 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 void __init ipv6_addr_label_rtnl_register(void)
 {
-	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
-			NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
-			NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
-			ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
+	rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
+		      NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
+		      NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
+		      ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
 }
-
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c43cbaedfa35..a64d559fa513 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2142,8 +2142,8 @@ int __init fib6_init(void)
 	if (ret)
 		goto out_kmem_cache_create;
 
-	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
-			      0);
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
+				   inet6_dump_fib, 0);
 	if (ret)
 		goto out_unregister_subsys;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 46fd53b268da..b3f4d19b3ca5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4772,11 +4772,20 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto fib6_rules_init;
 
-	ret = -ENOBUFS;
-	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
-	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
-	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
-			    RTNL_FLAG_DOIT_UNLOCKED))
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
+				   inet6_rtm_newroute, NULL, 0);
+	if (ret < 0)
+		goto out_register_late_subsys;
+
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
+				   inet6_rtm_delroute, NULL, 0);
+	if (ret < 0)
+		goto out_register_late_subsys;
+
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
+				   inet6_rtm_getroute, NULL,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
 		goto out_register_late_subsys;
 
 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
@@ -4794,6 +4803,7 @@ out:
 	return ret;
 
 out_register_late_subsys:
+	rtnl_unregister_all(PF_INET6);
 	unregister_pernet_subsys(&ip6_route_net_late_ops);
 fib6_rules_init:
 	fib6_rules_cleanup();
-- 
cgit v1.2.3


From a3fde2addd5f0218b64102005a237ef727b0dc30 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Dec 2017 19:19:18 +0100
Subject: rtnetlink: ipv6: convert remaining users to rtnl_register_module

Convert the remaining users of rtnl_register to rtnl_register_module
and un-export rtnl_register.

Requested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  2 +-
 net/core/rtnetlink.c   |  1 -
 net/ipv6/addrconf.c    |  4 +++-
 net/ipv6/addrlabel.c   | 24 +++++++++++++++++-------
 net/ipv6/ip6mr.c       |  9 ++++++---
 5 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index b623b65a79d1..c4185a7b0e90 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -180,7 +180,7 @@ static inline int addrconf_finite_timeout(unsigned long timeout)
  */
 int ipv6_addr_label_init(void);
 void ipv6_addr_label_cleanup(void);
-void ipv6_addr_label_rtnl_register(void);
+int ipv6_addr_label_rtnl_register(void);
 u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
 		    int type, int ifindex);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a55d0c236b40..642b3afb12b9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -265,7 +265,6 @@ void rtnl_register(int protocol, int msgtype,
 		pr_err("Unable to register rtnetlink message handler, "
 		       "protocol = %d, message type = %d\n", protocol, msgtype);
 }
-EXPORT_SYMBOL_GPL(rtnl_register);
 
 /**
  * rtnl_unregister - Unregister a rtnetlink message type
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a5ad8425551a..ed06b1190f05 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6627,7 +6627,9 @@ int __init addrconf_init(void)
 				   RTNL_FLAG_DOIT_UNLOCKED);
 	if (err < 0)
 		goto errout;
-	ipv6_addr_label_rtnl_register();
+	err = ipv6_addr_label_rtnl_register();
+	if (err < 0)
+		goto errout;
 
 	return 0;
 errout:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 303fcce5beef..1d6ced37ad71 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -547,12 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-void __init ipv6_addr_label_rtnl_register(void)
+int __init ipv6_addr_label_rtnl_register(void)
 {
-	rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
-		      NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
-		      NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
-		      ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
+	int ret;
+
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL,
+				   ip6addrlbl_newdel,
+				   NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
+		return ret;
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL,
+				   ip6addrlbl_newdel,
+				   NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
+		return ret;
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL,
+				   ip6addrlbl_get,
+				   ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
+	return ret;
 }
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a2e1a864eb46..890f9bda06a4 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1425,10 +1425,13 @@ int __init ip6_mr_init(void)
 		goto add_proto_fail;
 	}
 #endif
-	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
-		      ip6mr_rtm_dumproute, 0);
-	return 0;
+	err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE,
+				   NULL, ip6mr_rtm_dumproute, 0);
+	if (err == 0)
+		return 0;
+
 #ifdef CONFIG_IPV6_PIMSM_V2
+	inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
 add_proto_fail:
 	unregister_netdevice_notifier(&ip6_mr_notifier);
 #endif
-- 
cgit v1.2.3


From 792f3dd6f02b25afc0b42adce2efd523e81c3fb1 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Mon, 4 Dec 2017 14:18:29 -0800
Subject: bpf: move bpf csum flag check

Trivially move the BPF_F_ZERO_CSUM_TX check right below the
'flags & BPF_F_DONT_FRAGMENT' check, so that the common tun_flags
handling is logically grouped together.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 8ec5a504eb28..4d644ad17457 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
 	if (flags & BPF_F_DONT_FRAGMENT)
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+	if (flags & BPF_F_ZERO_CSUM_TX)
+		info->key.tun_flags &= ~TUNNEL_CSUM;
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
@@ -3028,9 +3030,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 	}
 
-	if (flags & BPF_F_ZERO_CSUM_TX)
-		info->key.tun_flags &= ~TUNNEL_CSUM;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 1 Dec 2017 10:15:04 -0800
Subject: bpf: Add access to snd_cwnd and others in sock_ops

Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
fields are only valid if the socket associated with the sock_ops program
call is a full socket, the field is_fullsock is also added to the
bpf_sock_ops struct. If the socket is not a full socket, reading these
fields returns 0.

Note that in most cases it will not be necessary to check is_fullsock to
know if there is a full socket. The context of the call, as specified by
the 'op' field, can sometimes determine whether there is a full socket.

The struct bpf_sock_ops has the following fields added:

  __u32 is_fullsock;      /* Some TCP fields are only valid if
                           * there is a full socket. If not, the
                           * fields read as zero.
			   */
  __u32 snd_cwnd;
  __u32 srtt_us;          /* Averaged RTT << 3 in usecs */

There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
read access to more 32 bit tcp_sock fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        |  6 ++++--
 include/uapi/linux/bpf.h |  6 ++++++
 net/core/filter.c        | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 80b5b482cb46..0062302e1285 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -985,6 +985,7 @@ struct bpf_sock_ops_kern {
 		u32 reply;
 		u32 replylong[4];
 	};
+	u32	is_fullsock;
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4e09398009c1..89a656077884 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	if (sk_fullsock(sk))
+	memset(&sock_ops, 0, sizeof(sock_ops));
+	if (sk_fullsock(sk)) {
+		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
+	}
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
 	sock_ops.sk = sk;
 	sock_ops.op = op;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4c223ab30293..80d62e88590c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -941,6 +941,12 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 4d644ad17457..754abe1041b7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, is_fullsock):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	do {								      \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+				      offsetof(struct tcp_sock, FIELD_NAME)); \
+	} while (0)
+
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP32(snd_cwnd);
+		break;
+
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP32(srtt_us);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From b0e9fe1ba7f8305dc1c640fbeb1b8c5c609e604c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Dec 2017 22:42:30 +0100
Subject: rtnetlink: fix rtnl_link msghandler rcu annotations

Incorrect/missing annotations caused a few sparse warnings:

rtnetlink.c:155:15: incompatible types .. (different address spaces)
rtnetlink.c:157:23: incompatible types .. (different address spaces)
rtnetlink.c:185:15: incompatible types .. (different address spaces)
rtnetlink.c:285:15: incompatible types .. (different address spaces)
rtnetlink.c:317:9: incompatible types .. (different address spaces)
rtnetlink.c:3054:23: incompatible types .. (different address spaces)

no change in generated code.

Fixes: addf9b90de22f7 ("net: rtnetlink: use rcu to free rtnl message handlers")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 642b3afb12b9..a4faefd65006 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -129,7 +129,7 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
-static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
 {
@@ -164,7 +164,8 @@ static int rtnl_register_internal(struct module *owner,
 				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 				  unsigned int flags)
 {
-	struct rtnl_link **tab, *link, *old;
+	struct rtnl_link *link, *old;
+	struct rtnl_link __rcu **tab;
 	int msgindex;
 	int ret = -ENOBUFS;
 
-- 
cgit v1.2.3


From 62b32379fd124fea521484ba7e220d8a449f0b59 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Mon, 4 Dec 2017 11:31:48 +0100
Subject: flow_dissector: dissect tunnel info outside __skb_flow_dissect()

Move dissection of tunnel info to outside of the main flow dissection
function, __skb_flow_dissect(). The sole user of this feature, the flower
classifier, is updated to call tunnel info dissection directly, using
skb_flow_dissect_tunnel_info().

This results in a slightly less complex implementation of
__skb_flow_dissect(), in particular removing logic from that call path
which is not used by the majority of users. The expense of this is borne by
the flower classifier which now has to make an extra call for tunnel info
dissection.

This patch should not result in any behavioural change.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  5 +++++
 net/core/flow_dissector.c | 12 +++++-------
 net/sched/cls_flower.c    |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a38c80e9f91e..b8e0da6c27d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1211,6 +1211,11 @@ static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
 				  data, proto, nhoff, hlen, flags);
 }
 
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+			     struct flow_dissector *flow_dissector,
+			     void *target_container);
+
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 15ce30063765..cc75488d3653 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 	ctrl->addr_type = type;
 }
 
-static void
-__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
-			       struct flow_dissector *flow_dissector,
-			       void *target_container)
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+			     struct flow_dissector *flow_dissector,
+			     void *target_container)
 {
 	struct ip_tunnel_info *info;
 	struct ip_tunnel_key *key;
@@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 		tp->dst = key->tp_dst;
 	}
 }
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
 
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
@@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	__skb_flow_dissect_tunnel_info(skb, flow_dissector,
-				       target_container);
-
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 543a3e875d05..6132a7317efa 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -166,6 +166,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 * so do it rather here.
 	 */
 	skb_key.basic.n_proto = skb->protocol;
+	skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
 	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
-- 
cgit v1.2.3


From efbf78973978b0d25af59bc26c8013a942af6e64 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 4 Dec 2017 10:48:18 -0800
Subject: net_sched: get rid of rcu_barrier() in tcf_block_put_ext()

Both Eric and Paolo noticed the rcu_barrier() we use in
tcf_block_put_ext() could be a performance bottleneck when
we have a lot of tc classes.

Paolo provided the following to demonstrate the issue:

tc qdisc add dev lo root htb
for I in `seq 1 1000`; do
        tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
        tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
        for J in `seq 1 10`; do
                tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
        done
done
time tc qdisc del dev lo root

real    0m54.764s
user    0m0.023s
sys     0m0.000s

The rcu_barrier() there is to ensure we free the block after all chains
are gone, that is, to queue tcf_block_put_final() at the tail of workqueue.
We can achieve this ordering requirement by refcnt'ing tcf block instead,
that is, the tcf block is freed only when the last chain in this block is
gone. This also simplifies the code.

Paolo reported after this patch we get:

real    0m0.017s
user    0m0.000s
sys     0m0.017s

Tested-by: Paolo Abeni <pabeni@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  1 -
 net/sched/cls_api.c       | 30 +++++++++---------------------
 2 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..02e7ad8b8dad 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -278,7 +278,6 @@ struct tcf_block {
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
-	struct work_struct work;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ddcf04b4ab43..d51051dd8f7d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -218,8 +218,12 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 
 static void tcf_chain_destroy(struct tcf_chain *chain)
 {
+	struct tcf_block *block = chain->block;
+
 	list_del(&chain->list);
 	kfree(chain);
+	if (list_empty(&block->chain_list))
+		kfree(block);
 }
 
 static void tcf_chain_hold(struct tcf_chain *chain)
@@ -330,27 +334,13 @@ int tcf_block_get(struct tcf_block **p_block,
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-static void tcf_block_put_final(struct work_struct *work)
-{
-	struct tcf_block *block = container_of(work, struct tcf_block, work);
-	struct tcf_chain *chain, *tmp;
-
-	rtnl_lock();
-
-	/* At this point, all the chains should have refcnt == 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	rtnl_unlock();
-	kfree(block);
-}
-
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
  * actions should be all removed after flushing.
  */
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei)
 {
-	struct tcf_chain *chain;
+	struct tcf_chain *chain, *tmp;
 
 	/* Hold a refcnt for all chains, except 0, so that they don't disappear
 	 * while we are iterating.
@@ -364,13 +354,11 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	INIT_WORK(&block->work, tcf_block_put_final);
-	/* Wait for existing RCU callbacks to cool down, make sure their works
-	 * have been queued before this. We can not flush pending works here
-	 * because we are holding the RTNL lock.
+	/* At this point, all the chains should have refcnt >= 1. Block will be
+	 * freed after all chains are gone.
 	 */
-	rcu_barrier();
-	tcf_queue_work(&block->work);
+	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		tcf_chain_put(chain);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
-- 
cgit v1.2.3


From 0ac4bd68ab50a9f0860b10caacc1285fda5da0ca Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Mon, 4 Dec 2017 18:39:59 -0500
Subject: net: sched: sch_api: fix code style issues

This patch fixes checkpatch issues in the sched api file in preparation
for upcoming patches. It changes the NULL pointer checks, removes
unnecessary brackets, adds variable names for parameters and adjusts
lines to the 80 char width.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 10 ++++++----
 net/sched/sch_api.c       | 11 ++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 02e7ad8b8dad..7dd8b0b0d244 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -161,7 +161,8 @@ struct Qdisc_class_ops {
 	void			(*walk)(struct Qdisc *, struct qdisc_walker * arg);
 
 	/* Filter manipulation */
-	struct tcf_block *	(*tcf_block)(struct Qdisc *, unsigned long);
+	struct tcf_block *	(*tcf_block)(struct Qdisc *sch,
+					     unsigned long arg);
 	unsigned long		(*bind_tcf)(struct Qdisc *, unsigned long,
 					u32 classid);
 	void			(*unbind_tcf)(struct Qdisc *, unsigned long);
@@ -185,11 +186,12 @@ struct Qdisc_ops {
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
 
-	int			(*init)(struct Qdisc *, struct nlattr *arg);
+	int			(*init)(struct Qdisc *sch, struct nlattr *arg);
 	void			(*reset)(struct Qdisc *);
 	void			(*destroy)(struct Qdisc *);
-	int			(*change)(struct Qdisc *, struct nlattr *arg);
-	void			(*attach)(struct Qdisc *);
+	int			(*change)(struct Qdisc *sch,
+					  struct nlattr *arg);
+	void			(*attach)(struct Qdisc *sch);
 
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
 	int			(*dump_stats)(struct Qdisc *, struct gnet_dump *);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..8f7f5378cc33 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1020,7 +1020,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 #endif
 
 	err = -ENOENT;
-	if (ops == NULL)
+	if (!ops)
 		goto err_out;
 
 	sch = qdisc_alloc(dev_queue, ops);
@@ -1087,7 +1087,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 			if (sch->flags & TCQ_F_MQROOT)
 				goto err_out4;
 
-			if ((sch->parent != TC_H_ROOT) &&
+			if (sch->parent != TC_H_ROOT &&
 			    !(sch->flags & TCQ_F_INGRESS) &&
 			    (!p || !(p->flags & TCQ_F_MQROOT)))
 				running = qdisc_root_sleeping_running(sch);
@@ -1139,7 +1139,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 	int err = 0;
 
 	if (tca[TCA_OPTIONS]) {
-		if (sch->ops->change == NULL)
+		if (!sch->ops->change)
 			return -EINVAL;
 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 		if (err)
@@ -1344,7 +1344,8 @@ replay:
 					goto create_n_graft;
 				if (n->nlmsg_flags & NLM_F_EXCL)
 					return -EEXIST;
-				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+				if (tca[TCA_KIND] &&
+				    nla_strcmp(tca[TCA_KIND], q->ops->id))
 					return -EINVAL;
 				if (q == p ||
 				    (p && check_loop(q, p, 0)))
@@ -1389,7 +1390,7 @@ replay:
 	}
 
 	/* Change qdisc parameters */
-	if (q == NULL)
+	if (!q)
 		return -ENOENT;
 	if (n->nlmsg_flags & NLM_F_EXCL)
 		return -EEXIST;
-- 
cgit v1.2.3


From 54160ef6ec64e5a27b8f4ab4105ae81a57064dca Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Mon, 4 Dec 2017 18:40:00 -0500
Subject: net: sched: sch_api: rearrange init handling

This patch fixes the following checkpatch error:

ERROR: do not use assignment in if condition

by rearranging the if condition to execute the init callback only if an
init callback exists. The setup that follows is run in either case,
regardless of whether an init callback is set. This patch has the same
behaviour as before, just without assigning the err variable in the if
condition. It also makes the code easier to read.

Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 88 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 8f7f5378cc33..a48ca41b7ecf 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1060,54 +1060,60 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
 	}
 
-	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
-		if (qdisc_is_percpu_stats(sch)) {
-			sch->cpu_bstats =
-				netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
-			if (!sch->cpu_bstats)
-				goto err_out4;
-
-			sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-			if (!sch->cpu_qstats)
-				goto err_out4;
-		}
+	if (ops->init) {
+		err = ops->init(sch, tca[TCA_OPTIONS]);
+		if (err != 0)
+			goto err_out5;
+	}
 
-		if (tca[TCA_STAB]) {
-			stab = qdisc_get_stab(tca[TCA_STAB]);
-			if (IS_ERR(stab)) {
-				err = PTR_ERR(stab);
-				goto err_out4;
-			}
-			rcu_assign_pointer(sch->stab, stab);
-		}
-		if (tca[TCA_RATE]) {
-			seqcount_t *running;
+	if (qdisc_is_percpu_stats(sch)) {
+		sch->cpu_bstats =
+			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!sch->cpu_bstats)
+			goto err_out4;
 
-			err = -EOPNOTSUPP;
-			if (sch->flags & TCQ_F_MQROOT)
-				goto err_out4;
+		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!sch->cpu_qstats)
+			goto err_out4;
+	}
 
-			if (sch->parent != TC_H_ROOT &&
-			    !(sch->flags & TCQ_F_INGRESS) &&
-			    (!p || !(p->flags & TCQ_F_MQROOT)))
-				running = qdisc_root_sleeping_running(sch);
-			else
-				running = &sch->running;
-
-			err = gen_new_estimator(&sch->bstats,
-						sch->cpu_bstats,
-						&sch->rate_est,
-						NULL,
-						running,
-						tca[TCA_RATE]);
-			if (err)
-				goto err_out4;
+	if (tca[TCA_STAB]) {
+		stab = qdisc_get_stab(tca[TCA_STAB]);
+		if (IS_ERR(stab)) {
+			err = PTR_ERR(stab);
+			goto err_out4;
 		}
+		rcu_assign_pointer(sch->stab, stab);
+	}
+	if (tca[TCA_RATE]) {
+		seqcount_t *running;
 
-		qdisc_hash_add(sch, false);
+		err = -EOPNOTSUPP;
+		if (sch->flags & TCQ_F_MQROOT)
+			goto err_out4;
 
-		return sch;
+		if (sch->parent != TC_H_ROOT &&
+		    !(sch->flags & TCQ_F_INGRESS) &&
+		    (!p || !(p->flags & TCQ_F_MQROOT)))
+			running = qdisc_root_sleeping_running(sch);
+		else
+			running = &sch->running;
+
+		err = gen_new_estimator(&sch->bstats,
+					sch->cpu_bstats,
+					&sch->rate_est,
+					NULL,
+					running,
+					tca[TCA_RATE]);
+		if (err)
+			goto err_out4;
 	}
+
+	qdisc_hash_add(sch, false);
+
+	return sch;
+
+err_out5:
 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
 	if (ops->destroy)
 		ops->destroy(sch);
-- 
cgit v1.2.3


From 986d7ccf7e4ba2bc0c0929acfe54b6d41dec0805 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 5 Dec 2017 15:34:12 -0500
Subject: net: dsa: assign a CPU port to DSA port

DSA ports also need to have a dedicated CPU port assigned to them,
because they need to know where to egress frames targeting the CPU,
e.g. To_Cpu frames received on a Marvell Tag port.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 1e287420ff49..21f9bed11988 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -241,7 +241,7 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
 		for (port = 0; port < ds->num_ports; port++) {
 			dp = &ds->ports[port];
 
-			if (dsa_port_is_user(dp))
+			if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
 				dp->cpu_dp = dst->cpu_dp;
 		}
 	}
-- 
cgit v1.2.3


From 9a63b255dffd6de31fe47a80d16d26d0291d3714 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 5 Dec 2017 12:53:07 -0800
Subject: net_sched: remove unused parameter from act cleanup ops

No one actually uses it.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      |  2 +-
 net/sched/act_api.c        |  2 +-
 net/sched/act_bpf.c        |  2 +-
 net/sched/act_ife.c        | 10 +++++-----
 net/sched/act_ipt.c        |  2 +-
 net/sched/act_mirred.c     |  2 +-
 net/sched/act_pedit.c      |  2 +-
 net/sched/act_sample.c     |  2 +-
 net/sched/act_simple.c     |  2 +-
 net/sched/act_skbmod.c     |  2 +-
 net/sched/act_tunnel_key.c |  2 +-
 net/sched/act_vlan.c       |  2 +-
 12 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index fd08df74c466..02bf409140d0 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -86,7 +86,7 @@ struct tc_action_ops {
 	int     (*act)(struct sk_buff *, const struct tc_action *,
 		       struct tcf_result *);
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
-	void	(*cleanup)(struct tc_action *, int bind);
+	void	(*cleanup)(struct tc_action *);
 	int     (*lookup)(struct net *, struct tc_action **, u32);
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act, int ovr,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..52622a3d2517 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -99,7 +99,7 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
 		p->tcfa_refcnt--;
 		if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
 			if (p->ops->cleanup)
-				p->ops->cleanup(p, bind);
+				p->ops->cleanup(p);
 			tcf_idr_remove(p->idrinfo, p);
 			ret = ACT_P_DELETED;
 		}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 5ef8ce8c83d4..e6c477fa9ca5 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -357,7 +357,7 @@ out:
 	return ret;
 }
 
-static void tcf_bpf_cleanup(struct tc_action *act, int bind)
+static void tcf_bpf_cleanup(struct tc_action *act)
 {
 	struct tcf_bpf_cfg tmp;
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 3007cb1310ea..dee9cf22686c 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -387,7 +387,7 @@ out_nlmsg_trim:
 }
 
 /* under ife->tcf_lock */
-static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+static void _tcf_ife_cleanup(struct tc_action *a)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_meta_info *e, *n;
@@ -405,13 +405,13 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind)
 	}
 }
 
-static void tcf_ife_cleanup(struct tc_action *a, int bind)
+static void tcf_ife_cleanup(struct tc_action *a)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_ife_params *p;
 
 	spin_lock_bh(&ife->tcf_lock);
-	_tcf_ife_cleanup(a, bind);
+	_tcf_ife_cleanup(a);
 	spin_unlock_bh(&ife->tcf_lock);
 
 	p = rcu_dereference_protected(ife->params, 1);
@@ -546,7 +546,7 @@ metadata_parse_err:
 			if (exists)
 				tcf_idr_release(*a, bind);
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(*a, bind);
+				_tcf_ife_cleanup(*a);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
@@ -567,7 +567,7 @@ metadata_parse_err:
 		err = use_all_metadata(ife);
 		if (err) {
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(*a, bind);
+				_tcf_ife_cleanup(*a);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d9e399a7e3d5..2479b255dc1d 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -77,7 +77,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
 	module_put(par.target->me);
 }
 
-static void tcf_ipt_release(struct tc_action *a, int bind)
+static void tcf_ipt_release(struct tc_action *a)
 {
 	struct tcf_ipt *ipt = to_ipt(a);
 	ipt_destroy_target(ipt->tcfi_t);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e59388480..590f56afb985 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -50,7 +50,7 @@ static bool tcf_mirred_act_wants_ingress(int action)
 	}
 }
 
-static void tcf_mirred_release(struct tc_action *a, int bind)
+static void tcf_mirred_release(struct tc_action *a)
 {
 	struct tcf_mirred *m = to_mirred(a);
 	struct net_device *dev;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 491fe5deb09e..dba996bcd6dc 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -216,7 +216,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_pedit_cleanup(struct tc_action *a, int bind)
+static void tcf_pedit_cleanup(struct tc_action *a)
 {
 	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 9438969290a6..859a93903339 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -96,7 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_sample_cleanup(struct tc_action *a, int bind)
+static void tcf_sample_cleanup(struct tc_action *a)
 {
 	struct tcf_sample *s = to_sample(a);
 	struct psample_group *psample_group;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index e7b57e5071a3..eda57b47a6b6 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 	return d->tcf_action;
 }
 
-static void tcf_simp_release(struct tc_action *a, int bind)
+static void tcf_simp_release(struct tc_action *a)
 {
 	struct tcf_defact *d = to_defact(a);
 	kfree(d->tcfd_defdata);
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index b642ad3d39dd..f090bba1a79e 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -184,7 +184,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+static void tcf_skbmod_cleanup(struct tc_action *a)
 {
 	struct tcf_skbmod *d = to_skbmod(a);
 	struct tcf_skbmod_params  *p;
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 30c96274c638..57b63bdec3ae 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -201,7 +201,7 @@ err_out:
 	return ret;
 }
 
-static void tunnel_key_release(struct tc_action *a, int bind)
+static void tunnel_key_release(struct tc_action *a)
 {
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 97f717a13ad5..41f0878ad26e 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -219,7 +219,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_vlan_cleanup(struct tc_action *a, int bind)
+static void tcf_vlan_cleanup(struct tc_action *a)
 {
 	struct tcf_vlan *v = to_vlan(a);
 	struct tcf_vlan_params *p;
-- 
cgit v1.2.3


From ef7baf5e083c09b66af500331cbb2be0dae37468 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Tue, 5 Dec 2017 15:15:44 -0800
Subject: ip6_gre: add ip6 erspan collect_md mode

Similar to ip6 gretap and ip4 gretap, this patch allows the
ip6 erspan tunnel to operate in collect metadata mode.
bpf_skb_[gs]et_tunnel_key() helpers can make use of
it right away.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 110 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 85 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 1510ce9a4e4e..4562579797d1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -524,8 +524,37 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 					   false, false) < 0)
 			return PACKET_REJECT;
 
-		tunnel->parms.index = ntohl(index);
-		ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->parms.collect_md) {
+			struct metadata_dst *tun_dst;
+			struct ip_tunnel_info *info;
+			struct erspan_metadata *md;
+			__be64 tun_id;
+			__be16 flags;
+
+			tpi->flags |= TUNNEL_KEY;
+			flags = tpi->flags;
+			tun_id = key32_to_tunnel_id(tpi->key);
+
+			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
+						  sizeof(*md));
+			if (!tun_dst)
+				return PACKET_REJECT;
+
+			info = &tun_dst->u.tun_info;
+			md = ip_tunnel_info_opts(info);
+			if (!md)
+				return PACKET_REJECT;
+
+			md->index = index;
+			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+			info->options_len = sizeof(*md);
+
+			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+
+		} else {
+			tunnel->parms.index = ntohl(index);
+			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		}
 
 		return PACKET_RCVD;
 	}
@@ -857,42 +886,73 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	if (gre_handle_offloads(skb, false))
 		goto tx_err;
 
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-		prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
-					 &dsfield, &encap_limit);
-		break;
-	case htons(ETH_P_IPV6):
-		if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
-			goto tx_err;
-		if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
-					     &dsfield, &encap_limit))
-			goto tx_err;
-		break;
-	default:
-		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-		break;
-	}
-
 	if (skb->len > dev->mtu + dev->hard_header_len) {
 		pskb_trim(skb, dev->mtu + dev->hard_header_len);
 		truncate = true;
 	}
 
-	erspan_build_header(skb, t->parms.o_key, t->parms.index,
-			    truncate, false);
 	t->parms.o_flags &= ~TUNNEL_KEY;
-
 	IPCB(skb)->flags = 0;
-	fl6.daddr = t->parms.raddr;
+
+	/* For collect_md mode, derive fl6 from the tunnel key,
+	 * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
+	 */
+	if (t->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
+		struct erspan_metadata *md;
+
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info ||
+			     !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
+			return -EINVAL;
+
+		key = &tun_info->key;
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_GRE;
+		fl6.daddr = key->u.ipv6.dst;
+		fl6.flowlabel = key->label;
+		fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+		dsfield = key->tos;
+		md = ip_tunnel_info_opts(tun_info);
+		if (!md)
+			goto tx_err;
+
+		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+				    ntohl(md->index), truncate, false);
+
+	} else {
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+			prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+						 &dsfield, &encap_limit);
+			break;
+		case htons(ETH_P_IPV6):
+			if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+				goto tx_err;
+			if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
+						     &dsfield, &encap_limit))
+				goto tx_err;
+			break;
+		default:
+			memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+			break;
+		}
+
+		erspan_build_header(skb, t->parms.o_key, t->parms.index,
+				    truncate, false);
+		fl6.daddr = t->parms.raddr;
+	}
 
 	/* Push GRE header. */
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++));
 
 	/* TooBig packet may have updated dst->dev's mtu */
-	if (dst && dst_mtu(dst) > dst->dev->mtu)
+	if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
 		dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
 
 	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
-- 
cgit v1.2.3


From 9f8a739e72f1546fb0f8c518af1193522c45be12 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 5 Dec 2017 16:17:26 -0800
Subject: act_mirred: get rid of tcfm_ifindex from struct tcf_mirred

tcfm_dev always points to the correct netdev and we already
hold a refcnt, so there is no need to use tcfm_ifindex to look
it up again.

If we ever support moving the target netdev across netns, using
a pointer would be better than an ifindex.

This also fixes dumping an obsolete ifindex: once the target
device is gone, we now just dump 0 as the ifindex.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c          |  6 ++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c  | 12 +++++-------
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c     |  8 ++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c         |  6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c       |  5 ++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c        |  5 +----
 drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c |  3 +--
 drivers/net/ethernet/netronome/nfp/flower/action.c    |  4 +---
 include/net/tc_act/tc_mirred.h                        |  6 ++----
 net/dsa/slave.c                                       |  5 +----
 net/sched/act_mirred.c                                |  7 +++----
 11 files changed, 26 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 0a5d72c8d04e..9807214da206 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -54,12 +54,10 @@ static int bnxt_tc_parse_redir(struct bnxt *bp,
 			       struct bnxt_tc_actions *actions,
 			       const struct tc_action *tc_act)
 {
-	int ifindex = tcf_mirred_ifindex(tc_act);
-	struct net_device *dev;
+	struct net_device *dev = tcf_mirred_dev(tc_act);
 
-	dev = __dev_get_by_index(dev_net(bp->dev), ifindex);
 	if (!dev) {
-		netdev_info(bp->dev, "no dev for ifindex=%d", ifindex);
+		netdev_info(bp->dev, "no dev in mirred action");
 		return -EINVAL;
 	}
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index d4a548a6a55c..a12b894f135d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -405,9 +405,7 @@ static void cxgb4_process_flow_actions(struct net_device *in,
 		} else if (is_tcf_gact_shot(a)) {
 			fs->action = FILTER_DROP;
 		} else if (is_tcf_mirred_egress_redirect(a)) {
-			int ifindex = tcf_mirred_ifindex(a);
-			struct net_device *out = __dev_get_by_index(dev_net(in),
-								    ifindex);
+			struct net_device *out = tcf_mirred_dev(a);
 			struct port_info *pi = netdev_priv(out);
 
 			fs->action = FILTER_SWITCH;
@@ -582,14 +580,14 @@ static int cxgb4_validate_flow_actions(struct net_device *dev,
 			/* Do nothing */
 		} else if (is_tcf_mirred_egress_redirect(a)) {
 			struct adapter *adap = netdev2adap(dev);
-			struct net_device *n_dev;
-			unsigned int i, ifindex;
+			struct net_device *n_dev, *target_dev;
+			unsigned int i;
 			bool found = false;
 
-			ifindex = tcf_mirred_ifindex(a);
+			target_dev = tcf_mirred_dev(a);
 			for_each_port(adap, i) {
 				n_dev = adap->port[i];
-				if (ifindex == n_dev->ifindex) {
+				if (target_dev == n_dev) {
 					found = true;
 					break;
 				}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
index cd0cd13a964d..ab174bcfbfb0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
@@ -114,14 +114,14 @@ static int fill_action_fields(struct adapter *adap,
 
 		/* Re-direct to specified port in hardware. */
 		if (is_tcf_mirred_egress_redirect(a)) {
-			struct net_device *n_dev;
-			unsigned int i, index;
+			struct net_device *n_dev, *target_dev;
 			bool found = false;
+			unsigned int i;
 
-			index = tcf_mirred_ifindex(a);
+			target_dev = tcf_mirred_dev(a);
 			for_each_port(adap, i) {
 				n_dev = adap->port[i];
-				if (index == n_dev->ifindex) {
+				if (target_dev == n_dev) {
 					fs->action = FILTER_SWITCH;
 					fs->eport = i;
 					found = true;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 62a18914f00f..7737a05c717c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9101,9 +9101,11 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 
 		/* Redirect to a VF or a offloaded macvlan */
 		if (is_tcf_mirred_egress_redirect(a)) {
-			int ifindex = tcf_mirred_ifindex(a);
+			struct net_device *dev = tcf_mirred_dev(a);
 
-			err = handle_redirect_action(adapter, ifindex, queue,
+			if (!dev)
+				return -EINVAL;
+			err = handle_redirect_action(adapter, dev->ifindex, queue,
 						     action);
 			if (err == 0)
 				return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 55979ec2e88a..3e03d2e8f96a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1982,11 +1982,10 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 		}
 
 		if (is_tcf_mirred_egress_redirect(a)) {
-			int ifindex = tcf_mirred_ifindex(a);
 			struct net_device *out_dev;
 			struct mlx5e_priv *out_priv;
 
-			out_dev = __dev_get_by_index(dev_net(priv->netdev), ifindex);
+			out_dev = tcf_mirred_dev(a);
 
 			if (switchdev_port_same_parent_id(priv->netdev,
 							  out_dev)) {
@@ -1996,7 +1995,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				rpriv = out_priv->ppriv;
 				attr->out_rep = rpriv->rep;
 			} else if (encap) {
-				parse_attr->mirred_ifindex = ifindex;
+				parse_attr->mirred_ifindex = out_dev->ifindex;
 				parse_attr->tun_info = *info;
 				attr->parse_attr = parse_attr;
 				attr->action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 2d0897b7d860..d2d945f2fc02 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1571,14 +1571,11 @@ mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port,
 				      const struct tc_action *a,
 				      bool ingress)
 {
-	struct net *net = dev_net(mlxsw_sp_port->dev);
 	enum mlxsw_sp_span_type span_type;
 	struct mlxsw_sp_port *to_port;
 	struct net_device *to_dev;
-	int ifindex;
 
-	ifindex = tcf_mirred_ifindex(a);
-	to_dev = __dev_get_by_index(net, ifindex);
+	to_dev = tcf_mirred_dev(a);
 	if (!to_dev) {
 		netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 2f0e57857ea4..347e96461273 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -92,7 +92,6 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
 			if (err)
 				return err;
 		} else if (is_tcf_mirred_egress_redirect(a)) {
-			int ifindex = tcf_mirred_ifindex(a);
 			struct net_device *out_dev;
 			struct mlxsw_sp_fid *fid;
 			u16 fid_index;
@@ -104,7 +103,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
 			if (err)
 				return err;
 
-			out_dev = __dev_get_by_index(dev_net(dev), ifindex);
+			out_dev = tcf_mirred_dev(a);
 			if (out_dev == dev)
 				out_dev = NULL;
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index c1c595f8bb87..ca74c517f626 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -93,13 +93,11 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 	size_t act_size = sizeof(struct nfp_fl_output);
 	struct net_device *out_dev;
 	u16 tmp_flags;
-	int ifindex;
 
 	output->head.jump_id = NFP_FL_ACTION_OPCODE_OUTPUT;
 	output->head.len_lw = act_size >> NFP_FL_LW_SIZ;
 
-	ifindex = tcf_mirred_ifindex(action);
-	out_dev = __dev_get_by_index(dev_net(in_dev), ifindex);
+	out_dev = tcf_mirred_dev(action);
 	if (!out_dev)
 		return -EOPNOTSUPP;
 
diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 21d253c9a8c6..a2e9cbca5c9e 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -8,10 +8,8 @@
 struct tcf_mirred {
 	struct tc_action	common;
 	int			tcfm_eaction;
-	int			tcfm_ifindex;
 	bool			tcfm_mac_header_xmit;
 	struct net_device __rcu	*tcfm_dev;
-	struct net		*net;
 	struct list_head	tcfm_list;
 };
 #define to_mirred(a) ((struct tcf_mirred *)a)
@@ -34,9 +32,9 @@ static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a)
 	return false;
 }
 
-static inline int tcf_mirred_ifindex(const struct tc_action *a)
+static inline struct net_device *tcf_mirred_dev(const struct tc_action *a)
 {
-	return to_mirred(a)->tcfm_ifindex;
+	return rtnl_dereference(to_mirred(a)->tcfm_dev);
 }
 
 #endif /* __NET_TC_MIR_H */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d6e7a642493b..895db05d8c82 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -709,14 +709,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_mall_tc_entry *mall_tc_entry;
 	__be16 protocol = cls->common.protocol;
-	struct net *net = dev_net(dev);
 	struct dsa_switch *ds = dp->ds;
 	struct net_device *to_dev;
 	const struct tc_action *a;
 	struct dsa_port *to_dp;
 	int err = -EOPNOTSUPP;
 	LIST_HEAD(actions);
-	int ifindex;
 
 	if (!ds->ops->port_mirror_add)
 		return err;
@@ -730,8 +728,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
 	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
 		struct dsa_mall_mirror_tc_entry *mirror;
 
-		ifindex = tcf_mirred_ifindex(a);
-		to_dev = __dev_get_by_index(net, ifindex);
+		to_dev = tcf_mirred_dev(a);
 		if (!to_dev)
 			return -EINVAL;
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 590f56afb985..ff497909c0ad 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -139,8 +139,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	m->tcf_action = parm->action;
 	m->tcfm_eaction = parm->eaction;
 	if (dev != NULL) {
-		m->tcfm_ifindex = parm->ifindex;
-		m->net = net;
 		if (ret != ACT_P_CREATED)
 			dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
 		dev_hold(dev);
@@ -247,13 +245,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_mirred *m = to_mirred(a);
+	struct net_device *dev = rtnl_dereference(m->tcfm_dev);
 	struct tc_mirred opt = {
 		.index   = m->tcf_index,
 		.action  = m->tcf_action,
 		.refcnt  = m->tcf_refcnt - ref,
 		.bindcnt = m->tcf_bindcnt - bind,
 		.eaction = m->tcfm_eaction,
-		.ifindex = m->tcfm_ifindex,
+		.ifindex = dev ? dev->ifindex : 0,
 	};
 	struct tcf_t t;
 
@@ -318,7 +317,7 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
 {
 	struct tcf_mirred *m = to_mirred(a);
 
-	return __dev_get_by_index(m->net, m->tcfm_ifindex);
+	return rtnl_dereference(m->tcfm_dev);
 }
 
 static struct tc_action_ops act_mirred_ops = {
-- 
cgit v1.2.3


From 4bee00eb25a7b487706d17926db4679ae8f0e411 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 5 Dec 2017 16:17:27 -0800
Subject: act_mirred: get rid of mirred_list_lock spinlock

TC actions are no longer freed in RCU callbacks and we should
always have RTNL lock, so this spinlock is no longer needed.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_mirred.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index ff497909c0ad..cee2d413bf57 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,7 +29,6 @@
 #include <net/tc_act/tc_mirred.h>
 
 static LIST_HEAD(mirred_list);
-static DEFINE_SPINLOCK(mirred_list_lock);
 
 static bool tcf_mirred_is_act_redirect(int action)
 {
@@ -55,13 +54,10 @@ static void tcf_mirred_release(struct tc_action *a)
 	struct tcf_mirred *m = to_mirred(a);
 	struct net_device *dev;
 
-	/* We could be called either in a RCU callback or with RTNL lock held. */
-	spin_lock_bh(&mirred_list_lock);
 	list_del(&m->tcfm_list);
-	dev = rcu_dereference_protected(m->tcfm_dev, 1);
+	dev = rtnl_dereference(m->tcfm_dev);
 	if (dev)
 		dev_put(dev);
-	spin_unlock_bh(&mirred_list_lock);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -147,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (ret == ACT_P_CREATED) {
-		spin_lock_bh(&mirred_list_lock);
 		list_add(&m->tcfm_list, &mirred_list);
-		spin_unlock_bh(&mirred_list_lock);
 		tcf_idr_insert(tn, *a);
 	}
 
@@ -293,7 +287,6 @@ static int mirred_device_event(struct notifier_block *unused,
 
 	ASSERT_RTNL();
 	if (event == NETDEV_UNREGISTER) {
-		spin_lock_bh(&mirred_list_lock);
 		list_for_each_entry(m, &mirred_list, tcfm_list) {
 			if (rcu_access_pointer(m->tcfm_dev) == dev) {
 				dev_put(dev);
@@ -303,7 +296,6 @@ static int mirred_device_event(struct notifier_block *unused,
 				RCU_INIT_POINTER(m->tcfm_dev, NULL);
 			}
 		}
-		spin_unlock_bh(&mirred_list_lock);
 	}
 
 	return NOTIFY_DONE;
-- 
cgit v1.2.3


From d9b8693783e8a1ce8e4495d13d1dd6d8081f5070 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Wed, 6 Dec 2017 10:47:04 +0900
Subject: rds: debug: fix null check on static array

t_name cannot be NULL since it is an array field of a struct.
Replace the NULL check on the array with a string length check using
strnlen().

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 9efc82c665b5..6492c0b608a4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 
 	rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
 	  conn, &laddr, &faddr,
-	  trans->t_name ? trans->t_name : "[unknown]",
-	  is_outgoing ? "(outgoing)" : "");
+	  strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
+	  "[unknown]", is_outgoing ? "(outgoing)" : "");
 
 	/*
 	 * Since we ran without holding the conn lock, someone could
-- 
cgit v1.2.3


From 2a93c1a3651fb41b580676c849887b68af6da02b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 6 Dec 2017 15:03:33 -0800
Subject: net: dsa: Allow compiling out legacy support

Introduce a configuration option, CONFIG_NET_DSA_LEGACY, that allows
compiling out support for the old platform device and Device Tree binding
registration. Support for these configurations is scheduled to be removed
in 4.17.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/Kconfig          |  2 +-
 drivers/net/dsa/mv88e6xxx/chip.c |  4 ++++
 include/net/dsa.h                | 11 +++++++++++
 net/dsa/Kconfig                  |  9 +++++++++
 net/dsa/Makefile                 |  3 ++-
 net/dsa/dsa_priv.h               |  9 +++++++++
 net/dsa/legacy.c                 | 20 --------------------
 net/dsa/slave.c                  | 20 ++++++++++++++++++++
 8 files changed, 56 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 83a9bc892a3b..2b81b97e994f 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -33,7 +33,7 @@ config NET_DSA_MT7530
 
 config NET_DSA_MV88E6060
 	tristate "Marvell 88E6060 ethernet switch chip support"
-	depends on NET_DSA
+	depends on NET_DSA && NET_DSA_LEGACY
 	select NET_DSA_TAG_TRAILER
 	---help---
 	  This enables support for the Marvell 88E6060 ethernet switch
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 29b79d6d2925..24e5d98f15a1 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3755,6 +3755,7 @@ static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds,
 	return chip->info->tag_protocol;
 }
 
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 static const char *mv88e6xxx_drv_probe(struct device *dsa_dev,
 				       struct device *host_dev, int sw_addr,
 				       void **priv)
@@ -3802,6 +3803,7 @@ free:
 
 	return NULL;
 }
+#endif
 
 static int mv88e6xxx_port_mdb_prepare(struct dsa_switch *ds, int port,
 				      const struct switchdev_obj_port_mdb *mdb)
@@ -3841,7 +3843,9 @@ static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, int port,
 }
 
 static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 	.probe			= mv88e6xxx_drv_probe,
+#endif
 	.get_tag_protocol	= mv88e6xxx_get_tag_protocol,
 	.setup			= mv88e6xxx_setup,
 	.adjust_link		= mv88e6xxx_adjust_link,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index d29feccaefab..6cb602dd970c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -321,12 +321,14 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port)
 typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
 			      bool is_static, void *data);
 struct dsa_switch_ops {
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 	/*
 	 * Legacy probing.
 	 */
 	const char	*(*probe)(struct device *dsa_dev,
 				  struct device *host_dev, int sw_addr,
 				  void **priv);
+#endif
 
 	enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
 						  int port);
@@ -474,11 +476,20 @@ struct dsa_switch_driver {
 	const struct dsa_switch_ops *ops;
 };
 
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 /* Legacy driver registration */
 void register_switch_driver(struct dsa_switch_driver *type);
 void unregister_switch_driver(struct dsa_switch_driver *type);
 struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
 
+#else
+static inline void register_switch_driver(struct dsa_switch_driver *type) { }
+static inline void unregister_switch_driver(struct dsa_switch_driver *type) { }
+static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
+{
+	return NULL;
+}
+#endif
 struct net_device *dsa_dev_to_net_device(struct device *dev);
 
 /* Keep inline for faster access in hot path */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 03c3bdf25468..bbf2c82cf7b2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -16,6 +16,15 @@ config NET_DSA
 
 if NET_DSA
 
+config NET_DSA_LEGACY
+	bool "Support for older platform device and Device Tree registration"
+	default y
+	---help---
+	  Say Y if you want to enable support for the older platform device and
+	  deprecated Device Tree binding registration.
+
+	  This feature is scheduled for removal in 4.17.
+
 # tagging formats
 config NET_DSA_TAG_BRCM
 	bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 0e13c1f95d13..9e4d3536f977 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o
+dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
+dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7d036696e8c4..b03665e8fb4e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -97,8 +97,17 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 int dsa_legacy_register(void);
 void dsa_legacy_unregister(void);
+#else
+static inline int dsa_legacy_register(void)
+{
+	return -ENODEV;
+}
+
+static inline void dsa_legacy_unregister(void) { }
+#endif
 int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 		       struct net_device *dev,
 		       const unsigned char *addr, u16 vid,
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 84611d7fcfa2..aa56d3fb5da4 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -718,26 +718,6 @@ static int dsa_resume(struct device *d)
 }
 #endif
 
-/* legacy way, bypassing the bridge *****************************************/
-int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev,
-		       const unsigned char *addr, u16 vid,
-		       u16 flags)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_fdb_add(dp, addr, vid);
-}
-
-int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev,
-		       const unsigned char *addr, u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_fdb_del(dp, addr, vid);
-}
-
 static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
 
 static const struct of_device_id dsa_of_match_table[] = {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 895db05d8c82..5d6475a6cc5d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -941,6 +941,26 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.set_rxnfc		= dsa_slave_set_rxnfc,
 };
 
+/* legacy way, bypassing the bridge *****************************************/
+int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+		       struct net_device *dev,
+		       const unsigned char *addr, u16 vid,
+		       u16 flags)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return dsa_port_fdb_add(dp, addr, vid);
+}
+
+int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+		       struct net_device *dev,
+		       const unsigned char *addr, u16 vid)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return dsa_port_fdb_del(dp, addr, vid);
+}
+
 static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_open	 	= dsa_slave_open,
 	.ndo_stop		= dsa_slave_close,
-- 
cgit v1.2.3


From a8ae890b9c5f0d812337d97f2552c0d772957dc2 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:44 +0100
Subject: smc: make smc_close_active_abort() static

smc_close_active_abort() is used in smc_close.c only.
Make it static.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 2 +-
 net/smc/smc_close.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 48615d2ac4aa..e194c6cc308a 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -113,7 +113,7 @@ static int smc_close_abort(struct smc_connection *conn)
 /* terminate smc socket abnormally - active abort
  * RDMA communication no longer possible
  */
-void smc_close_active_abort(struct smc_sock *smc)
+static void smc_close_active_abort(struct smc_sock *smc)
 {
 	struct smc_cdc_conn_state_flags *txflags =
 		&smc->conn.local_tx_ctrl.conn_state_flags;
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index ed82506b1b0a..8c498885d758 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -20,7 +20,6 @@
 #define SMC_CLOSE_SOCK_PUT_DELAY		HZ
 
 void smc_close_wake_tx_prepared(struct smc_sock *smc);
-void smc_close_active_abort(struct smc_sock *smc);
 int smc_close_active(struct smc_sock *smc);
 void smc_close_sock_put_work(struct work_struct *work);
 int smc_close_shutdown_write(struct smc_sock *smc);
-- 
cgit v1.2.3


From 0c9f1515aa80f12734123e5fcc50ffe525e1d533 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:45 +0100
Subject: smc: improve smc_clc_send_decline() error handling

Let smc_clc_send_decline() return an error if the amount
sent is smaller than the length of an SMC decline message.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  | 9 +++------
 net/smc/smc_clc.c | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6451c5013e06..d3ae0d5b1677 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -520,7 +520,7 @@ decline_rdma:
 	smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 		rc = smc_clc_send_decline(smc, reason_code);
-		if (rc < sizeof(struct smc_clc_msg_decline))
+		if (rc < 0)
 			goto out_err;
 	}
 	goto out_connected;
@@ -879,11 +879,9 @@ static void smc_listen_work(struct work_struct *work)
 		}
 		/* QP confirmation over RoCE fabric */
 		reason_code = smc_serv_conf_first_link(new_smc);
-		if (reason_code < 0) {
+		if (reason_code < 0)
 			/* peer is not aware of a problem */
-			rc = reason_code;
 			goto out_err_unlock;
-		}
 		if (reason_code > 0)
 			goto decline_rdma_unlock;
 	}
@@ -916,8 +914,7 @@ decline_rdma:
 	smc_conn_free(&new_smc->conn);
 	new_smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		rc = smc_clc_send_decline(new_smc, reason_code);
-		if (rc < sizeof(struct smc_clc_msg_decline))
+		if (smc_clc_send_decline(new_smc, reason_code) < 0)
 			goto out_err;
 	}
 	goto out_connected;
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 1800e16b2a02..f5e17d29112b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -133,7 +133,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
 		smc->sk.sk_err = EPROTO;
 	if (len < 0)
 		smc->sk.sk_err = -len;
-	return len;
+	return sock_error(&smc->sk);
 }
 
 /* send CLC PROPOSAL message across internal TCP socket */
-- 
cgit v1.2.3


From 4bd3e7fbfadcd284b1582a284d076afebbe3479d Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:46 +0100
Subject: smc: no update for unused sk_write_pending

The smc code never checks the sk_write_pending sock field.
Thus there is no need to update it.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index c48dc2d5fd3a..77555c6ed199 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -104,14 +104,12 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 		if (atomic_read(&conn->sndbuf_space))
 			break; /* at least 1 byte of free space available */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		sk->sk_write_pending++;
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close_or_senddone(conn) ||
 			      atomic_read(&conn->sndbuf_space),
 			      &wait);
-		sk->sk_write_pending--;
 	}
 	remove_wait_queue(sk_sleep(sk), &wait);
 	return rc;
-- 
cgit v1.2.3


From 71c125c3f23d714c1d0725ca11c9f27416f697c8 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:47 +0100
Subject: smc: cleanup close checking during data receival

When waiting for data to be received it must be checked if the
peer signals shutdown. The SMC code uses two different checks
for this purpose, even though just one check is sufficient.
This patch removes the superfluous test for SOCK_DONE.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_rx.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index cbf58637ee14..9dc392ca06bf 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -65,7 +65,6 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
 	rc = sk_wait_event(sk, timeo,
 			   sk->sk_err ||
 			   sk->sk_shutdown & RCV_SHUTDOWN ||
-			   sock_flag(sk, SOCK_DONE) ||
 			   atomic_read(&conn->bytes_to_rcv) ||
 			   smc_cdc_rxed_any_close_or_senddone(conn),
 			   &wait);
@@ -116,7 +115,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		if (read_done) {
 			if (sk->sk_err ||
 			    sk->sk_state == SMC_CLOSED ||
-			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    sk->sk_shutdown & RCV_SHUTDOWN ||
 			    !timeo ||
 			    signal_pending(current) ||
 			    smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -124,8 +123,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 			    peer_conn_abort)
 				break;
 		} else {
-			if (sock_flag(sk, SOCK_DONE))
-				break;
 			if (sk->sk_err) {
 				read_done = sock_error(sk);
 				break;
-- 
cgit v1.2.3


From 6b5771aa3c351b118b1eee7bc98e1483eb0c8ca8 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:48 +0100
Subject: smc: no consumer update in tasklet context

The SMC protocol requires a separate consumer cursor update to be sent
if it cannot be piggybacked onto an update of the producer cursor.
When a blocked signal is received from the sender, this update is
already sent in tasklet context. In addition, consumer cursor updates
are sent after data has been received.
The sending of cursor updates is controlled by sequence numbers.
To cope with stray messages, the receiver drops any update whose
sequence number is older than that of a cursor update it has already
received.
Sending consumer cursor updates in tasklet context may result in
out-of-order sends and the corresponding drops at the receiver. Since
it is sufficient to send consumer cursor updates once the data has been
received, this patch gets rid of the consumer cursor update in tasklet
context to guarantee in-sequence arrival of cursor updates.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c | 12 +++---------
 net/smc/smc_tx.c  |  9 ++-------
 2 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 87f7bede6eab..d4155ff6acde 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -213,6 +213,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
+	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+		   (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
+		smc->sk.sk_data_ready(&smc->sk);
 	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
@@ -234,15 +237,6 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* trigger socket release if connection closed */
 		smc_close_wake_tx_prepared(smc);
 	}
-
-	/* socket connected but not accepted */
-	if (!smc->sk.sk_socket)
-		return;
-
-	/* data available */
-	if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
-	    (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
-		smc_tx_consumer_update(conn);
 }
 
 /* called under tasklet context */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 77555c6ed199..2e50fddf8ce9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -448,9 +448,7 @@ static void smc_tx_work(struct work_struct *work)
 void smc_tx_consumer_update(struct smc_connection *conn)
 {
 	union smc_host_cursor cfed, cons;
-	struct smc_cdc_tx_pend *pend;
-	struct smc_wr_buf *wr_buf;
-	int to_confirm, rc;
+	int to_confirm;
 
 	smc_curs_write(&cons,
 		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
@@ -464,10 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmbe_size / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
-		rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
-		if (!rc)
-			rc = smc_cdc_msg_send(conn, wr_buf, pend);
-		if (rc < 0) {
+		if (smc_cdc_get_slot_and_msg_send(conn) < 0) {
 			schedule_delayed_work(&conn->tx_work,
 					      SMC_TX_WORK_DELAY);
 			return;
-- 
cgit v1.2.3


From e7b7a64a8493d47433fd003efbe6543e3f676294 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 13:38:49 +0100
Subject: smc: support variable CLC proposal messages

According to RFC7609 [1] the CLC proposal message contains an area of
unknown length for future growth. Additionally it may contain up to
8 IPv6 prefixes. The current version of the SMC-code does not
understand CLC proposal messages using these variable length fields and,
thus, is incompatible with SMC implementations in other operating
systems.

This patch makes sure that SMC understands incoming CLC proposals
* with arbitrary length values for future growth
* with up to 8 IPv6 prefixes

[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reviewed-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  | 15 ++++++----
 net/smc/smc_clc.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++---------
 net/smc/smc_clc.h | 34 +++++++++++++++++++----
 3 files changed, 107 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d3ae0d5b1677..daf8075f5a4c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -751,14 +751,16 @@ static void smc_listen_work(struct work_struct *work)
 {
 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
 						smc_listen_work);
+	struct smc_clc_msg_proposal_prefix *pclc_prfx;
 	struct socket *newclcsock = new_smc->clcsock;
 	struct smc_sock *lsmc = new_smc->listen_smc;
 	struct smc_clc_msg_accept_confirm cclc;
 	int local_contact = SMC_REUSE_CONTACT;
 	struct sock *newsmcsk = &new_smc->sk;
-	struct smc_clc_msg_proposal pclc;
+	struct smc_clc_msg_proposal *pclc;
 	struct smc_ib_device *smcibdev;
 	struct sockaddr_in peeraddr;
+	u8 buf[SMC_CLC_MAX_LEN];
 	struct smc_link *link;
 	int reason_code = 0;
 	int rc = 0, len;
@@ -775,7 +777,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* do inband token exchange -
 	 *wait for and receive SMC Proposal CLC message
 	 */
-	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
 				       SMC_CLC_PROPOSAL);
 	if (reason_code < 0)
 		goto out_err;
@@ -804,8 +806,11 @@ static void smc_listen_work(struct work_struct *work)
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
-	if ((pclc.outgoing_subnet != subnet) ||
-	    (pclc.prefix_len != prefix_len)) {
+
+	pclc = (struct smc_clc_msg_proposal *)&buf;
+	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+	if (pclc_prfx->outgoing_subnet != subnet ||
+	    pclc_prfx->prefix_len != prefix_len) {
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
@@ -816,7 +821,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* allocate connection / link group */
 	mutex_lock(&smc_create_lgr_pending);
 	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
-					smcibdev, ibport, &pclc.lcl, 0);
+					smcibdev, ibport, &pclc->lcl, 0);
 	if (local_contact < 0) {
 		rc = local_contact;
 		if (rc == -ENOMEM)
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index f5e17d29112b..abf7ceb6690b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -22,6 +22,54 @@
 #include "smc_clc.h"
 #include "smc_ib.h"
 
+/* check if received message has a correct header length and contains valid
+ * heading and trailing eyecatchers
+ */
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
+{
+	struct smc_clc_msg_proposal_prefix *pclc_prfx;
+	struct smc_clc_msg_accept_confirm *clc;
+	struct smc_clc_msg_proposal *pclc;
+	struct smc_clc_msg_decline *dclc;
+	struct smc_clc_msg_trail *trl;
+
+	if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+		return false;
+	switch (clcm->type) {
+	case SMC_CLC_PROPOSAL:
+		pclc = (struct smc_clc_msg_proposal *)clcm;
+		pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+		if (ntohs(pclc->hdr.length) !=
+			sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+			sizeof(*pclc_prfx) +
+			pclc_prfx->ipv6_prefixes_cnt *
+				sizeof(struct smc_clc_ipv6_prefix) +
+			sizeof(*trl))
+			return false;
+		trl = (struct smc_clc_msg_trail *)
+			((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
+		break;
+	case SMC_CLC_ACCEPT:
+	case SMC_CLC_CONFIRM:
+		clc = (struct smc_clc_msg_accept_confirm *)clcm;
+		if (ntohs(clc->hdr.length) != sizeof(*clc))
+			return false;
+		trl = &clc->trl;
+		break;
+	case SMC_CLC_DECLINE:
+		dclc = (struct smc_clc_msg_decline *)clcm;
+		if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+			return false;
+		trl = &dclc->trl;
+		break;
+	default:
+		return false;
+	}
+	if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+		return false;
+	return true;
+}
+
 /* Wait for data on the tcp-socket, analyze received data
  * Returns:
  * 0 if success and it was not a decline that we received.
@@ -72,9 +120,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 	datlen = ntohs(clcm->length);
 	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
-	    (datlen < sizeof(struct smc_clc_msg_decline)) ||
-	    (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
-	    memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+	    (datlen > buflen) ||
 	    ((clcm->type != SMC_CLC_DECLINE) &&
 	     (clcm->type != expected_type))) {
 		smc->sk.sk_err = EPROTO;
@@ -89,7 +135,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	krflags = MSG_WAITALL;
 	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
 	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
-	if (len < datlen) {
+	if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
 		smc->sk.sk_err = EPROTO;
 		reason_code = -EPROTO;
 		goto out;
@@ -141,33 +187,43 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 			  struct smc_ib_device *smcibdev,
 			  u8 ibport)
 {
+	struct smc_clc_msg_proposal_prefix pclc_prfx;
 	struct smc_clc_msg_proposal pclc;
+	struct smc_clc_msg_trail trl;
 	int reason_code = 0;
+	struct kvec vec[3];
 	struct msghdr msg;
-	struct kvec vec;
-	int len, rc;
+	int len, plen, rc;
 
 	/* send SMC Proposal CLC message */
+	plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
 	memset(&pclc, 0, sizeof(pclc));
 	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	pclc.hdr.type = SMC_CLC_PROPOSAL;
-	pclc.hdr.length = htons(sizeof(pclc));
+	pclc.hdr.length = htons(plen);
 	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
 	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
 	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
 	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+	pclc.iparea_offset = htons(0);
 
+	memset(&pclc_prfx, 0, sizeof(pclc_prfx));
 	/* determine subnet and mask from internal TCP socket */
-	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
-				  &pclc.prefix_len);
+	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
+				  &pclc_prfx.prefix_len);
 	if (rc)
 		return SMC_CLC_DECL_CNFERR; /* configuration error */
-	memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	pclc_prfx.ipv6_prefixes_cnt = 0;
+	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
-	vec.iov_base = &pclc;
-	vec.iov_len = sizeof(pclc);
+	vec[0].iov_base = &pclc;
+	vec[0].iov_len = sizeof(pclc);
+	vec[1].iov_base = &pclc_prfx;
+	vec[1].iov_len = sizeof(pclc_prfx);
+	vec[2].iov_base = &trl;
+	vec[2].iov_len = sizeof(trl);
 	/* due to the few bytes needed for clc-handshake this cannot block */
-	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+	len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen);
 	if (len < sizeof(pclc)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 12a9af1539a2..c145a0f36a68 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -44,7 +44,7 @@ struct smc_clc_msg_hdr {	/* header1 of clc messages */
 #if defined(__BIG_ENDIAN_BITFIELD)
 	u8 version : 4,
 	   flag    : 1,
-	   rsvd	   : 3;
+	   rsvd    : 3;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
 	u8 rsvd    : 3,
 	   flag    : 1,
@@ -62,17 +62,31 @@ struct smc_clc_msg_local {	/* header2 of clc messages */
 	u8 mac[6];		/* mac of ib_device port */
 };
 
-struct smc_clc_msg_proposal {	/* clc proposal message */
-	struct smc_clc_msg_hdr hdr;
-	struct smc_clc_msg_local lcl;
-	__be16 iparea_offset;	/* offset to IP address information area */
+struct smc_clc_ipv6_prefix {
+	u8 prefix[4];
+	u8 prefix_len;
+} __packed;
+
+struct smc_clc_msg_proposal_prefix {	/* prefix part of clc proposal message*/
 	__be32 outgoing_subnet;	/* subnet mask */
 	u8 prefix_len;		/* number of significant bits in mask */
 	u8 reserved[2];
 	u8 ipv6_prefixes_cnt;	/* number of IPv6 prefixes in prefix array */
-	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
 } __aligned(4);
 
+struct smc_clc_msg_proposal {	/* clc proposal message sent by Linux */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	__be16 iparea_offset;	/* offset to IP address information area */
+} __aligned(4);
+
+#define SMC_CLC_PROPOSAL_MAX_OFFSET	0x28
+#define SMC_CLC_PROPOSAL_MAX_PREFIX	(8 * sizeof(struct smc_clc_ipv6_prefix))
+#define SMC_CLC_MAX_LEN		(sizeof(struct smc_clc_msg_proposal) + \
+				 SMC_CLC_PROPOSAL_MAX_OFFSET + \
+				 SMC_CLC_PROPOSAL_MAX_PREFIX + \
+				 sizeof(struct smc_clc_msg_trail))
+
 struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 	struct smc_clc_msg_hdr hdr;
 	struct smc_clc_msg_local lcl;
@@ -102,6 +116,14 @@ struct smc_clc_msg_decline {	/* clc decline message */
 	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
 } __aligned(4);
 
+/* determine start of the prefix area within the proposal message */
+static inline struct smc_clc_msg_proposal_prefix *
+smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
+{
+	return (struct smc_clc_msg_proposal_prefix *)
+	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
+}
+
 struct smc_sock;
 struct smc_ib_device;
 
-- 
cgit v1.2.3


From 6c148184b5c868ad2c8a5a4a777cd8097622368a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:54:06 -0800
Subject: net: sched: cleanup qdisc_run and __qdisc_run semantics

Currently __qdisc_run calls qdisc_run_end() but does not call
qdisc_run_begin(). This makes it hard to track pairs of
qdisc_run_{begin,end} across function calls.

To simplify reading these code paths this patch moves begin/end calls
into qdisc_run().

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 4 +++-
 net/core/dev.c          | 5 +++--
 net/sched/sch_generic.c | 2 --
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index d1f413f06c72..4eea7198898e 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -113,8 +113,10 @@ void __qdisc_run(struct Qdisc *q);
 
 static inline void qdisc_run(struct Qdisc *q)
 {
-	if (qdisc_run_begin(q))
+	if (qdisc_run_begin(q)) {
 		__qdisc_run(q);
+		qdisc_run_end(q);
+	}
 }
 
 static inline __be16 tc_skb_protocol(const struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bea8931bb62..44c7de365f55 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3192,9 +3192,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
-		} else
-			qdisc_run_end(q);
+		}
 
+		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
@@ -3204,6 +3204,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
+			qdisc_run_end(q);
 		}
 	}
 	spin_unlock(root_lock);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3839cbbdc32b..f6803e1d6783 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -266,8 +266,6 @@ void __qdisc_run(struct Qdisc *q)
 			break;
 		}
 	}
-
-	qdisc_run_end(q);
 }
 
 unsigned long dev_trans_start(struct net_device *dev)
-- 
cgit v1.2.3


From 6b3ba9146fe64b9bebb6346c9dcfe3b4851de2d7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:54:25 -0800
Subject: net: sched: allow qdiscs to handle locking

This patch adds a flag for queueing disciplines to indicate the stack
does not need to use the qdisc lock to protect operations. This can
be used to build lockless scheduling algorithms and improve
performance.

The flag is checked in the tx path and the qdisc lock is only taken
if it is not set. For now use a conditional if statement. Later we
could be more aggressive if it proves worthwhile and use a static key
or wrap this in a likely().

Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason
for this is that synchronizing a qlen counter across threads proves to
cost more than doing the enqueue/dequeue operations when tested with
pktgen.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  1 +
 net/core/dev.c            | 26 ++++++++++++++++++++++----
 net/sched/sch_generic.c   | 30 ++++++++++++++++++++----------
 3 files changed, 43 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7dd8b0b0d244..77791fa055de 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/net/core/dev.c b/net/core/dev.c
index 44c7de365f55..e32cf5c7f200 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+			__qdisc_drop(skb, &to_free);
+			rc = NET_XMIT_DROP;
+		} else {
+			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+			__qdisc_run(q);
+		}
+
+		if (unlikely(to_free))
+			kfree_skb_list(to_free);
+		return rc;
+	}
+
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -4144,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (head) {
 			struct Qdisc *q = head;
-			spinlock_t *root_lock;
+			spinlock_t *root_lock = NULL;
 
 			head = head->next_sched;
 
-			root_lock = qdisc_lock(q);
-			spin_lock(root_lock);
+			if (!(q->flags & TCQ_F_NOLOCK)) {
+				root_lock = qdisc_lock(q);
+				spin_lock(root_lock);
+			}
 			/* We need to make sure head->next_sched is read
 			 * before clearing __QDISC_STATE_SCHED
 			 */
 			smp_mb__before_atomic();
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 			qdisc_run(q);
-			spin_unlock(root_lock);
+			if (root_lock)
+				spin_unlock(root_lock);
 		}
 	}
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f6803e1d6783..ec757f66896a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -174,7 +174,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	int ret = NETDEV_TX_BUSY;
 
 	/* And release qdisc */
-	spin_unlock(root_lock);
+	if (root_lock)
+		spin_unlock(root_lock);
 
 	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
 	if (validate)
@@ -187,10 +188,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
-		spin_lock(root_lock);
+		if (root_lock)
+			spin_lock(root_lock);
 		return qdisc_qlen(q);
 	}
-	spin_lock(root_lock);
+
+	if (root_lock)
+		spin_lock(root_lock);
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
@@ -231,9 +235,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  */
 static inline int qdisc_restart(struct Qdisc *q, int *packets)
 {
+	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
 	struct net_device *dev;
-	spinlock_t *root_lock;
 	struct sk_buff *skb;
 	bool validate;
 
@@ -242,7 +246,9 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 	if (unlikely(!skb))
 		return 0;
 
-	root_lock = qdisc_lock(q);
+	if (!(q->flags & TCQ_F_NOLOCK))
+		root_lock = qdisc_lock(q);
+
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
@@ -880,14 +886,18 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 
 		dev_queue = netdev_get_tx_queue(dev, i);
 		q = dev_queue->qdisc_sleeping;
-		root_lock = qdisc_lock(q);
 
-		spin_lock_bh(root_lock);
+		if (q->flags & TCQ_F_NOLOCK) {
+			val = test_bit(__QDISC_STATE_SCHED, &q->state);
+		} else {
+			root_lock = qdisc_lock(q);
+			spin_lock_bh(root_lock);
 
-		val = (qdisc_is_running(q) ||
-		       test_bit(__QDISC_STATE_SCHED, &q->state));
+			val = (qdisc_is_running(q) ||
+			       test_bit(__QDISC_STATE_SCHED, &q->state));
 
-		spin_unlock_bh(root_lock);
+			spin_unlock_bh(root_lock);
+		}
 
 		if (val)
 			return true;
-- 
cgit v1.2.3


From 29b86cdac00a82f88b81c16659e64cc624550216 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:54:47 -0800
Subject: net: sched: remove remaining uses for qdisc_qlen in xmit path

sch_direct_xmit() uses qdisc_qlen as a return value but all call sites
of the routine only check if it is zero or not. Simplify the logic so
that we don't need to return an actual queue length value.

This introduces a case where sch_direct_xmit() previously would have
returned a qlen of zero but now returns true. However, in this case
every call site of sch_direct_xmit() will perform a dequeue(), get
a NULL skb and abort. This trades tracking qlen in the hotpath for
an extra dequeue operation. Overall this seems to be good for
performance.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h |  6 +++---
 net/sched/sch_generic.c | 28 +++++++++++++---------------
 2 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 4eea7198898e..240469228851 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -105,9 +105,9 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 void qdisc_put_rtab(struct qdisc_rate_table *tab);
 void qdisc_put_stab(struct qdisc_size_table *tab);
 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-		    struct net_device *dev, struct netdev_queue *txq,
-		    spinlock_t *root_lock, bool validate);
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		     struct net_device *dev, struct netdev_queue *txq,
+		     spinlock_t *root_lock, bool validate);
 
 void __qdisc_run(struct Qdisc *q);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ec757f66896a..cbc0a9a00e30 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -164,12 +164,12 @@ trace:
  * only one CPU can execute this function.
  *
  * Returns to the caller:
- *				0  - queue is empty or throttled.
- *				>0 - queue is not empty.
+ *				false  - hardware queue frozen backoff
+ *				true   - feel free to send more pkts
  */
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-		    struct net_device *dev, struct netdev_queue *txq,
-		    spinlock_t *root_lock, bool validate)
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		     struct net_device *dev, struct netdev_queue *txq,
+		     spinlock_t *root_lock, bool validate)
 {
 	int ret = NETDEV_TX_BUSY;
 
@@ -190,28 +190,26 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	} else {
 		if (root_lock)
 			spin_lock(root_lock);
-		return qdisc_qlen(q);
+		return true;
 	}
 
 	if (root_lock)
 		spin_lock(root_lock);
 
-	if (dev_xmit_complete(ret)) {
-		/* Driver sent out skb successfully or skb was consumed */
-		ret = qdisc_qlen(q);
-	} else {
+	if (!dev_xmit_complete(ret)) {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
 			net_warn_ratelimited("BUG %s code %d qlen %d\n",
 					     dev->name, ret, q->q.qlen);
 
-		ret = dev_requeue_skb(skb, q);
+		dev_requeue_skb(skb, q);
+		return false;
 	}
 
 	if (ret && netif_xmit_frozen_or_stopped(txq))
-		ret = 0;
+		return false;
 
-	return ret;
+	return true;
 }
 
 /*
@@ -233,7 +231,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  *				>0 - queue is not empty.
  *
  */
-static inline int qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 {
 	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
@@ -244,7 +242,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 	/* Dequeue packet */
 	skb = dequeue_skb(q, &validate, packets);
 	if (unlikely(!skb))
-		return 0;
+		return false;
 
 	if (!(q->flags & TCQ_F_NOLOCK))
 		root_lock = qdisc_lock(q);
-- 
cgit v1.2.3


From d59f5ffa59d80ff3a2f3b56a9acea4310974c6d1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:55:26 -0800
Subject: net: sched: a dflt qdisc may be used with per cpu stats

Enable dflt qdisc support for per cpu stats. Before this patch a dflt
qdisc was required to use the global statistics qstats and bstats.

This adds a static flags field to qdisc_ops that is propagated
into qdisc->flags in qdisc allocate call. This allows the allocation
block to completely allocate the qdisc object so we don't have
dangling allocations after qdisc init.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  1 +
 net/sched/sch_generic.c   | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index eff31d824861..6fd9a4e70066 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -180,6 +180,7 @@ struct Qdisc_ops {
 	const struct Qdisc_class_ops	*cl_ops;
 	char			id[IFNAMSIZ];
 	int			priv_size;
+	unsigned int		static_flags;
 
 	int 			(*enqueue)(struct sk_buff *skb,
 					   struct Qdisc *sch,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cbc0a9a00e30..80e4ae3ad693 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -632,6 +632,19 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
+	if (ops->static_flags & TCQ_F_CPUSTATS) {
+		sch->cpu_bstats =
+			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!sch->cpu_bstats)
+			goto errout1;
+
+		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!sch->cpu_qstats) {
+			free_percpu(sch->cpu_bstats);
+			goto errout1;
+		}
+	}
+
 	spin_lock_init(&sch->busylock);
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
@@ -641,6 +654,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  dev->qdisc_running_key ?: &qdisc_running_key);
 
 	sch->ops = ops;
+	sch->flags = ops->static_flags;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
@@ -648,6 +662,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	refcount_set(&sch->refcnt, 1);
 
 	return sch;
+errout1:
+	kfree(p);
 errout:
 	return ERR_PTR(err);
 }
-- 
cgit v1.2.3


From a53851e2c3218aa30b77abd6e68cf1c371f15afe Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:55:45 -0800
Subject: net: sched: explicit locking in gso_cpu fallback

This work is preparing the qdisc layer to support egress lockless
qdiscs. If we are running the egress qdisc lockless and we overrun
the netdev, for whatever reason, the netdev returns a busy error
code and the skb is parked on the gso_skb pointer. With many cores
all hitting this case at once it's possible to have multiple
sk_buffs here, so we turn gso_skb into a queue.

This should be the edge case and if we see this frequently then
the netdev/qdisc layer needs to back off.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 20 ++++++-----
 net/sched/sch_generic.c   | 85 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 84 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 6fd9a4e70066..9b9e4feda127 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -88,7 +88,7 @@ struct Qdisc {
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
 	 */
-	struct sk_buff		*gso_skb ____cacheline_aligned_in_smp;
+	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_packed bstats;
 	seqcount_t		running;
@@ -796,26 +796,30 @@ static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
 /* generic pseudo peek method for non-work-conserving qdisc */
 static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
 {
+	struct sk_buff *skb = skb_peek(&sch->gso_skb);
+
 	/* we can reuse ->gso_skb because peek isn't called for root qdiscs */
-	if (!sch->gso_skb) {
-		sch->gso_skb = sch->dequeue(sch);
-		if (sch->gso_skb) {
+	if (!skb) {
+		skb = sch->dequeue(sch);
+
+		if (skb) {
+			__skb_queue_head(&sch->gso_skb, skb);
 			/* it's still part of the queue */
-			qdisc_qstats_backlog_inc(sch, sch->gso_skb);
+			qdisc_qstats_backlog_inc(sch, skb);
 			sch->q.qlen++;
 		}
 	}
 
-	return sch->gso_skb;
+	return skb;
 }
 
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
-	struct sk_buff *skb = sch->gso_skb;
+	struct sk_buff *skb = skb_peek(&sch->gso_skb);
 
 	if (skb) {
-		sch->gso_skb = NULL;
+		skb = __skb_dequeue(&sch->gso_skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 	} else {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80e4ae3ad693..dfeabe319c56 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,10 +45,9 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
-
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	q->gso_skb = skb;
+	__skb_queue_head(&q->gso_skb, skb);
 	q->qstats.requeues++;
 	qdisc_qstats_backlog_inc(q, skb);
 	q->q.qlen++;	/* it's still part of the queue */
@@ -57,6 +56,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 	return 0;
 }
 
+static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
+{
+	spinlock_t *lock = qdisc_lock(q);
+
+	spin_lock(lock);
+	__skb_queue_tail(&q->gso_skb, skb);
+	spin_unlock(lock);
+
+	qdisc_qstats_cpu_requeues_inc(q);
+	qdisc_qstats_cpu_backlog_inc(q, skb);
+	qdisc_qstats_cpu_qlen_inc(q);
+	__netif_schedule(q);
+
+	return 0;
+}
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+	if (q->flags & TCQ_F_NOLOCK)
+		return dev_requeue_skb_locked(skb, q);
+	else
+		return __dev_requeue_skb(skb, q);
+}
+
 static void try_bulk_dequeue_skb(struct Qdisc *q,
 				 struct sk_buff *skb,
 				 const struct netdev_queue *txq,
@@ -112,23 +135,50 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 				   int *packets)
 {
-	struct sk_buff *skb = q->gso_skb;
 	const struct netdev_queue *txq = q->dev_queue;
+	struct sk_buff *skb;
 
 	*packets = 1;
-	if (unlikely(skb)) {
+	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
+		spinlock_t *lock = NULL;
+
+		if (q->flags & TCQ_F_NOLOCK) {
+			lock = qdisc_lock(q);
+			spin_lock(lock);
+		}
+
+		skb = skb_peek(&q->gso_skb);
+
+		/* skb may be null if another cpu pulls gso_skb off in between
+		 * empty check and lock.
+		 */
+		if (!skb) {
+			if (lock)
+				spin_unlock(lock);
+			goto validate;
+		}
+
 		/* skb in gso_skb were already validated */
 		*validate = false;
 		/* check the reason of requeuing without tx lock first */
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->gso_skb = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
-		} else
+			skb = __skb_dequeue(&q->gso_skb);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
+		} else {
 			skb = NULL;
+		}
+		if (lock)
+			spin_unlock(lock);
 		goto trace;
 	}
+validate:
 	*validate = true;
 	skb = q->skb_bad_txq;
 	if (unlikely(skb)) {
@@ -629,6 +679,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 		sch->padded = (char *) sch - (char *) p;
 	}
+	__skb_queue_head_init(&sch->gso_skb);
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
@@ -697,6 +748,7 @@ EXPORT_SYMBOL(qdisc_create_dflt);
 void qdisc_reset(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (ops->reset)
 		ops->reset(qdisc);
@@ -704,10 +756,11 @@ void qdisc_reset(struct Qdisc *qdisc)
 	kfree_skb(qdisc->skb_bad_txq);
 	qdisc->skb_bad_txq = NULL;
 
-	if (qdisc->gso_skb) {
-		kfree_skb_list(qdisc->gso_skb);
-		qdisc->gso_skb = NULL;
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+		__skb_unlink(skb, &qdisc->gso_skb);
+		kfree_skb_list(skb);
 	}
+
 	qdisc->q.qlen = 0;
 	qdisc->qstats.backlog = 0;
 }
@@ -726,6 +779,7 @@ static void qdisc_free(struct Qdisc *qdisc)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !refcount_dec_and_test(&qdisc->refcnt))
@@ -745,7 +799,11 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
-	kfree_skb_list(qdisc->gso_skb);
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+		__skb_unlink(skb, &qdisc->gso_skb);
+		kfree_skb_list(skb);
+	}
+
 	kfree_skb(qdisc->skb_bad_txq);
 	qdisc_free(qdisc);
 }
@@ -973,6 +1031,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 
 	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
+	__skb_queue_head_init(&qdisc->gso_skb);
 }
 
 void dev_init_scheduler(struct net_device *dev)
-- 
cgit v1.2.3


From 7bbde83b1860c28a1cc35516352c4e7e5172c29a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:56:04 -0800
Subject: net: sched: drop qdisc_reset from dev_graft_qdisc

In qdisc_graft_qdisc a "new" qdisc is attached and the 'qdisc_destroy'
operation is called on the old qdisc. The destroy operation will wait
an rcu grace period and call qdisc_rcu_free(), at which point
gso_cpu_skb is free'd along with all stats, so there is no need to
zero stats and gso_cpu_skb from the graft operation itself.

Further after dropping the qdisc locks we can not continue to call
qdisc_reset before waiting an rcu grace period so that the qdisc is
detached from all cpus. By removing the qdisc_reset() here we get
the correct property of waiting an rcu grace period and letting the
qdisc_destroy operation clean up the qdisc correctly.

Note, a refcnt greater than 1 would cause the destroy operation to
be aborted; however, if this ever happened the reference to the qdisc
would be lost and we would have a memory leak.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dfeabe319c56..482ba2234470 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -819,10 +819,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 	root_lock = qdisc_lock(oqdisc);
 	spin_lock_bh(root_lock);
 
-	/* Prune old scheduler */
-	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
-		qdisc_reset(oqdisc);
-
 	/* ... and graft new one */
 	if (qdisc == NULL)
 		qdisc = &noop_qdisc;
@@ -977,6 +973,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 	return false;
 }
 
+static void dev_qdisc_reset(struct net_device *dev,
+			    struct netdev_queue *dev_queue,
+			    void *none)
+{
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+	if (qdisc)
+		qdisc_reset(qdisc);
+}
+
 /**
  * 	dev_deactivate_many - deactivate transmissions on several devices
  * 	@head: list of devices to deactivate
@@ -987,7 +993,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate_many(struct list_head *head)
 {
 	struct net_device *dev;
-	bool sync_needed = false;
 
 	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -997,20 +1002,25 @@ void dev_deactivate_many(struct list_head *head)
 					     &noop_qdisc);
 
 		dev_watchdog_down(dev);
-		sync_needed |= !dev->dismantle;
 	}
 
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
 	 * This is avoided if all devices are in dismantle phase :
 	 * Caller will call synchronize_net() for us
 	 */
-	if (sync_needed)
-		synchronize_net();
+	synchronize_net();
 
 	/* Wait for outstanding qdisc_run calls. */
-	list_for_each_entry(dev, head, close_list)
+	list_for_each_entry(dev, head, close_list) {
 		while (some_qdisc_is_busy(dev))
 			yield();
+		/* The new qdisc is assigned at this point so we can safely
+		 * unwind stale skb lists and qdisc statistics
+		 */
+		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+		if (dev_ingress_queue(dev))
+			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+	}
 }
 
 void dev_deactivate(struct net_device *dev)
-- 
cgit v1.2.3


From 70e57d5e3f8ec7c482b92ef43e543d87134689ab Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:56:23 -0800
Subject: net: sched: use skb list for skb_bad_tx

Similar to how gso is handled, use an skb list for skb_bad_txq. This
is required with lockless qdiscs because we may have multiple cores
attempting to push skbs into skb_bad_txq concurrently.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |   2 +-
 net/sched/sch_generic.c   | 106 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 87 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9b9e4feda127..da2528036e2e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -95,7 +95,7 @@ struct Qdisc {
 	struct gnet_stats_queue	qstats;
 	unsigned long		state;
 	struct Qdisc            *next_sched;
-	struct sk_buff		*skb_bad_txq;
+	struct sk_buff_head	skb_bad_txq;
 	int			padded;
 	refcount_t		refcnt;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 482ba2234470..84cef0570862 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,6 +45,68 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+
+static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
+{
+	const struct netdev_queue *txq = q->dev_queue;
+	spinlock_t *lock = NULL;
+	struct sk_buff *skb;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	skb = skb_peek(&q->skb_bad_txq);
+	if (skb) {
+		/* check the reason of requeuing without tx lock first */
+		txq = skb_get_tx_queue(txq->dev, skb);
+		if (!netif_xmit_frozen_or_stopped(txq)) {
+			skb = __skb_dequeue(&q->skb_bad_txq);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
+		} else {
+			skb = NULL;
+		}
+	}
+
+	if (lock)
+		spin_unlock(lock);
+
+	return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
+{
+	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
+
+	if (unlikely(skb))
+		skb = __skb_dequeue_bad_txq(q);
+
+	return skb;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
+					     struct sk_buff *skb)
+{
+	spinlock_t *lock = NULL;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	__skb_queue_tail(&q->skb_bad_txq, skb);
+
+	if (lock)
+		spin_unlock(lock);
+}
+
 static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
 	__skb_queue_head(&q->gso_skb, skb);
@@ -117,9 +179,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 		if (!nskb)
 			break;
 		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
-			q->skb_bad_txq = nskb;
-			qdisc_qstats_backlog_inc(q, nskb);
-			q->q.qlen++;
+			qdisc_enqueue_skb_bad_txq(q, nskb);
+
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_inc(q, nskb);
+				qdisc_qstats_cpu_qlen_inc(q);
+			} else {
+				qdisc_qstats_backlog_inc(q, nskb);
+				q->q.qlen++;
+			}
 			break;
 		}
 		skb->next = nskb;
@@ -180,19 +248,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	}
 validate:
 	*validate = true;
-	skb = q->skb_bad_txq;
-	if (unlikely(skb)) {
-		/* check the reason of requeuing without tx lock first */
-		txq = skb_get_tx_queue(txq->dev, skb);
-		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->skb_bad_txq = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
-			goto bulk;
-		}
-		skb = NULL;
-		goto trace;
-	}
+	skb = qdisc_dequeue_skb_bad_txq(q);
+	if (unlikely(skb))
+		goto bulk;
 	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
 	    !netif_xmit_frozen_or_stopped(txq))
 		skb = q->dequeue(q);
@@ -680,6 +738,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch->padded = (char *) sch - (char *) p;
 	}
 	__skb_queue_head_init(&sch->gso_skb);
+	__skb_queue_head_init(&sch->skb_bad_txq);
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
@@ -753,14 +812,16 @@ void qdisc_reset(struct Qdisc *qdisc)
 	if (ops->reset)
 		ops->reset(qdisc);
 
-	kfree_skb(qdisc->skb_bad_txq);
-	qdisc->skb_bad_txq = NULL;
-
 	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
 		__skb_unlink(skb, &qdisc->gso_skb);
 		kfree_skb_list(skb);
 	}
 
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+		__skb_unlink(skb, &qdisc->skb_bad_txq);
+		kfree_skb_list(skb);
+	}
+
 	qdisc->q.qlen = 0;
 	qdisc->qstats.backlog = 0;
 }
@@ -804,7 +865,11 @@ void qdisc_destroy(struct Qdisc *qdisc)
 		kfree_skb_list(skb);
 	}
 
-	kfree_skb(qdisc->skb_bad_txq);
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+		__skb_unlink(skb, &qdisc->skb_bad_txq);
+		kfree_skb_list(skb);
+	}
+
 	qdisc_free(qdisc);
 }
 EXPORT_SYMBOL(qdisc_destroy);
@@ -1042,6 +1107,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
 	__skb_queue_head_init(&qdisc->gso_skb);
+	__skb_queue_head_init(&qdisc->skb_bad_txq);
 }
 
 void dev_init_scheduler(struct net_device *dev)
-- 
cgit v1.2.3


From fd8e8d1a775d82f04215f4b884a1962774805346 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:56:42 -0800
Subject: net: sched: check for frozen queue before skb_bad_txq check

I can not think of any reason to pull the bad txq skb off the qdisc if
the txq we plan to send this on is still frozen. So check for frozen
queue first and abort before dequeuing either skb_bad_txq skb or
normal qdisc dequeue() skb.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 84cef0570862..5ff93c2b5b99 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -204,7 +204,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 				   int *packets)
 {
 	const struct netdev_queue *txq = q->dev_queue;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 
 	*packets = 1;
 	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
@@ -248,12 +248,15 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	}
 validate:
 	*validate = true;
+
+	if ((q->flags & TCQ_F_ONETXQUEUE) &&
+	    netif_xmit_frozen_or_stopped(txq))
+		return skb;
+
 	skb = qdisc_dequeue_skb_bad_txq(q);
 	if (unlikely(skb))
 		goto bulk;
-	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
-	    !netif_xmit_frozen_or_stopped(txq))
-		skb = q->dequeue(q);
+	skb = q->dequeue(q);
 	if (skb) {
 bulk:
 		if (qdisc_may_bulk(q))
-- 
cgit v1.2.3


From 7e66016f2c65bfc1181f42274fcb7f1183ab1bb5 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:57:00 -0800
Subject: net: sched: helpers to sum qlen and qlen for per cpu logic

Add qdisc qlen helper routines for lockless qdiscs to use.

The qdisc qlen is no longer used in the hotpath but it is reported
via stats query on the qdisc so it still needs to be tracked. This
adds the per cpu operations needed along with a helper to return
the summation of per cpu stats.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 20 ++++++++++++++++++++
 net/sched/sch_api.c       |  3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index da2528036e2e..8f8c0afe529b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -292,11 +292,31 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 	BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_cpu(const struct Qdisc *q)
+{
+	return this_cpu_ptr(q->cpu_qstats)->qlen;
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
 	return q->q.qlen;
 }
 
+static inline int qdisc_qlen_sum(const struct Qdisc *q)
+{
+	__u32 qlen = 0;
+	int i;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		for_each_possible_cpu(i)
+			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
+	} else {
+		qlen = q->q.qlen;
+	}
+
+	return qlen;
+}
+
 static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
 {
 	return (struct qdisc_skb_cb *)skb->cb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a48ca41b7ecf..c669bb3b89b2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -797,7 +797,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
-	qlen = q->q.qlen;
+
+	qlen = qdisc_qlen_sum(q);
 
 	stab = rtnl_dereference(q->stab);
 	if (stab && qdisc_dump_stab(skb, stab) < 0)
-- 
cgit v1.2.3


From b01ac095c740fc21f4bb21abe900b0f5b3042cf9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:57:20 -0800
Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

The sch_mq qdisc creates a sub-qdisc per tx queue which are then
called independently for enqueue and dequeue operations. However
statistics are aggregated and pushed up to the "master" qdisc.

This patch adds support for any of the sub-qdiscs to be per cpu
statistic qdiscs. To handle this case add a check when calculating
stats and aggregate the per cpu stats if needed.

Also exports __gnet_stats_copy_queue() to use as a helper function.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gen_stats.h |  3 +++
 net/core/gen_stats.c    |  9 +++++----
 net/sched/sch_mq.c      | 25 ++++++++++++++++++-------
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 304f7aa9cc01..0304ba2ae353 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -49,6 +49,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d,
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+			     const struct gnet_stats_queue __percpu *cpu_q,
+			     const struct gnet_stats_queue *q, __u32 qlen);
 int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 
 int gnet_stats_finish_copy(struct gnet_dump *d);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 87f28557b329..b2b2323bdc84 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
 	}
 }
 
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
-				    const struct gnet_stats_queue __percpu *cpu,
-				    const struct gnet_stats_queue *q,
-				    __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+			     const struct gnet_stats_queue __percpu *cpu,
+			     const struct gnet_stats_queue *q,
+			     __u32 qlen)
 {
 	if (cpu) {
 		__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
 
 	qstats->qlen = qlen;
 }
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 213b586a06a0..bc59f05e1a0f 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/sch_generic.h>
 
 struct mq_sched {
 	struct Qdisc		**qdiscs;
@@ -103,15 +104,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+		struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+		__u32 qlen = 0;
+
 		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
-		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
-		sch->qstats.backlog	+= qdisc->qstats.backlog;
-		sch->qstats.drops	+= qdisc->qstats.drops;
-		sch->qstats.requeues	+= qdisc->qstats.requeues;
-		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+
+		if (qdisc_is_percpu_stats(qdisc)) {
+			cpu_bstats = qdisc->cpu_bstats;
+			cpu_qstats = qdisc->cpu_qstats;
+		}
+
+		qlen = qdisc_qlen_sum(qdisc);
+
+		__gnet_stats_copy_basic(NULL, &sch->bstats,
+					cpu_bstats, &qdisc->bstats);
+		__gnet_stats_copy_queue(&sch->qstats,
+					cpu_qstats, &qdisc->qstats, qlen);
+
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
 	return 0;
-- 
cgit v1.2.3


From ce679e8df7ed2a92660556d100cf370fe22b4eab Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:57:39 -0800
Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio

The sch_mqprio qdisc creates one sub-qdisc per tx queue; each of these
is then called independently for enqueue and dequeue operations.
However, statistics are aggregated and pushed up to the "master" qdisc.

This patch allows any of the sub-qdiscs to be a qdisc that keeps per-cpu
statistics. To handle this case, add a check when calculating stats and
aggregate the per-cpu stats if needed.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mq.c     | 35 +++++++++++++++----------
 net/sched/sch_mqprio.c | 69 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 69 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index bc59f05e1a0f..8cbb5c829d59 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -98,33 +98,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct net_device *dev = qdisc_dev(sch);
 	struct Qdisc *qdisc;
 	unsigned int ntx;
+	__u32 qlen = 0;
 
 	sch->q.qlen = 0;
 	memset(&sch->bstats, 0, sizeof(sch->bstats));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
+	/* MQ supports lockless qdiscs. However, statistics accounting needs
+	 * to account for all, none, or a mix of locked and unlocked child
+	 * qdiscs. Percpu stats are added to counters in-band and locking
+	 * qdisc totals are added at end.
+	 */
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
-		struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
-		struct gnet_stats_queue __percpu *cpu_qstats = NULL;
-		__u32 qlen = 0;
-
 		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
 
 		if (qdisc_is_percpu_stats(qdisc)) {
-			cpu_bstats = qdisc->cpu_bstats;
-			cpu_qstats = qdisc->cpu_qstats;
+			qlen = qdisc_qlen_sum(qdisc);
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						qdisc->cpu_bstats,
+						&qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						qdisc->cpu_qstats,
+						&qdisc->qstats, qlen);
+		} else {
+			sch->q.qlen		+= qdisc->q.qlen;
+			sch->bstats.bytes	+= qdisc->bstats.bytes;
+			sch->bstats.packets	+= qdisc->bstats.packets;
+			sch->qstats.backlog	+= qdisc->qstats.backlog;
+			sch->qstats.drops	+= qdisc->qstats.drops;
+			sch->qstats.requeues	+= qdisc->qstats.requeues;
+			sch->qstats.overlimits	+= qdisc->qstats.overlimits;
 		}
 
-		qlen = qdisc_qlen_sum(qdisc);
-
-		__gnet_stats_copy_basic(NULL, &sch->bstats,
-					cpu_bstats, &qdisc->bstats);
-		__gnet_stats_copy_queue(&sch->qstats,
-					cpu_qstats, &qdisc->qstats, qlen);
-
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
+
 	return 0;
 }
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b85885a9d8a1..8622745f3cd9 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -388,22 +388,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
 	struct tc_mqprio_qopt opt = { 0 };
 	struct Qdisc *qdisc;
-	unsigned int i;
+	unsigned int ntx, tc;
 
 	sch->q.qlen = 0;
 	memset(&sch->bstats, 0, sizeof(sch->bstats));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
+	/* MQ supports lockless qdiscs. However, statistics accounting needs
+	 * to account for all, none, or a mix of locked and unlocked child
+	 * qdiscs. Percpu stats are added to counters in-band and locking
+	 * qdisc totals are added at end.
+	 */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
-		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
-		sch->qstats.backlog	+= qdisc->qstats.backlog;
-		sch->qstats.drops	+= qdisc->qstats.drops;
-		sch->qstats.requeues	+= qdisc->qstats.requeues;
-		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+
+		if (qdisc_is_percpu_stats(qdisc)) {
+			__u32 qlen = qdisc_qlen_sum(qdisc);
+
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						qdisc->cpu_bstats,
+						&qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						qdisc->cpu_qstats,
+						&qdisc->qstats, qlen);
+		} else {
+			sch->q.qlen		+= qdisc->q.qlen;
+			sch->bstats.bytes	+= qdisc->bstats.bytes;
+			sch->bstats.packets	+= qdisc->bstats.packets;
+			sch->qstats.backlog	+= qdisc->qstats.backlog;
+			sch->qstats.drops	+= qdisc->qstats.drops;
+			sch->qstats.requeues	+= qdisc->qstats.requeues;
+			sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		}
+
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
 
@@ -411,9 +429,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
 	opt.hw = priv->hw_offload;
 
-	for (i = 0; i < netdev_get_num_tc(dev); i++) {
-		opt.count[i] = dev->tc_to_txq[i].count;
-		opt.offset[i] = dev->tc_to_txq[i].offset;
+	for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
+		opt.count[tc] = dev->tc_to_txq[tc].count;
+		opt.offset[tc] = dev->tc_to_txq[tc].offset;
 	}
 
 	if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
@@ -495,7 +513,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	if (cl >= TC_H_MIN_PRIORITY) {
 		int i;
 		__u32 qlen = 0;
-		struct Qdisc *qdisc;
 		struct gnet_stats_queue qstats = {0};
 		struct gnet_stats_basic_packed bstats = {0};
 		struct net_device *dev = qdisc_dev(sch);
@@ -511,18 +528,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 		for (i = tc.offset; i < tc.offset + tc.count; i++) {
 			struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+			struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
+			struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+			struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 
-			qdisc = rtnl_dereference(q->qdisc);
 			spin_lock_bh(qdisc_lock(qdisc));
-			qlen		  += qdisc->q.qlen;
-			bstats.bytes      += qdisc->bstats.bytes;
-			bstats.packets    += qdisc->bstats.packets;
-			qstats.backlog    += qdisc->qstats.backlog;
-			qstats.drops      += qdisc->qstats.drops;
-			qstats.requeues   += qdisc->qstats.requeues;
-			qstats.overlimits += qdisc->qstats.overlimits;
+			if (qdisc_is_percpu_stats(qdisc)) {
+				cpu_bstats = qdisc->cpu_bstats;
+				cpu_qstats = qdisc->cpu_qstats;
+			}
+
+			qlen = qdisc_qlen_sum(qdisc);
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						cpu_bstats, &qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						cpu_qstats,
+						&qdisc->qstats,
+						qlen);
 			spin_unlock_bh(qdisc_lock(qdisc));
 		}
+
 		/* Reclaim root sleeping lock before completing stats */
 		if (d->lock)
 			spin_lock_bh(d->lock);
-- 
cgit v1.2.3


From c5ad119fb6c09b0297446be05bd66602fa564758 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:58:19 -0800
Subject: net: sched: pfifo_fast use skb_array

This converts the pfifo_fast qdisc to use the skb_array data structure
and set the lockless qdisc bit. pfifo_fast is the first qdisc to support
the lockless bit that can be a child of a qdisc requiring locking. So
we add logic to clear the lock bit on initialization in these cases when
the qdisc graft operation occurs.

This also removes the logic used to pick the next band to dequeue from
and instead just checks a per priority array for packets from top priority
to lowest. This might need to be a bit more clever but seems to work
for now.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c     |   5 ++
 net/sched/sch_generic.c | 140 ++++++++++++++++++++++++++++++------------------
 2 files changed, 92 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c669bb3b89b2..a904276b657d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -955,6 +955,11 @@ skip:
 	} else {
 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
+		/* Only support running class lockless if parent is lockless */
+		if (new && (new->flags & TCQ_F_NOLOCK) &&
+		    parent && !(parent->flags & TCQ_F_NOLOCK))
+			new->flags &= ~TCQ_F_NOLOCK;
+
 		err = -EOPNOTSUPP;
 		if (cops && cops->graft) {
 			unsigned long cl = cops->find(parent, classid);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5ff93c2b5b99..ff6a5acf6ab0 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/if_vlan.h>
+#include <linux/skb_array.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -578,93 +579,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
 
 /*
  * Private data for a pfifo_fast scheduler containing:
- * 	- queues for the three band
- * 	- bitmap indicating which of the bands contain skbs
+ *	- rings for priority bands
  */
 struct pfifo_fast_priv {
-	u32 bitmap;
-	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
+	struct skb_array q[PFIFO_FAST_BANDS];
 };
 
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * 	bitmap=0 means there are no skbs on any band.
- * 	bitmap=1 means there is an skb on band 0.
- *	bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
-					     int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+					  int band)
 {
-	return priv->q + band;
+	return &priv->q[band];
 }
 
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
 			      struct sk_buff **to_free)
 {
-	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
-		int band = prio2band[skb->priority & TC_PRIO_MAX];
-		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-		struct qdisc_skb_head *list = band2list(priv, band);
-
-		priv->bitmap |= (1 << band);
-		qdisc->q.qlen++;
-		return __qdisc_enqueue_tail(skb, qdisc, list);
-	}
+	int band = prio2band[skb->priority & TC_PRIO_MAX];
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	struct skb_array *q = band2list(priv, band);
+	int err;
 
-	return qdisc_drop(skb, qdisc, to_free);
+	err = skb_array_produce(q, skb);
+
+	if (unlikely(err))
+		return qdisc_drop_cpu(skb, qdisc, to_free);
+
+	qdisc_qstats_cpu_qlen_inc(qdisc);
+	qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+	return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
-
-	if (likely(band >= 0)) {
-		struct qdisc_skb_head *qh = band2list(priv, band);
-		struct sk_buff *skb = __qdisc_dequeue_head(qh);
+	struct sk_buff *skb = NULL;
+	int band;
 
-		if (likely(skb != NULL)) {
-			qdisc_qstats_backlog_dec(qdisc, skb);
-			qdisc_bstats_update(qdisc, skb);
-		}
+	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+		struct skb_array *q = band2list(priv, band);
 
-		qdisc->q.qlen--;
-		if (qh->qlen == 0)
-			priv->bitmap &= ~(1 << band);
+		if (__skb_array_empty(q))
+			continue;
 
-		return skb;
+		skb = skb_array_consume_bh(q);
+	}
+	if (likely(skb)) {
+		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+		qdisc_bstats_cpu_update(qdisc, skb);
+		qdisc_qstats_cpu_qlen_dec(qdisc);
 	}
 
-	return NULL;
+	return skb;
 }
 
 static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
+	struct sk_buff *skb = NULL;
+	int band;
 
-	if (band >= 0) {
-		struct qdisc_skb_head *qh = band2list(priv, band);
+	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+		struct skb_array *q = band2list(priv, band);
 
-		return qh->head;
+		skb = __skb_array_peek(q);
 	}
 
-	return NULL;
+	return skb;
 }
 
 static void pfifo_fast_reset(struct Qdisc *qdisc)
 {
-	int prio;
+	int i, band;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		__qdisc_reset_queue(band2list(priv, prio));
+	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+		struct skb_array *q = band2list(priv, band);
+		struct sk_buff *skb;
 
-	priv->bitmap = 0;
-	qdisc->qstats.backlog = 0;
-	qdisc->q.qlen = 0;
+		while ((skb = skb_array_consume_bh(q)) != NULL)
+			kfree_skb(skb);
+	}
+
+	for_each_possible_cpu(i) {
+		struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+
+		q->backlog = 0;
+		q->qlen = 0;
+	}
 }
 
 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
@@ -682,17 +683,48 @@ nla_put_failure:
 
 static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 {
-	int prio;
+	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int prio;
+
+	/* guard against zero length rings */
+	if (!qlen)
+		return -EINVAL;
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		qdisc_skb_head_init(band2list(priv, prio));
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+		int err;
+
+		err = skb_array_init(q, qlen, GFP_KERNEL);
+		if (err)
+			return -ENOMEM;
+	}
 
 	/* Can by-pass the queue discipline */
 	qdisc->flags |= TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
+/* Free the per-band rings set up by pfifo_fast_init(). */
+static void pfifo_fast_destroy(struct Qdisc *sch)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+
+		/* NULL ring is possible if destroy path is due to a failed
+		 * skb_array_init() in pfifo_fast_init() case. Test the
+		 * pointer itself; its address is never NULL.
+		 */
+		if (!q->ring.queue)
+			continue;
+		/* No kfree_skb needed: pfifo_fast_reset() already did it. */
+		ptr_ring_cleanup(&q->ring, NULL);
+	}
+}
+
 struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
@@ -700,9 +732,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dequeue	=	pfifo_fast_dequeue,
 	.peek		=	pfifo_fast_peek,
 	.init		=	pfifo_fast_init,
+	.destroy	=	pfifo_fast_destroy,
 	.reset		=	pfifo_fast_reset,
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
+	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
 };
 EXPORT_SYMBOL(pfifo_fast_ops);
 
-- 
cgit v1.2.3


From df45bf84e4f5a48f23d4b1a07d21d566e8b587b2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 8 Dec 2017 19:27:27 +0100
Subject: net: sched: fix use-after-free in tcf_block_put_ext

Since the block is freed with last chain being put, once we reach the
end of iteration of list_for_each_entry_safe, the block may be
already freed. I'm hitting this only by creating and deleting clsact:

[  202.171952] ==================================================================
[  202.180182] BUG: KASAN: use-after-free in tcf_block_put_ext+0x240/0x390
[  202.187590] Read of size 8 at addr ffff880225539a80 by task tc/796
[  202.194508]
[  202.196185] CPU: 0 PID: 796 Comm: tc Not tainted 4.15.0-rc2jiri+ #5
[  202.203200] Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016
[  202.213613] Call Trace:
[  202.216369]  dump_stack+0xda/0x169
[  202.220192]  ? dma_virt_map_sg+0x147/0x147
[  202.224790]  ? show_regs_print_info+0x54/0x54
[  202.229691]  ? tcf_chain_destroy+0x1dc/0x250
[  202.234494]  print_address_description+0x83/0x3d0
[  202.239781]  ? tcf_block_put_ext+0x240/0x390
[  202.244575]  kasan_report+0x1ba/0x460
[  202.248707]  ? tcf_block_put_ext+0x240/0x390
[  202.253518]  tcf_block_put_ext+0x240/0x390
[  202.258117]  ? tcf_chain_flush+0x290/0x290
[  202.262708]  ? qdisc_hash_del+0x82/0x1a0
[  202.267111]  ? qdisc_hash_add+0x50/0x50
[  202.271411]  ? __lock_is_held+0x5f/0x1a0
[  202.275843]  clsact_destroy+0x3d/0x80 [sch_ingress]
[  202.281323]  qdisc_destroy+0xcb/0x240
[  202.285445]  qdisc_graft+0x216/0x7b0
[  202.289497]  tc_get_qdisc+0x260/0x560

Fix this by holding the block also by chain 0 and put chain 0
explicitly, out of the list_for_each_entry_safe loop at the very
end of tcf_block_put_ext.

Fixes: efbf78973978 ("net_sched: get rid of rcu_barrier() in tcf_block_put_ext()")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d51051dd8f7d..5b9b8a61e8c4 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -342,23 +342,24 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 {
 	struct tcf_chain *chain, *tmp;
 
-	/* Hold a refcnt for all chains, except 0, so that they don't disappear
+	/* Hold a refcnt for all chains, so that they don't disappear
 	 * while we are iterating.
 	 */
 	list_for_each_entry(chain, &block->chain_list, list)
-		if (chain->index)
-			tcf_chain_hold(chain);
+		tcf_chain_hold(chain);
 
 	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	/* At this point, all the chains should have refcnt >= 1. Block will be
-	 * freed after all chains are gone.
-	 */
+	/* At this point, all the chains should have refcnt >= 1. */
 	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
 		tcf_chain_put(chain);
+
+	/* Finally, put chain 0 and allow block to be freed. */
+	chain = list_first_entry(&block->chain_list, struct tcf_chain, list);
+	tcf_chain_put(chain);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
-- 
cgit v1.2.3


From 46e6b992c2502b094e61da6994f1363f3b7c1413 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 7 Dec 2017 15:40:19 -0800
Subject: rtnetlink: allow GSO maximums to be set on device creation

Netlink already allows changing a device's GSO sizes with the
ip set command. What is missing is the ability to override the
GSO settings at device creation.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a4faefd65006..412ebf0b09c6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1637,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SEGS]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SIZE]	= { .type = NLA_U32 },
 	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2287,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb,
 		}
 	}
 
+	if (tb[IFLA_GSO_MAX_SIZE]) {
+		u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+		if (max_size > GSO_MAX_SIZE) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_size ^ max_size) {
+			netif_set_gso_max_size(dev, max_size);
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
+	if (tb[IFLA_GSO_MAX_SEGS]) {
+		u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+		if (max_segs > GSO_MAX_SEGS) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_segs ^ max_segs) {
+			dev->gso_max_segs = max_segs;
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
 	if (tb[IFLA_OPERSTATE])
 		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 
@@ -2651,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net,
 		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
 	if (tb[IFLA_GROUP])
 		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+	if (tb[IFLA_GSO_MAX_SIZE])
+		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+	if (tb[IFLA_GSO_MAX_SEGS])
+		dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
 	return dev;
 }
-- 
cgit v1.2.3


From 02049ce27ef9d5ec0d74023a1487eb5c9bb38143 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Tue, 17 Oct 2017 18:14:50 -0500
Subject: mac80211: mark expected switch fall-throughs

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Notice that in some cases I replaced "fall through on else" and
"otherwise fall through" comments with just a "fall through" comment,
which is what GCC is expecting to find.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c        | 3 +++
 net/mac80211/ht.c         | 1 +
 net/mac80211/iface.c      | 2 +-
 net/mac80211/mesh.c       | 2 ++
 net/mac80211/mesh_hwmp.c  | 1 +
 net/mac80211/mesh_plink.c | 2 +-
 net/mac80211/mlme.c       | 1 +
 net/mac80211/offchannel.c | 4 ++--
 net/mac80211/tdls.c       | 1 +
 net/mac80211/wme.c        | 1 +
 10 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fb15d3b97cb2..b77ee342b5f8 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -573,10 +573,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
 		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
 			     offsetof(typeof(kseq), aes_cmac));
+		/* fall through */
 	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
 	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
 		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
 			     offsetof(typeof(kseq), aes_gmac));
+		/* fall through */
 	case WLAN_CIPHER_SUITE_GCMP:
 	case WLAN_CIPHER_SUITE_GCMP_256:
 		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
@@ -2205,6 +2207,7 @@ static int ieee80211_scan(struct wiphy *wiphy,
 		 * for now fall through to allow scanning only when
 		 * beaconing hasn't been configured yet
 		 */
+		/* fall through */
 	case NL80211_IFTYPE_AP:
 		/*
 		 * If the scan has been forced (and the driver supports
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 167f83b853e6..cb0860d751fd 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -493,6 +493,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 	case IEEE80211_SMPS_AUTOMATIC:
 	case IEEE80211_SMPS_NUM_MODES:
 		WARN_ON(1);
+		/* fall through */
 	case IEEE80211_SMPS_OFF:
 		action_frame->u.action.u.ht_smps.smps_control =
 				WLAN_HT_SMPS_CONTROL_DISABLED;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 13b16f90e1cf..435e7358004c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1633,7 +1633,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
 				goto out_unlock;
 			}
 		}
-		/* otherwise fall through */
+		/* fall through */
 	default:
 		/* assign a new address if possible -- try n_addresses first */
 		for (i = 0; i < local->hw.wiphy->n_addresses; i++) {
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 5e27364e10ac..73ac607beb5d 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -989,8 +989,10 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 	switch (sdata->vif.bss_conf.chandef.width) {
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		sta_flags |= IEEE80211_STA_DISABLE_HT;
+		/* fall through */
 	case NL80211_CHAN_WIDTH_20:
 		sta_flags |= IEEE80211_STA_DISABLE_40MHZ;
+		/* fall through */
 	case NL80211_CHAN_WIDTH_40:
 		sta_flags |= IEEE80211_STA_DISABLE_VHT;
 		break;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 4394463a0c2e..35ad3983ae4b 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1250,6 +1250,7 @@ void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata)
 		break;
 	case IEEE80211_PROACTIVE_PREQ_WITH_PREP:
 		flags |= IEEE80211_PREQ_PROACTIVE_PREP_FLAG;
+		/* fall through */
 	case IEEE80211_PROACTIVE_PREQ_NO_PREP:
 		interval = ifmsh->mshcfg.dot11MeshHWMPactivePathToRootTimeout;
 		target_flags |= IEEE80211_PREQ_TO_FLAG |
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index e2d00cce3c17..0f6c9ca59062 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -672,7 +672,7 @@ void mesh_plink_timer(struct timer_list *t)
 			break;
 		}
 		reason = WLAN_REASON_MESH_MAX_RETRIES;
-		/* fall through on else */
+		/* fall through */
 	case NL80211_PLINK_CNF_RCVD:
 		/* confirm timer */
 		if (!reason)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c244691deab9..fa0f96c74898 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -473,6 +473,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata,
 	case IEEE80211_SMPS_AUTOMATIC:
 	case IEEE80211_SMPS_NUM_MODES:
 		WARN_ON(1);
+		/* fall through */
 	case IEEE80211_SMPS_OFF:
 		cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
 			IEEE80211_HT_CAP_SM_PS_SHIFT;
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index faf4f6055000..f1d40b6645ff 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -801,14 +801,14 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	case NL80211_IFTYPE_ADHOC:
 		if (!sdata->vif.bss_conf.ibss_joined)
 			need_offchan = true;
-		/* fall through */
 #ifdef CONFIG_MAC80211_MESH
+		/* fall through */
 	case NL80211_IFTYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif) &&
 		    !sdata->u.mesh.mesh_id_len)
 			need_offchan = true;
-		/* fall through */
 #endif
+		/* fall through */
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_P2P_GO:
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 91093d4a2f84..96d4fb998e33 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -236,6 +236,7 @@ static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac)
 	switch (ac) {
 	default:
 		WARN_ON_ONCE(1);
+		/* fall through */
 	case 0:
 		return IEEE80211_AC_BE;
 	case 1:
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 3e3d3014e9ab..5f7c96368b11 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -165,6 +165,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
 			qos = sta->sta.wme;
 			break;
 		}
+		/* fall through */
 	case NL80211_IFTYPE_AP:
 		ra = skb->data;
 		break;
-- 
cgit v1.2.3


From d559e303b17826a6e4879fc6f12d929562c4061c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 18 Oct 2017 13:44:23 +0200
Subject: mac80211: avoid looking up tid_tx/tid_rx from timers

There's no need to look up the data structures again now that
we get them directly from from_timer(), so just avoid that.
The struct has to be valid anyway, otherwise the timer object
itself would no longer be valid, and we can't have a different
version of the struct since only a single session per TID is
permitted.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/agg-rx.c | 17 +++--------------
 net/mac80211/agg-tx.c | 31 ++++++++-----------------------
 2 files changed, 11 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index d444752dbf40..35e94483fb8c 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -153,27 +153,16 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session);
  */
 static void sta_rx_agg_session_timer_expired(struct timer_list *t)
 {
-	struct tid_ampdu_rx *tid_rx_timer =
-		from_timer(tid_rx_timer, t, session_timer);
-	struct sta_info *sta = tid_rx_timer->sta;
-	u8 tid = tid_rx_timer->tid;
-	struct tid_ampdu_rx *tid_rx;
+	struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, session_timer);
+	struct sta_info *sta = tid_rx->sta;
+	u8 tid = tid_rx->tid;
 	unsigned long timeout;
 
-	rcu_read_lock();
-	tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
-	if (!tid_rx) {
-		rcu_read_unlock();
-		return;
-	}
-
 	timeout = tid_rx->last_rx + TU_TO_JIFFIES(tid_rx->timeout);
 	if (time_is_after_jiffies(timeout)) {
 		mod_timer(&tid_rx->session_timer, timeout);
-		rcu_read_unlock();
 		return;
 	}
-	rcu_read_unlock();
 
 	ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n",
 	       sta->sta.addr, tid);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 5f8ab5be369f..a04df962129c 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -429,18 +429,12 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
  */
 static void sta_addba_resp_timer_expired(struct timer_list *t)
 {
-	struct tid_ampdu_tx *tid_tx_timer =
-		from_timer(tid_tx_timer, t, addba_resp_timer);
-	struct sta_info *sta = tid_tx_timer->sta;
-	u8 tid = tid_tx_timer->tid;
-	struct tid_ampdu_tx *tid_tx;
+	struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, addba_resp_timer);
+	struct sta_info *sta = tid_tx->sta;
+	u8 tid = tid_tx->tid;
 
 	/* check if the TID waits for addBA response */
-	rcu_read_lock();
-	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
-	if (!tid_tx ||
-	    test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) {
-		rcu_read_unlock();
+	if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) {
 		ht_dbg(sta->sdata,
 		       "timer expired on %pM tid %d not expecting addBA response\n",
 		       sta->sta.addr, tid);
@@ -451,7 +445,6 @@ static void sta_addba_resp_timer_expired(struct timer_list *t)
 	       sta->sta.addr, tid);
 
 	ieee80211_stop_tx_ba_session(&sta->sta, tid);
-	rcu_read_unlock();
 }
 
 void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
@@ -529,29 +522,21 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
  */
 static void sta_tx_agg_session_timer_expired(struct timer_list *t)
 {
-	struct tid_ampdu_tx *tid_tx_timer =
-		from_timer(tid_tx_timer, t, session_timer);
-	struct sta_info *sta = tid_tx_timer->sta;
-	u8 tid = tid_tx_timer->tid;
-	struct tid_ampdu_tx *tid_tx;
+	struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, session_timer);
+	struct sta_info *sta = tid_tx->sta;
+	u8 tid = tid_tx->tid;
 	unsigned long timeout;
 
-	rcu_read_lock();
-	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
-	if (!tid_tx || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
-		rcu_read_unlock();
+	if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
 		return;
 	}
 
 	timeout = tid_tx->last_tx + TU_TO_JIFFIES(tid_tx->timeout);
 	if (time_is_after_jiffies(timeout)) {
 		mod_timer(&tid_tx->session_timer, timeout);
-		rcu_read_unlock();
 		return;
 	}
 
-	rcu_read_unlock();
-
 	ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n",
 	       sta->sta.addr, tid);
 
-- 
cgit v1.2.3


From e2fb1b839208ad776c0ffbb55f17e6968389ce02 Mon Sep 17 00:00:00 2001
From: Yingying Tang <yintang@qti.qualcomm.com>
Date: Tue, 24 Oct 2017 16:51:10 +0800
Subject: mac80211: enable TDLS peer buffer STA feature

Allow drivers to set the buffer station extended capability
for TDLS links, with a new hardware flag indicating this.

Signed-off-by: Yingying Tang <yintang@qti.qualcomm.com>
[change commit log/documentation wording]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 net/mac80211/debugfs.c | 1 +
 net/mac80211/tdls.c    | 5 ++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index eec143cca1c0..2ee4af25256d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2056,6 +2056,9 @@ struct ieee80211_txq {
  *	The stack will not do fragmentation.
  *	The callback for @set_frag_threshold should be set as well.
  *
+ * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
+ *	TDLS links.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2098,6 +2101,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_TX_FRAG_LIST,
 	IEEE80211_HW_REPORTS_LOW_ACK,
 	IEEE80211_HW_SUPPORTS_TX_FRAG,
+	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 5fae001f286c..1f466d12a6bc 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -211,6 +211,7 @@ static const char *hw_flag_names[] = {
 	FLAG(TX_FRAG_LIST),
 	FLAG(REPORTS_LOW_ACK),
 	FLAG(SUPPORTS_TX_FRAG),
+	FLAG(SUPPORTS_TDLS_BUFFER_STA),
 #undef FLAG
 };
 
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 96d4fb998e33..5cd5e6e5834e 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -47,6 +47,8 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata,
 			   NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
 	bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
 			  !ifmgd->tdls_wider_bw_prohibited;
+	bool buffer_sta = ieee80211_hw_check(&local->hw,
+					     SUPPORTS_TDLS_BUFFER_STA);
 	struct ieee80211_supported_band *sband = ieee80211_get_sband(sdata);
 	bool vht = sband && sband->vht_cap.vht_supported;
 	u8 *pos = skb_put(skb, 10);
@@ -56,7 +58,8 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata,
 	*pos++ = 0x0;
 	*pos++ = 0x0;
 	*pos++ = 0x0;
-	*pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0;
+	*pos++ = (chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0) |
+		 (buffer_sta ? WLAN_EXT_CAPA4_TDLS_BUFFER_STA : 0);
 	*pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED;
 	*pos++ = 0;
 	*pos++ = 0;
-- 
cgit v1.2.3


From a9d09bc1bcac89996fc961b1ae023d72171af7b6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 13 Nov 2017 14:54:24 +0100
Subject: mac80211: make __ieee80211_start_rx_ba_session static

The function is only used within the file, so make it static.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/agg-rx.c      | 9 +++++----
 net/mac80211/ieee80211_i.h | 4 ----
 2 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 35e94483fb8c..a8b1616cec41 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -404,10 +404,11 @@ end:
 					  timeout);
 }
 
-void __ieee80211_start_rx_ba_session(struct sta_info *sta,
-				     u8 dialog_token, u16 timeout,
-				     u16 start_seq_num, u16 ba_policy, u16 tid,
-				     u16 buf_size, bool tx, bool auto_seq)
+static void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+					    u8 dialog_token, u16 timeout,
+					    u16 start_seq_num, u16 ba_policy,
+					    u16 tid, u16 buf_size, bool tx,
+					    bool auto_seq)
 {
 	mutex_lock(&sta->ampdu_mlme.mtx);
 	___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 885d00b41911..26900025de2f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1757,10 +1757,6 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 				     u16 initiator, u16 reason, bool stop);
 void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 				    u16 initiator, u16 reason, bool stop);
-void __ieee80211_start_rx_ba_session(struct sta_info *sta,
-				     u8 dialog_token, u16 timeout,
-				     u16 start_seq_num, u16 ba_policy, u16 tid,
-				     u16 buf_size, bool tx, bool auto_seq);
 void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
 				      u8 dialog_token, u16 timeout,
 				      u16 start_seq_num, u16 ba_policy, u16 tid,
-- 
cgit v1.2.3


From 9fef65443388a66a2c19835e2848a6ecf162710b Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 29 Oct 2017 11:51:07 +0200
Subject: mac80211: always update the PM state of a peer on MGMT / DATA frames

The 2016 version of the spec is more generic about when the
AP should update the power management state of the peer:
the AP shall update the state based on any management or
data frames. This means that even non-bufferable management
frames should be looked at in order to maintain the power
management state of the peer.

This can avoid problematic cases for example if a station
disappears while being asleep and then re-appears. The AP
would remember it as in power save, but the Authentication
frame couldn't be used to set the peer as awake again.
Note that this issue wasn't really critical since at some
point (after the association) we would have removed the
station and created another one with all the states cleared.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 70e9d2ca8bbe..b3cff69bfd66 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1607,23 +1607,16 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 
 	/*
 	 * Change STA power saving mode only at the end of a frame
-	 * exchange sequence.
+	 * exchange sequence, and only for a data or management
+	 * frame as specified in IEEE 802.11-2016 11.2.3.2
 	 */
 	if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
 	    !ieee80211_has_morefrags(hdr->frame_control) &&
-	    !ieee80211_is_back_req(hdr->frame_control) &&
+	    (ieee80211_is_mgmt(hdr->frame_control) ||
+	     ieee80211_is_data(hdr->frame_control)) &&
 	    !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
 	    (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
-	     rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
-	    /*
-	     * PM bit is only checked in frames where it isn't reserved,
-	     * in AP mode it's reserved in non-bufferable management frames
-	     * (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field)
-	     * BAR frames should be ignored as specified in
-	     * IEEE 802.11-2012 10.2.1.2.
-	     */
-	    (!ieee80211_is_mgmt(hdr->frame_control) ||
-	     ieee80211_is_bufferable_mmpdu(hdr->frame_control))) {
+	     rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) {
 		if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
 			if (!ieee80211_has_pm(hdr->frame_control))
 				sta_ps_end(sta);
-- 
cgit v1.2.3


From 2316380f843dfd4cca5232a3b32dcb2b32b16722 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Sun, 29 Oct 2017 11:51:08 +0200
Subject: mac80211: call synchronize_net once in the restart flow

Currently the restart flow enables RX back, and then proceeds
to tear down RX and TX aggregations.
The TX aggregation tear down calls synchronize_net(), which
waits for packet receiving to be done.
This is done for every session, while RX processing is already
active, and in some reproductions it takes up to 3 seconds.
Add a call once in the restart_work, before we have traffic
active again, and remove the subsequent calls when tearing
down the aggregation.
This requires moving down the code that turns off the
reconfig flag in order to be able to test it in
_ieee80211_stop_tx_ba_session().

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/agg-tx.c |  3 ++-
 net/mac80211/main.c   |  3 +++
 net/mac80211/util.c   | 19 ++++++++++---------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index a04df962129c..595c662a61e8 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -392,7 +392,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 	 * telling the driver. New packets will not go through since
 	 * the aggregation session is no longer OPERATIONAL.
 	 */
-	synchronize_net();
+	if (!local->in_reconfig)
+		synchronize_net();
 
 	tid_tx->stop_initiator = reason == AGG_STOP_PEER_REQUEST ?
 					WLAN_BACK_RECIPIENT :
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index e054a2fd8d38..0785d04a80bc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -263,6 +263,9 @@ static void ieee80211_restart_work(struct work_struct *work)
 	flush_delayed_work(&local->roc_work);
 	flush_work(&local->hw_roc_done);
 
+	/* wait for all packet processing to be done */
+	synchronize_net();
+
 	ieee80211_reconfig(local);
 	rtnl_unlock();
 }
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index d57e5f6bd8b6..1f82191ce601 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2110,15 +2110,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0);
 
  wake_up:
-	if (local->in_reconfig) {
-		local->in_reconfig = false;
-		barrier();
-
-		/* Restart deferred ROCs */
-		mutex_lock(&local->mtx);
-		ieee80211_start_next_roc(local);
-		mutex_unlock(&local->mtx);
-	}
 
 	if (local->monitors == local->open_count && local->monitors > 0)
 		ieee80211_add_virtual_monitor(local);
@@ -2146,6 +2137,16 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		mutex_unlock(&local->sta_mtx);
 	}
 
+	if (local->in_reconfig) {
+		local->in_reconfig = false;
+		barrier();
+
+		/* Restart deferred ROCs */
+		mutex_lock(&local->mtx);
+		ieee80211_start_next_roc(local);
+		mutex_unlock(&local->mtx);
+	}
+
 	ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
 					IEEE80211_QUEUE_STOP_REASON_SUSPEND,
 					false);
-- 
cgit v1.2.3


From c7976f5272486e4ff406014c4b43e2fa3b70b052 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Sun, 29 Oct 2017 11:51:10 +0200
Subject: mac80211: remove BUG() when interface type is invalid

In ieee80211_setup_sdata() we check if the interface type is valid
and, if not, call BUG().  This should never happen, but if there is
something wrong with the code, it will not be caught until the bug
happens when an interface is being set up.  Calling BUG() is too
extreme for this and a WARN_ON() would be better used instead.  Change
that.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 435e7358004c..5fe01f82df12 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1474,7 +1474,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
 	case NUM_NL80211_IFTYPES:
-		BUG();
+		WARN_ON(1);
 		break;
 	}
 
-- 
cgit v1.2.3


From 9ae3b172e886c851d8cd6a88569025160b485e95 Mon Sep 17 00:00:00 2001
From: Tova Mussai <tova.mussai@intel.com>
Date: Sun, 29 Oct 2017 11:51:11 +0200
Subject: cfg80211: IBSS: Add support for static WEP in driver for IBSS

Add support for drivers that implement static WEP internally for IBSS.
Add the WEP keys to the IBSS params struct, that will allow the driver
to use the keys in the join flow, and not only after the connection.

Signed-off-by: Tova Mussai <tova.mussai@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 +++++
 net/wireless/ibss.c    | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8b8118a7fadb..698cebf2de2a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2021,6 +2021,9 @@ struct cfg80211_disassoc_request {
  * @ht_capa:  HT Capabilities over-rides.  Values set in ht_capa_mask
  *	will be used in ht_capa.  Un-supported values will be ignored.
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
+ * @wep_keys: static WEP keys, if not NULL points to an array of
+ * 	CFG80211_MAX_WEP_KEYS WEP keys
+ * @wep_tx_key: key index (0..3) of the default TX static WEP key
  */
 struct cfg80211_ibss_params {
 	const u8 *ssid;
@@ -2037,6 +2040,8 @@ struct cfg80211_ibss_params {
 	int mcast_rate[NUM_NL80211_BANDS];
 	struct ieee80211_ht_cap ht_capa;
 	struct ieee80211_ht_cap ht_capa_mask;
+	struct key_params *wep_keys;
+	int wep_tx_key;
 };
 
 /**
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 413d4f4e6334..a1d10993d08a 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -126,6 +126,11 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	wdev->ibss_fixed = params->channel_fixed;
 	wdev->ibss_dfs_possible = params->userspace_handles_dfs;
 	wdev->chandef = params->chandef;
+	if (connkeys) {
+		params->wep_keys = connkeys->params;
+		params->wep_tx_key = connkeys->def;
+	}
+
 #ifdef CONFIG_CFG80211_WEXT
 	wdev->wext.ibss.chandef = params->chandef;
 #endif
-- 
cgit v1.2.3


From 6c2fb1e6527b1092e09c786fee66b1759fa9d574 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Date: Thu, 9 Nov 2017 14:46:30 +0300
Subject: cfg80211: cleanup signal strength units notation

Both cfg80211_rx_mgmt and cfg80211_report_obss_beacon functions send
reports to userspace using NL80211_ATTR_RX_SIGNAL_DBM attribute w/o
any processing of their input signal values. Which means that in
order to match userspace tools expectations, input signal values
for those functions are supposed to be in dBm units.

This patch cleans up comments, variable names, and trace reports
for those functions, replacing confusing 'mBm' by 'dBm'.

Signed-off-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++--
 net/wireless/mlme.c    |  6 +++---
 net/wireless/trace.h   | 12 ++++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 698cebf2de2a..d7f8e7b96bcb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5581,7 +5581,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * cfg80211_rx_mgmt - notification of received, unprocessed management frame
  * @wdev: wireless device receiving the frame
  * @freq: Frequency on which the frame was received in MHz
- * @sig_dbm: signal strength in mBm, or 0 if unknown
+ * @sig_dbm: signal strength in dBm, or 0 if unknown
  * @buf: Management frame (header + body)
  * @len: length of the frame data
  * @flags: flags, as defined in enum nl80211_rxmgmt_flags
@@ -5760,7 +5760,7 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
  * @frame: the frame
  * @len: length of the frame
  * @freq: frequency the frame was received on
- * @sig_dbm: signal strength in mBm, or 0 if unknown
+ * @sig_dbm: signal strength in dBm, or 0 if unknown
  *
  * Use this function to report to userspace when a beacon was
  * received. It is not useful to call this when there is no
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index e7c64a8dce54..bbb9907bfa86 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -692,7 +692,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
 }
 
-bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
+bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm,
 		      const u8 *buf, size_t len, u32 flags)
 {
 	struct wiphy *wiphy = wdev->wiphy;
@@ -708,7 +708,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
 		cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
 	u16 stype;
 
-	trace_cfg80211_rx_mgmt(wdev, freq, sig_mbm);
+	trace_cfg80211_rx_mgmt(wdev, freq, sig_dbm);
 	stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
 
 	if (!(stypes->rx & BIT(stype))) {
@@ -735,7 +735,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
 
 		/* Indicate the received Action frame to user space */
 		if (nl80211_send_mgmt(rdev, wdev, reg->nlportid,
-				      freq, sig_mbm,
+				      freq, sig_dbm,
 				      buf, len, flags, GFP_ATOMIC))
 			continue;
 
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f3353fe5b35b..bcfedd39e7a3 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2544,20 +2544,20 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
 );
 
 TRACE_EVENT(cfg80211_rx_mgmt,
-	TP_PROTO(struct wireless_dev *wdev, int freq, int sig_mbm),
-	TP_ARGS(wdev, freq, sig_mbm),
+	TP_PROTO(struct wireless_dev *wdev, int freq, int sig_dbm),
+	TP_ARGS(wdev, freq, sig_dbm),
 	TP_STRUCT__entry(
 		WDEV_ENTRY
 		__field(int, freq)
-		__field(int, sig_mbm)
+		__field(int, sig_dbm)
 	),
 	TP_fast_assign(
 		WDEV_ASSIGN;
 		__entry->freq = freq;
-		__entry->sig_mbm = sig_mbm;
+		__entry->sig_dbm = sig_dbm;
 	),
-	TP_printk(WDEV_PR_FMT ", freq: %d, sig mbm: %d",
-		  WDEV_PR_ARG, __entry->freq, __entry->sig_mbm)
+	TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d",
+		  WDEV_PR_ARG, __entry->freq, __entry->sig_dbm)
 );
 
 TRACE_EVENT(cfg80211_mgmt_tx_status,
-- 
cgit v1.2.3


From 768075ebc238d11900b27975e649286d292eb8cf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 13 Nov 2017 15:35:06 +0100
Subject: nl80211: add a few extended error strings to key parsing

This mostly serves as an example for how to add error strings
and erroneous attribute pointers.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 61 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b1ac23ca20c8..e4522ad5f770 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -734,11 +734,12 @@ struct key_parse {
 	bool def_uni, def_multi;
 };
 
-static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
+static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key,
+				 struct key_parse *k)
 {
 	struct nlattr *tb[NL80211_KEY_MAX + 1];
 	int err = nla_parse_nested(tb, NL80211_KEY_MAX, key,
-				   nl80211_key_policy, NULL);
+				   nl80211_key_policy, info->extack);
 	if (err)
 		return err;
 
@@ -771,7 +772,8 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
 	if (tb[NL80211_KEY_TYPE]) {
 		k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
 		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
-			return -EINVAL;
+			return genl_err_attr(info, -EINVAL,
+					     tb[NL80211_KEY_TYPE]);
 	}
 
 	if (tb[NL80211_KEY_DEFAULT_TYPES]) {
@@ -779,7 +781,8 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
 
 		err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
 				       tb[NL80211_KEY_DEFAULT_TYPES],
-				       nl80211_key_default_policy, NULL);
+				       nl80211_key_default_policy,
+				       info->extack);
 		if (err)
 			return err;
 
@@ -820,8 +823,10 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
 
 	if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
 		k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
-		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) {
+			GENL_SET_ERR_MSG(info, "key type out of range");
 			return -EINVAL;
+		}
 	}
 
 	if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) {
@@ -850,31 +855,42 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
 	k->type = -1;
 
 	if (info->attrs[NL80211_ATTR_KEY])
-		err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k);
+		err = nl80211_parse_key_new(info, info->attrs[NL80211_ATTR_KEY], k);
 	else
 		err = nl80211_parse_key_old(info, k);
 
 	if (err)
 		return err;
 
-	if (k->def && k->defmgmt)
+	if (k->def && k->defmgmt) {
+		GENL_SET_ERR_MSG(info, "key with def && defmgmt is invalid");
 		return -EINVAL;
+	}
 
 	if (k->defmgmt) {
-		if (k->def_uni || !k->def_multi)
+		if (k->def_uni || !k->def_multi) {
+			GENL_SET_ERR_MSG(info, "defmgmt key must be mcast");
 			return -EINVAL;
+		}
 	}
 
 	if (k->idx != -1) {
 		if (k->defmgmt) {
-			if (k->idx < 4 || k->idx > 5)
+			if (k->idx < 4 || k->idx > 5) {
+				GENL_SET_ERR_MSG(info,
+						 "defmgmt key idx not 4 or 5");
 				return -EINVAL;
+			}
 		} else if (k->def) {
-			if (k->idx < 0 || k->idx > 3)
+			if (k->idx < 0 || k->idx > 3) {
+				GENL_SET_ERR_MSG(info, "def key idx not 0-3");
 				return -EINVAL;
+			}
 		} else {
-			if (k->idx < 0 || k->idx > 5)
+			if (k->idx < 0 || k->idx > 5) {
+				GENL_SET_ERR_MSG(info, "key idx not 0-5");
 				return -EINVAL;
+			}
 		}
 	}
 
@@ -883,8 +899,9 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
 
 static struct cfg80211_cached_keys *
 nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
-		       struct nlattr *keys, bool *no_ht)
+		       struct genl_info *info, bool *no_ht)
 {
+	struct nlattr *keys = info->attrs[NL80211_ATTR_KEYS];
 	struct key_parse parse;
 	struct nlattr *key;
 	struct cfg80211_cached_keys *result;
@@ -909,17 +926,22 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
 		memset(&parse, 0, sizeof(parse));
 		parse.idx = -1;
 
-		err = nl80211_parse_key_new(key, &parse);
+		err = nl80211_parse_key_new(info, key, &parse);
 		if (err)
 			goto error;
 		err = -EINVAL;
 		if (!parse.p.key)
 			goto error;
-		if (parse.idx < 0 || parse.idx > 3)
+		if (parse.idx < 0 || parse.idx > 3) {
+			GENL_SET_ERR_MSG(info, "key index out of range [0-3]");
 			goto error;
+		}
 		if (parse.def) {
-			if (def)
+			if (def) {
+				GENL_SET_ERR_MSG(info,
+						 "only one key can be default");
 				goto error;
+			}
 			def = 1;
 			result->def = parse.idx;
 			if (!parse.def_uni || !parse.def_multi)
@@ -932,6 +954,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
 			goto error;
 		if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 &&
 		    parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) {
+			GENL_SET_ERR_MSG(info, "connect key must be WEP");
 			err = -EINVAL;
 			goto error;
 		}
@@ -947,6 +970,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
 
 	if (result->def < 0) {
 		err = -EINVAL;
+		GENL_SET_ERR_MSG(info, "need a default/TX key");
 		goto error;
 	}
 
@@ -8611,9 +8635,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
 		bool no_ht = false;
 
-		connkeys = nl80211_parse_connkeys(rdev,
-					  info->attrs[NL80211_ATTR_KEYS],
-					  &no_ht);
+		connkeys = nl80211_parse_connkeys(rdev, info, &no_ht);
 		if (IS_ERR(connkeys))
 			return PTR_ERR(connkeys);
 
@@ -9017,8 +9039,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
-		connkeys = nl80211_parse_connkeys(rdev,
-					  info->attrs[NL80211_ATTR_KEYS], NULL);
+		connkeys = nl80211_parse_connkeys(rdev, info, NULL);
 		if (IS_ERR(connkeys))
 			return PTR_ERR(connkeys);
 	}
-- 
cgit v1.2.3


From c7c477b52c4caa2cfb44bf3841f806d1bf20e0bf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 1 Dec 2017 13:50:51 +0200
Subject: mac80211: don't warn on AID field without top two MSBs set

While the change between 802.11-2012 and 802.11-2016 to move from
requiring APs to set the two top bits to now requiring them to be
cleared was apparently unintentional and will be fixed, clients
should either way assume that the top five bits are reserved and
ignore them.

Implement that in mac80211.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fa0f96c74898..39b660b9a908 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2862,10 +2862,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 	aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
 	capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
 
-	if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14)))
-		sdata_info(sdata, "invalid AID value 0x%x; bits 15:14 not set\n",
-			   aid);
-	aid &= ~(BIT(15) | BIT(14));
+	/*
+	 * The 5 MSB of the AID field are reserved
+	 * (802.11-2016 9.4.1.8 AID field)
+	 */
+	aid &= 0x7ff;
 
 	ifmgd->broken_ap = false;
 
-- 
cgit v1.2.3


From 9de18d8186cb070d22ed67a3f75a2ef5fbf3ef6f Mon Sep 17 00:00:00 2001
From: David Spinadel <david.spinadel@intel.com>
Date: Fri, 1 Dec 2017 13:50:52 +0200
Subject: mac80211: Add MIC space only for TX key option

Add a key flag to indicate that the device only needs
MIC space and not a real MIC.
In such cases, keep the MIC zeroed for ease of debug.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 +++++-
 net/mac80211/key.c     | 12 +++++++++---
 net/mac80211/tx.c      |  4 +++-
 net/mac80211/wpa.c     | 16 ++++++++++++----
 4 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 2ee4af25256d..906e90223066 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1552,6 +1552,9 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
  * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the
  *	driver for a key to indicate that sufficient tailroom must always
  *	be reserved for ICV or MIC, even when HW encryption is enabled.
+ * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for
+ *	a TKIP key if it only requires MIC space. Do not set together with
+ *	@IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key.
  */
 enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_GENERATE_IV_MGMT	= BIT(0),
@@ -1562,6 +1565,7 @@ enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_PUT_IV_SPACE		= BIT(5),
 	IEEE80211_KEY_FLAG_RX_MGMT		= BIT(6),
 	IEEE80211_KEY_FLAG_RESERVE_TAILROOM	= BIT(7),
+	IEEE80211_KEY_FLAG_PUT_MIC_SPACE	= BIT(8),
 };
 
 /**
@@ -1593,8 +1597,8 @@ struct ieee80211_key_conf {
 	u8 icv_len;
 	u8 iv_len;
 	u8 hw_key_idx;
-	u8 flags;
 	s8 keyidx;
+	u16 flags;
 	u8 keylen;
 	u8 key[0];
 };
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 938049395f90..aee05ec3f7ea 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -178,13 +178,17 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 	if (!ret) {
 		key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
 
-		if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+		if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
+					   IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) ||
 		      (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
 			decrease_tailroom_need_count(sdata, 1);
 
 		WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
 			(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV));
 
+		WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) &&
+			(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC));
+
 		return 0;
 	}
 
@@ -237,7 +241,8 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
 	sta = key->sta;
 	sdata = key->sdata;
 
-	if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+	if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
+				   IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) ||
 	      (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
 		increment_tailroom_need_count(sdata);
 
@@ -1104,7 +1109,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf)
 	if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
 		key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
 
-		if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+		if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
+					   IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) ||
 		      (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
 			increment_tailroom_need_count(key->sdata);
 	}
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3160954fc406..25904af38839 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2922,7 +2922,9 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 
 		gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV;
 		iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE;
-		mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC;
+		mmic = build.key->conf.flags &
+			(IEEE80211_KEY_FLAG_GENERATE_MMIC |
+			 IEEE80211_KEY_FLAG_PUT_MIC_SPACE);
 
 		/* don't handle software crypto */
 		if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b58722d9de37..785056cb76f6 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -1,7 +1,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2008, Jouni Malinen <j@w1.fi>
- * Copyright (C) 2016 Intel Deutschland GmbH
+ * Copyright (C) 2016-2017 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -59,8 +59,9 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	if (info->control.hw_key &&
 	    (info->flags & IEEE80211_TX_CTL_DONTFRAG ||
 	     ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) &&
-	    !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) {
-		/* hwaccel - with no need for SW-generated MMIC */
+	    !(tx->key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
+				     IEEE80211_KEY_FLAG_PUT_MIC_SPACE))) {
+		/* hwaccel - with no need for SW-generated MMIC or MIC space */
 		return TX_CONTINUE;
 	}
 
@@ -75,8 +76,15 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 		 skb_tailroom(skb), tail))
 		return TX_DROP;
 
-	key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY];
 	mic = skb_put(skb, MICHAEL_MIC_LEN);
+
+	if (tx->key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) {
+		/* Zeroed MIC can help with debug */
+		memset(mic, 0, MICHAEL_MIC_LEN);
+		return TX_CONTINUE;
+	}
+
+	key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY];
 	michael_mic(key, hdr, data, data_len, mic);
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE))
 		mic[0]++;
-- 
cgit v1.2.3


From e937b8da5a591f141fe41aa48a2e898df9888c95 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 31 Oct 2017 12:27:45 +0100
Subject: mac80211: Add TXQ scheduling API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds an API to mac80211 to handle scheduling of TXQs and changes the
interface between driver and mac80211 for TXQ handling as follows:

- The wake_tx_queue callback interface no longer includes the TXQ. Instead,
  the driver is expected to retrieve it by calling ieee80211_next_txq().

- Two new mac80211 functions are added: ieee80211_next_txq() and
  ieee80211_schedule_txq(). The former returns the next TXQ that should be
  scheduled, and is how the driver gets a queue to pull packets from. The
  latter is called internally by mac80211 to start scheduling a queue, and
  the driver is supposed to call it to re-schedule the TXQ after it is
  finished pulling packets from it (unless the queue emptied).

The ath9k and ath10k drivers are changed to use the new API.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/core.c |   2 -
 drivers/net/wireless/ath/ath10k/core.h |   4 -
 drivers/net/wireless/ath/ath10k/mac.c  |  55 +++------
 drivers/net/wireless/ath/ath9k/ath9k.h |   9 +-
 drivers/net/wireless/ath/ath9k/main.c  |   2 +-
 drivers/net/wireless/ath/ath9k/recv.c  |   2 -
 drivers/net/wireless/ath/ath9k/xmit.c  | 210 ++++++++-------------------------
 include/net/mac80211.h                 |  37 +++++-
 net/mac80211/agg-tx.c                  |   6 +-
 net/mac80211/driver-ops.h              |  12 +-
 net/mac80211/ieee80211_i.h             |   5 +
 net/mac80211/main.c                    |   3 +
 net/mac80211/sta_info.c                |   7 +-
 net/mac80211/trace.h                   |  32 +----
 net/mac80211/tx.c                      |  49 +++++++-
 15 files changed, 173 insertions(+), 262 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index b29fdbd21ead..90d16a38475f 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2574,9 +2574,7 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev,
 
 	mutex_init(&ar->conf_mutex);
 	spin_lock_init(&ar->data_lock);
-	spin_lock_init(&ar->txqs_lock);
 
-	INIT_LIST_HEAD(&ar->txqs);
 	INIT_LIST_HEAD(&ar->peers);
 	init_waitqueue_head(&ar->peer_mapping_wq);
 	init_waitqueue_head(&ar->htt.empty_tx_wq);
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index 643041ef3271..4a79fdce9a08 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -347,7 +347,6 @@ struct ath10k_peer {
 };
 
 struct ath10k_txq {
-	struct list_head list;
 	unsigned long num_fw_queued;
 	unsigned long num_push_allowed;
 };
@@ -895,10 +894,7 @@ struct ath10k {
 
 	/* protects shared structure data */
 	spinlock_t data_lock;
-	/* protects: ar->txqs, artxq->list */
-	spinlock_t txqs_lock;
 
-	struct list_head txqs;
 	struct list_head arvifs;
 	struct list_head peers;
 	struct ath10k_peer *peer_map[ATH10K_MAX_NUM_PEER_IDS];
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 0a947eef348d..cca4cd82853b 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3830,12 +3830,10 @@ static void ath10k_mac_txq_init(struct ieee80211_txq *txq)
 		return;
 
 	artxq = (void *)txq->drv_priv;
-	INIT_LIST_HEAD(&artxq->list);
 }
 
 static void ath10k_mac_txq_unref(struct ath10k *ar, struct ieee80211_txq *txq)
 {
-	struct ath10k_txq *artxq;
 	struct ath10k_skb_cb *cb;
 	struct sk_buff *msdu;
 	int msdu_id;
@@ -3843,12 +3841,6 @@ static void ath10k_mac_txq_unref(struct ath10k *ar, struct ieee80211_txq *txq)
 	if (!txq)
 		return;
 
-	artxq = (void *)txq->drv_priv;
-	spin_lock_bh(&ar->txqs_lock);
-	if (!list_empty(&artxq->list))
-		list_del_init(&artxq->list);
-	spin_unlock_bh(&ar->txqs_lock);
-
 	spin_lock_bh(&ar->htt.tx_lock);
 	idr_for_each_entry(&ar->htt.pending_tx, msdu, msdu_id) {
 		cb = ATH10K_SKB_CB(msdu);
@@ -3978,23 +3970,17 @@ int ath10k_mac_tx_push_txq(struct ieee80211_hw *hw,
 void ath10k_mac_tx_push_pending(struct ath10k *ar)
 {
 	struct ieee80211_hw *hw = ar->hw;
-	struct ieee80211_txq *txq;
-	struct ath10k_txq *artxq;
-	struct ath10k_txq *last;
+	struct ieee80211_txq *txq, *first = NULL;
 	int ret;
 	int max;
 
 	if (ar->htt.num_pending_tx >= (ar->htt.max_num_pending_tx / 2))
 		return;
 
-	spin_lock_bh(&ar->txqs_lock);
 	rcu_read_lock();
 
-	last = list_last_entry(&ar->txqs, struct ath10k_txq, list);
-	while (!list_empty(&ar->txqs)) {
-		artxq = list_first_entry(&ar->txqs, struct ath10k_txq, list);
-		txq = container_of((void *)artxq, struct ieee80211_txq,
-				   drv_priv);
+	txq = ieee80211_next_txq(hw);
+	while (txq) {
 
 		/* Prevent aggressive sta/tid taking over tx queue */
 		max = 16;
@@ -4005,18 +3991,21 @@ void ath10k_mac_tx_push_pending(struct ath10k *ar)
 				break;
 		}
 
-		list_del_init(&artxq->list);
 		if (ret != -ENOENT)
-			list_add_tail(&artxq->list, &ar->txqs);
+			ieee80211_schedule_txq(hw, txq);
 
 		ath10k_htt_tx_txq_update(hw, txq);
 
-		if (artxq == last || (ret < 0 && ret != -ENOENT))
+		if (first == txq || (ret < 0 && ret != -ENOENT))
 			break;
+
+		if (!first)
+			first = txq;
+
+		txq = ieee80211_next_txq(hw);
 	}
 
 	rcu_read_unlock();
-	spin_unlock_bh(&ar->txqs_lock);
 }
 
 /************/
@@ -4250,34 +4239,22 @@ static void ath10k_mac_op_tx(struct ieee80211_hw *hw,
 	}
 }
 
-static void ath10k_mac_op_wake_tx_queue(struct ieee80211_hw *hw,
-					struct ieee80211_txq *txq)
+static void ath10k_mac_op_wake_tx_queue(struct ieee80211_hw *hw)
 {
-	struct ath10k *ar = hw->priv;
-	struct ath10k_txq *artxq = (void *)txq->drv_priv;
-	struct ieee80211_txq *f_txq;
-	struct ath10k_txq *f_artxq;
+	struct ieee80211_txq *txq;
 	int ret = 0;
 	int max = 16;
 
-	spin_lock_bh(&ar->txqs_lock);
-	if (list_empty(&artxq->list))
-		list_add_tail(&artxq->list, &ar->txqs);
-
-	f_artxq = list_first_entry(&ar->txqs, struct ath10k_txq, list);
-	f_txq = container_of((void *)f_artxq, struct ieee80211_txq, drv_priv);
-	list_del_init(&f_artxq->list);
+	txq = ieee80211_next_txq(hw);
 
-	while (ath10k_mac_tx_can_push(hw, f_txq) && max--) {
-		ret = ath10k_mac_tx_push_txq(hw, f_txq);
+	while (ath10k_mac_tx_can_push(hw, txq) && max--) {
+		ret = ath10k_mac_tx_push_txq(hw, txq);
 		if (ret)
 			break;
 	}
 	if (ret != -ENOENT)
-		list_add_tail(&f_artxq->list, &ar->txqs);
-	spin_unlock_bh(&ar->txqs_lock);
+		ieee80211_schedule_txq(hw, txq);
 
-	ath10k_htt_tx_txq_update(hw, f_txq);
 	ath10k_htt_tx_txq_update(hw, txq);
 }
 
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index ef0de4f1312c..face2bb7f357 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -246,12 +246,8 @@ struct ath_atx_tid {
 	s8 bar_index;
 	bool active;
 	bool clear_ps_filter;
-	bool has_queued;
 };
 
-void __ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid);
-void ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid);
-
 struct ath_node {
 	struct ath_softc *sc;
 	struct ieee80211_sta *sta; /* station struct we're part of */
@@ -591,8 +587,7 @@ bool ath_drain_all_txq(struct ath_softc *sc);
 void ath_draintxq(struct ath_softc *sc, struct ath_txq *txq);
 void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an);
 void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an);
-void ath_txq_schedule(struct ath_softc *sc, struct ath_txq *txq);
-void ath_txq_schedule_all(struct ath_softc *sc);
+void ath_txq_schedule(struct ath_softc *sc);
 int ath_tx_init(struct ath_softc *sc, int nbufs);
 int ath_txq_update(struct ath_softc *sc, int qnum,
 		   struct ath9k_tx_queue_info *q);
@@ -618,7 +613,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
 				   u16 tids, int nframes,
 				   enum ieee80211_frame_release_type reason,
 				   bool more_data);
-void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue);
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw);
 
 /********/
 /* VIFs */
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index a3be8add56e1..f7dfcdf508ce 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -266,7 +266,7 @@ static bool ath_complete_reset(struct ath_softc *sc, bool start)
 		}
 	work:
 		ath_restart_work(sc);
-		ath_txq_schedule_all(sc);
+		ath_txq_schedule(sc);
 	}
 
 	sc->gtt_cnt = 0;
diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index 2197aee2bb72..a768e841524d 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -1057,8 +1057,6 @@ static void ath_rx_count_airtime(struct ath_softc *sc,
  	if (!!(sc->airtime_flags & AIRTIME_USE_RX)) {
 		spin_lock_bh(&acq->lock);
 		an->airtime_deficit[acno] -= airtime;
-		if (an->airtime_deficit[acno] <= 0)
-			__ath_tx_queue_tid(sc, ATH_AN_2_TID(an, tidno));
 		spin_unlock_bh(&acq->lock);
 	}
 	ath_debug_airtime(sc, an, airtime, 0);
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 396bf05c6bf6..bd438062a6db 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -112,62 +112,11 @@ void ath_txq_unlock_complete(struct ath_softc *sc, struct ath_txq *txq)
 		ath_tx_status(hw, skb);
 }
 
-void __ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
-{
-	struct ath_vif *avp = (struct ath_vif *) tid->an->vif->drv_priv;
-	struct ath_chanctx *ctx = avp->chanctx;
-	struct ath_acq *acq;
-	struct list_head *tid_list;
-	u8 acno = TID_TO_WME_AC(tid->tidno);
-
-	if (!ctx || !list_empty(&tid->list))
-		return;
-
-
-	acq = &ctx->acq[acno];
-	if ((sc->airtime_flags & AIRTIME_USE_NEW_QUEUES) &&
-	    tid->an->airtime_deficit[acno] > 0)
-		tid_list = &acq->acq_new;
-	else
-		tid_list = &acq->acq_old;
-
-	list_add_tail(&tid->list, tid_list);
-}
-
-void ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
-{
-	struct ath_vif *avp = (struct ath_vif *) tid->an->vif->drv_priv;
-	struct ath_chanctx *ctx = avp->chanctx;
-	struct ath_acq *acq;
-
-	if (!ctx || !list_empty(&tid->list))
-		return;
-
-	acq = &ctx->acq[TID_TO_WME_AC(tid->tidno)];
-	spin_lock_bh(&acq->lock);
-	__ath_tx_queue_tid(sc, tid);
-	spin_unlock_bh(&acq->lock);
-}
-
-
-void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue)
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw)
 {
 	struct ath_softc *sc = hw->priv;
-	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
-	struct ath_atx_tid *tid = (struct ath_atx_tid *) queue->drv_priv;
-	struct ath_txq *txq = tid->txq;
-
-	ath_dbg(common, QUEUE, "Waking TX queue: %pM (%d)\n",
-		queue->sta ? queue->sta->addr : queue->vif->addr,
-		tid->tidno);
-
-	ath_txq_lock(sc, txq);
 
-	tid->has_queued = true;
-	ath_tx_queue_tid(sc, tid);
-	ath_txq_schedule(sc, txq);
-
-	ath_txq_unlock(sc, txq);
+	ath_txq_schedule(sc);
 }
 
 static struct ath_frame_info *get_frame_info(struct sk_buff *skb)
@@ -230,14 +179,9 @@ ath_tid_pull(struct ath_atx_tid *tid)
 	struct ath_frame_info *fi;
 	int q;
 
-	if (!tid->has_queued)
-		return NULL;
-
 	skb = ieee80211_tx_dequeue(hw, txq);
-	if (!skb) {
-		tid->has_queued = false;
+	if (!skb)
 		return NULL;
-	}
 
 	if (ath_tx_prepare(hw, skb, &txctl)) {
 		ieee80211_free_txskb(hw, skb);
@@ -254,12 +198,6 @@ ath_tid_pull(struct ath_atx_tid *tid)
 	return skb;
  }
 
-
-static bool ath_tid_has_buffered(struct ath_atx_tid *tid)
-{
-	return !skb_queue_empty(&tid->retry_q) || tid->has_queued;
-}
-
 static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
 {
 	struct sk_buff *skb;
@@ -671,7 +609,10 @@ static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq,
 
 		skb_queue_splice_tail(&bf_pending, &tid->retry_q);
 		if (!an->sleeping) {
-			ath_tx_queue_tid(sc, tid);
+			struct ieee80211_txq *queue = container_of(
+				(void *)tid, struct ieee80211_txq, drv_priv);
+
+			ieee80211_schedule_txq(sc->hw, queue);
 
 			if (ts->ts_status & (ATH9K_TXERR_FILT | ATH9K_TXERR_XRETRY))
 				tid->clear_ps_filter = true;
@@ -719,8 +660,6 @@ static void ath_tx_count_airtime(struct ath_softc *sc, struct ath_node *an,
 
 		spin_lock_bh(&acq->lock);
 		an->airtime_deficit[q] -= airtime;
-		if (an->airtime_deficit[q] <= 0)
-			__ath_tx_queue_tid(sc, tid);
 		spin_unlock_bh(&acq->lock);
 	}
 	ath_debug_airtime(sc, an, 0, airtime);
@@ -770,8 +709,6 @@ static void ath_tx_process_buffer(struct ath_softc *sc, struct ath_txq *txq,
 	} else
 		ath_tx_complete_aggr(sc, txq, bf, bf_head, sta, tid, ts, txok);
 
-	if (!flush)
-		ath_txq_schedule(sc, txq);
 }
 
 static bool ath_lookup_legacy(struct ath_buf *bf)
@@ -1506,8 +1443,8 @@ ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
 	} while (1);
 }
 
-static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
-			      struct ath_atx_tid *tid)
+static int ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
+			     struct ath_atx_tid *tid)
 {
 	struct ath_buf *bf;
 	struct ieee80211_tx_info *tx_info;
@@ -1515,21 +1452,18 @@ static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 	int aggr_len = 0;
 	bool aggr;
 
-	if (!ath_tid_has_buffered(tid))
-		return false;
-
 	INIT_LIST_HEAD(&bf_q);
 
 	bf = ath_tx_get_tid_subframe(sc, txq, tid);
 	if (!bf)
-		return false;
+		return -ENOENT;
 
 	tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
 	aggr = !!(tx_info->flags & IEEE80211_TX_CTL_AMPDU);
 	if ((aggr && txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH) ||
 	    (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
 		__skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
-		return false;
+		return -ENOBUFS;
 	}
 
 	ath_set_rates(tid->an->vif, tid->an->sta, bf);
@@ -1539,7 +1473,7 @@ static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 		ath_tx_form_burst(sc, txq, tid, &bf_q, bf);
 
 	if (list_empty(&bf_q))
-		return false;
+		return -ENOENT;
 
 	if (tid->clear_ps_filter || tid->an->no_ps_filter) {
 		tid->clear_ps_filter = false;
@@ -1548,7 +1482,7 @@ static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 
 	ath_tx_fill_desc(sc, bf, txq, aggr_len);
 	ath_tx_txqaddbuf(sc, txq, &bf_q, false);
-	return true;
+	return 0;
 }
 
 int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
@@ -1611,52 +1545,49 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	struct ath_atx_tid *tid;
-	struct ath_txq *txq;
+	struct ieee80211_txq *queue;
 	int tidno;
 
 	ath_dbg(common, XMIT, "%s called\n", __func__);
 
 	for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
 		tid = ath_node_to_tid(an, tidno);
-		txq = tid->txq;
-
-		ath_txq_lock(sc, txq);
-
-		if (list_empty(&tid->list)) {
-			ath_txq_unlock(sc, txq);
-			continue;
-		}
+		queue = container_of((void *)tid,
+				     struct ieee80211_txq, drv_priv);
 
 		if (!skb_queue_empty(&tid->retry_q))
 			ieee80211_sta_set_buffered(sta, tid->tidno, true);
 
-		list_del_init(&tid->list);
-
-		ath_txq_unlock(sc, txq);
 	}
 }
 
 void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an)
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	struct ieee80211_txq *queue;
 	struct ath_atx_tid *tid;
 	struct ath_txq *txq;
 	int tidno;
+	bool sched, wake = false;
 
 	ath_dbg(common, XMIT, "%s called\n", __func__);
 
 	for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
 		tid = ath_node_to_tid(an, tidno);
 		txq = tid->txq;
+		queue = container_of((void *)tid,
+				     struct ieee80211_txq, drv_priv);
 
 		ath_txq_lock(sc, txq);
 		tid->clear_ps_filter = true;
-		if (ath_tid_has_buffered(tid)) {
-			ath_tx_queue_tid(sc, tid);
-			ath_txq_schedule(sc, txq);
-		}
-		ath_txq_unlock_complete(sc, txq);
+		sched = !skb_queue_empty(&tid->retry_q);
+		ath_txq_unlock(sc, txq);
+
+		if (sched && ieee80211_schedule_txq(sc->hw, queue))
+			wake = true;
 	}
+	if (wake)
+		ath_txq_schedule(sc);
 }
 
 void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
@@ -1948,86 +1879,44 @@ void ath_tx_cleanupq(struct ath_softc *sc, struct ath_txq *txq)
 /* For each acq entry, for each tid, try to schedule packets
  * for transmit until ampdu_depth has reached min Q depth.
  */
-void ath_txq_schedule(struct ath_softc *sc, struct ath_txq *txq)
+void ath_txq_schedule(struct ath_softc *sc)
 {
+	struct ieee80211_hw *hw = sc->hw;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	struct ieee80211_txq *queue;
 	struct ath_atx_tid *tid;
-	struct list_head *tid_list;
-	struct ath_acq *acq;
-	bool active = AIRTIME_ACTIVE(sc->airtime_flags);
+	struct ath_txq *txq;
+	int ret = 0;
 
-	if (txq->mac80211_qnum < 0)
+	if (test_bit(ATH_OP_HW_RESET, &common->op_flags))
 		return;
 
-	if (test_bit(ATH_OP_HW_RESET, &common->op_flags))
+	queue = ieee80211_next_txq(hw);
+	if (!queue)
 		return;
 
-	spin_lock_bh(&sc->chan_lock);
-	rcu_read_lock();
-	acq = &sc->cur_chan->acq[txq->mac80211_qnum];
+	tid = (struct ath_atx_tid *)queue->drv_priv;
+	txq = tid->txq;
 
-	if (sc->cur_chan->stopped)
+	ath_txq_lock(sc, txq);
+	if (txq->mac80211_qnum < 0)
 		goto out;
 
-begin:
-	tid_list = &acq->acq_new;
-	if (list_empty(tid_list)) {
-		tid_list = &acq->acq_old;
-		if (list_empty(tid_list))
-			goto out;
-	}
-	tid = list_first_entry(tid_list, struct ath_atx_tid, list);
-
-	if (active && tid->an->airtime_deficit[txq->mac80211_qnum] <= 0) {
-		spin_lock_bh(&acq->lock);
-		tid->an->airtime_deficit[txq->mac80211_qnum] += ATH_AIRTIME_QUANTUM;
-		list_move_tail(&tid->list, &acq->acq_old);
-		spin_unlock_bh(&acq->lock);
-		goto begin;
-	}
-
-	if (!ath_tid_has_buffered(tid)) {
-		spin_lock_bh(&acq->lock);
-		if ((tid_list == &acq->acq_new) && !list_empty(&acq->acq_old))
-			list_move_tail(&tid->list, &acq->acq_old);
-		else {
-			list_del_init(&tid->list);
-		}
-		spin_unlock_bh(&acq->lock);
-		goto begin;
-	}
-
+	spin_lock_bh(&sc->chan_lock);
+	rcu_read_lock();
 
-	/*
-	 * If we succeed in scheduling something, immediately restart to make
-	 * sure we keep the HW busy.
-	 */
-	if(ath_tx_sched_aggr(sc, txq, tid)) {
-		if (!active) {
-			spin_lock_bh(&acq->lock);
-			list_move_tail(&tid->list, &acq->acq_old);
-			spin_unlock_bh(&acq->lock);
-		}
-		goto begin;
-	}
+	if (!sc->cur_chan->stopped)
+		ret = ath_tx_sched_aggr(sc, txq, tid);
 
-out:
 	rcu_read_unlock();
 	spin_unlock_bh(&sc->chan_lock);
-}
 
-void ath_txq_schedule_all(struct ath_softc *sc)
-{
-	struct ath_txq *txq;
-	int i;
+out:
 
-	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
-		txq = sc->tx.txq_map[i];
+	if (ret != -ENOENT)
+		ieee80211_schedule_txq(hw, queue);
 
-		spin_lock_bh(&txq->axq_lock);
-		ath_txq_schedule(sc, txq);
-		spin_unlock_bh(&txq->axq_lock);
-	}
+	ath_txq_unlock(sc, txq);
 }
 
 /***********/
@@ -2645,7 +2534,6 @@ static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
 
 		if (list_empty(&txq->axq_q)) {
 			txq->axq_link = NULL;
-			ath_txq_schedule(sc, txq);
 			break;
 		}
 		bf = list_first_entry(&txq->axq_q, struct ath_buf, list);
@@ -2697,6 +2585,7 @@ static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
 		ath_tx_process_buffer(sc, txq, &ts, bf, &bf_head);
 	}
 	ath_txq_unlock_complete(sc, txq);
+	ath_txq_schedule(sc);
 }
 
 void ath_tx_tasklet(struct ath_softc *sc)
@@ -2711,6 +2600,7 @@ void ath_tx_tasklet(struct ath_softc *sc)
 			ath_tx_processq(sc, &sc->tx.txq[i]);
 	}
 	rcu_read_unlock();
+	ath_txq_schedule(sc);
 }
 
 void ath_tx_edma_tasklet(struct ath_softc *sc)
@@ -2796,6 +2686,7 @@ void ath_tx_edma_tasklet(struct ath_softc *sc)
 		ath_txq_unlock_complete(sc, txq);
 	}
 	rcu_read_unlock();
+	ath_txq_schedule(sc);
 }
 
 /*****************/
@@ -2875,7 +2766,6 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
 		tid->baw_head  = tid->baw_tail = 0;
 		tid->active	   = false;
 		tid->clear_ps_filter = true;
-		tid->has_queued  = false;
 		__skb_queue_head_init(&tid->retry_q);
 		INIT_LIST_HEAD(&tid->list);
 		acno = TID_TO_WME_AC(tidno);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 906e90223066..45155803c875 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -105,9 +105,12 @@
  * The driver is expected to initialize its private per-queue data for stations
  * and interfaces in the .add_interface and .sta_add ops.
  *
- * The driver can't access the queue directly. To dequeue a frame, it calls
- * ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a queue, it
- * calls the .wake_tx_queue driver op.
+ * The driver can't access the queue directly. To obtain the next queue to pull
+ * frames from, the driver calls ieee80211_next_txq(). To dequeue a frame from a
+ * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a
+ * queue, it calls the .wake_tx_queue driver op. The driver is expected to
+ * re-schedule the txq using ieee80211_schedule_txq() if it is still active
+ * after the driver has finished pulling packets from it.
  *
  * For AP powersave TIM handling, the driver only needs to indicate if it has
  * buffered packets in the driver specific data structures by calling
@@ -3731,8 +3734,7 @@ struct ieee80211_ops {
 					 struct ieee80211_vif *vif,
 					 struct ieee80211_tdls_ch_sw_params *params);
 
-	void (*wake_tx_queue)(struct ieee80211_hw *hw,
-			      struct ieee80211_txq *txq);
+	void (*wake_tx_queue)(struct ieee80211_hw *hw);
 	void (*sync_rx_queues)(struct ieee80211_hw *hw);
 
 	int (*start_nan)(struct ieee80211_hw *hw,
@@ -5883,13 +5885,36 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);
  * ieee80211_tx_dequeue - dequeue a packet from a software tx queue
  *
  * @hw: pointer as obtained from ieee80211_alloc_hw()
- * @txq: pointer obtained from station or virtual interface
+ * @txq: pointer obtained from ieee80211_next_txq()
  *
  * Returns the skb if successful, %NULL if no frame was available.
  */
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 				     struct ieee80211_txq *txq);
 
+/**
+ * ieee80211_schedule_txq - add txq to scheduling loop
+ *
+ * @hw: pointer as obtained from ieee80211_alloc_hw()
+ * @txq: pointer obtained from station or virtual interface
+ *
+ * Returns %true if the txq was added to the scheduling list, %false if it
+ * was already on it.
+ */
+bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
+			    struct ieee80211_txq *txq);
+
+/**
+ * ieee80211_next_txq - get next tx queue to pull packets from
+ *
+ * @hw: pointer as obtained from ieee80211_alloc_hw()
+ *
+ * Returns the next txq if successful, %NULL if no queue is eligible. If a txq
+ * is returned, it will have been removed from the scheduler queue and needs to
+ * be re-scheduled with ieee80211_schedule_txq() to continue to be active.
+ */
+struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw);
+
 /**
  * ieee80211_txq_get_depth - get pending frame/byte count of given txq
  *
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 595c662a61e8..6c6cad98ce92 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -226,9 +226,13 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable)
 		clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags);
 
 	clear_bit(IEEE80211_TXQ_STOP, &txqi->flags);
+
+	if (!ieee80211_schedule_txq(&sta->sdata->local->hw, txq))
+		return;
+
 	local_bh_disable();
 	rcu_read_lock();
-	drv_wake_tx_queue(sta->sdata->local, txqi);
+	drv_wake_tx_queue(sta->sdata->local);
 	rcu_read_unlock();
 	local_bh_enable();
 }
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index c7f93fd9ca7a..cdd76306cb8f 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1158,16 +1158,10 @@ drv_tdls_recv_channel_switch(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
-static inline void drv_wake_tx_queue(struct ieee80211_local *local,
-				     struct txq_info *txq)
+static inline void drv_wake_tx_queue(struct ieee80211_local *local)
 {
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif);
-
-	if (!check_sdata_in_driver(sdata))
-		return;
-
-	trace_drv_wake_tx_queue(local, sdata, txq);
-	local->ops->wake_tx_queue(&local->hw, &txq->txq);
+	trace_drv_wake_tx_queue(local);
+	local->ops->wake_tx_queue(&local->hw);
 }
 
 static inline int drv_start_nan(struct ieee80211_local *local,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 26900025de2f..4155838c7bef 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -832,6 +832,7 @@ struct txq_info {
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
 	struct sk_buff_head frags;
+	struct list_head schedule_order;
 	unsigned long flags;
 
 	/* keep last! */
@@ -1122,6 +1123,10 @@ struct ieee80211_local {
 	struct codel_vars *cvars;
 	struct codel_params cparams;
 
+	/* protects active_txqs and txqi->schedule_order */
+	spinlock_t active_txq_lock;
+	struct list_head active_txqs;
+
 	const struct ieee80211_ops *ops;
 
 	/*
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 0785d04a80bc..935d6e2491b1 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -619,6 +619,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	spin_lock_init(&local->rx_path_lock);
 	spin_lock_init(&local->queue_stop_reason_lock);
 
+	INIT_LIST_HEAD(&local->active_txqs);
+	spin_lock_init(&local->active_txq_lock);
+
 	INIT_LIST_HEAD(&local->chanctx_list);
 	mutex_init(&local->chanctx_mtx);
 
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 0c5627f8a104..e0bcf16df494 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1237,12 +1237,17 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 		drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
 
 	if (sta->sta.txq[0]) {
+		bool wake = false;
+
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
 			if (!txq_has_queue(sta->sta.txq[i]))
 				continue;
 
-			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
+			if (ieee80211_schedule_txq(&local->hw, sta->sta.txq[i]))
+				wake = true;
 		}
+		if (wake)
+			drv_wake_tx_queue(local);
 	}
 
 	skb_queue_head_init(&pending);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 591ad02e1fa4..08eaad85942e 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2550,35 +2550,9 @@ TRACE_EVENT(drv_tdls_recv_channel_switch,
 	)
 );
 
-TRACE_EVENT(drv_wake_tx_queue,
-	TP_PROTO(struct ieee80211_local *local,
-		 struct ieee80211_sub_if_data *sdata,
-		 struct txq_info *txq),
-
-	TP_ARGS(local, sdata, txq),
-
-	TP_STRUCT__entry(
-		LOCAL_ENTRY
-		VIF_ENTRY
-		STA_ENTRY
-		__field(u8, ac)
-		__field(u8, tid)
-	),
-
-	TP_fast_assign(
-		struct ieee80211_sta *sta = txq->txq.sta;
-
-		LOCAL_ASSIGN;
-		VIF_ASSIGN;
-		STA_ASSIGN;
-		__entry->ac = txq->txq.ac;
-		__entry->tid = txq->txq.tid;
-	),
-
-	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ac:%d tid:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid
-	)
+DEFINE_EVENT(local_only_evt, drv_wake_tx_queue,
+	     TP_PROTO(struct ieee80211_local *local),
+	     TP_ARGS(local)
 );
 
 #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 25904af38839..842881ca8f20 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1439,6 +1439,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
 	__skb_queue_head_init(&txqi->frags);
+	INIT_LIST_HEAD(&txqi->schedule_order);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1462,6 +1463,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
 	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
+	list_del_init(&txqi->schedule_order);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1558,7 +1560,8 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
 	ieee80211_txq_enqueue(local, txqi, skb);
 	spin_unlock_bh(&fq->lock);
 
-	drv_wake_tx_queue(local, txqi);
+	if (ieee80211_schedule_txq(&local->hw, &txqi->txq))
+		drv_wake_tx_queue(local);
 
 	return true;
 }
@@ -3553,6 +3556,50 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
+			    struct ieee80211_txq *txq)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct txq_info *txqi = to_txq_info(txq);
+	bool ret = false;
+
+	spin_lock_bh(&local->active_txq_lock);
+
+	if (list_empty(&txqi->schedule_order)) {
+		list_add_tail(&txqi->schedule_order, &local->active_txqs);
+		ret = true;
+	}
+
+	spin_unlock_bh(&local->active_txq_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(ieee80211_schedule_txq);
+
+struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct txq_info *txqi = NULL;
+
+	spin_lock_bh(&local->active_txq_lock);
+
+	if (list_empty(&local->active_txqs))
+		goto out;
+
+	txqi = list_first_entry(&local->active_txqs,
+				struct txq_info, schedule_order);
+	list_del_init(&txqi->schedule_order);
+
+out:
+	spin_unlock_bh(&local->active_txq_lock);
+
+	if (!txqi)
+		return NULL;
+
+	return &txqi->txq;
+}
+EXPORT_SYMBOL(ieee80211_next_txq);
+
 void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 				  struct net_device *dev,
 				  u32 info_flags)
-- 
cgit v1.2.3


From b0d52ad821843a6c5badebd80feef9f871904fa6 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 31 Oct 2017 12:27:46 +0100
Subject: mac80211: Add airtime account and scheduling to TXQs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds airtime accounting and scheduling to the mac80211 TXQ
scheduler. A new hardware flag, AIRTIME_ACCOUNTING, is added that
drivers can set if they support reporting airtime usage of
transmissions. When this flag is set, mac80211 will expect the actual
airtime usage to be reported in the tx_time and rx_time fields of the
respective status structs.

When airtime information is present, mac80211 will schedule TXQs
(through ieee80211_next_txq()) in a way that enforces airtime fairness
between active stations. This scheduling works the same way as the ath9k
in-driver airtime fairness scheduling.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 24 ++++++++++++++++++++++++
 net/mac80211/debugfs.c     |  1 +
 net/mac80211/debugfs_sta.c | 29 +++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  8 ++++++--
 net/mac80211/main.c        |  3 ++-
 net/mac80211/rx.c          |  8 ++++++++
 net/mac80211/sta_info.c    |  2 ++
 net/mac80211/sta_info.h    |  7 +++++++
 net/mac80211/status.c      | 16 ++++++++++++++++
 net/mac80211/tx.c          | 31 ++++++++++++++++++++++++++-----
 10 files changed, 121 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 45155803c875..531b526a10db 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1188,6 +1188,8 @@ enum mac80211_rx_encoding {
  *	HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT)
  * @nss: number of streams (VHT and HE only)
  * @flag: %RX_FLAG_\*
+ * @airtime: Duration of frame in usec. See @IEEE80211_HW_AIRTIME_ACCOUNTING for
+ *       how to use this.
  * @encoding: &enum mac80211_rx_encoding
  * @bw: &enum rate_info_bw
  * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags
@@ -1202,6 +1204,7 @@ struct ieee80211_rx_status {
 	u32 device_timestamp;
 	u32 ampdu_reference;
 	u32 flag;
+	u16 airtime;
 	u16 freq;
 	u8 enc_flags;
 	u8 encoding:2, bw:3;
@@ -2066,6 +2069,26 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
  *	TDLS links.
  *
+ * @IEEE80211_HW_AIRTIME_ACCOUNTING: Hardware supports accounting the airtime
+ *      usage of other stations and reports it in the @tx_time and/or @airtime
+ *      fields of the TX/RX status structs.
+ *      When setting this flag, the driver should ensure that the respective
+ *      fields in the TX and RX status structs are always either zero or
+ *      contain a valid duration for the frame in usec. The driver can choose
+ *      to report either or both of TX and RX airtime, but it is recommended to
+ *      report both.
+ *      The reported airtime should as a minimum include all time that is spent
+ *      transmitting to the remote station, including overhead and padding, but
+ *      not including time spent waiting for a TXOP. If the time is not reported
+ *      by the hardware it can in some cases be calculated from the rate and
+ *      known frame composition. When possible, the time should include any
+ *      failed transmission attempts.
+ *      For aggregated frames, there are two possible strategies to report the
+ *      airtime: Either include the airtime of the entire aggregate in the first
+ *      (or last) frame and leave the others at zero. Alternatively, include the
+ *      overhead of the full aggregate in the first or last frame and report the
+ *      time of each frame + padding not including the full aggregate overhead.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2109,6 +2132,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_REPORTS_LOW_ACK,
 	IEEE80211_HW_SUPPORTS_TX_FRAG,
 	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
+	IEEE80211_HW_AIRTIME_ACCOUNTING,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 1f466d12a6bc..d6b87a4ec3e9 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -212,6 +212,7 @@ static const char *hw_flag_names[] = {
 	FLAG(REPORTS_LOW_ACK),
 	FLAG(SUPPORTS_TX_FRAG),
 	FLAG(SUPPORTS_TDLS_BUFFER_STA),
+	FLAG(AIRTIME_ACCOUNTING),
 #undef FLAG
 };
 
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index b15412c21ac9..40dba446836f 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -188,6 +188,32 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 }
 STA_OPS(aqm);
 
+static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{
+	struct sta_info *sta = file->private_data;
+	size_t bufsz = 200;
+	char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+	ssize_t rv;
+
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock_bh(&sta->lock);
+
+	p += scnprintf(p, bufsz + buf - p,
+		"RX: %llu us\nTX: %llu us\nDeficit: %lld us\n",
+		sta->airtime_stats.rx_airtime,
+		sta->airtime_stats.tx_airtime,
+		sta->airtime_deficit);
+
+	spin_unlock_bh(&sta->lock);
+	rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+	kfree(buf);
+	return rv;
+}
+STA_OPS(airtime);
+
 static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
 					size_t count, loff_t *ppos)
 {
@@ -542,6 +568,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	if (local->ops->wake_tx_queue)
 		DEBUGFS_ADD(aqm);
 
+	if (ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING))
+		DEBUGFS_ADD(airtime);
+
 	if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
 		debugfs_create_x32("driver_buffered_tids", 0400,
 				   sta->debugfs_dir,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4155838c7bef..120c516851cf 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -90,6 +90,9 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
 
 #define IEEE80211_MAX_NAN_INSTANCE_ID 255
 
+/* How much to increase airtime deficit on each scheduling round */
+#define IEEE80211_AIRTIME_QUANTUM        1000 /* usec */
+
 struct ieee80211_fragment_entry {
 	struct sk_buff_head skb_list;
 	unsigned long first_frag_time;
@@ -1123,9 +1126,10 @@ struct ieee80211_local {
 	struct codel_vars *cvars;
 	struct codel_params cparams;
 
-	/* protects active_txqs and txqi->schedule_order */
+	/* protects active_txqs_{new,old} and txqi->schedule_order */
 	spinlock_t active_txq_lock;
-	struct list_head active_txqs;
+	struct list_head active_txqs_new;
+	struct list_head active_txqs_old;
 
 	const struct ieee80211_ops *ops;
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 935d6e2491b1..b7142f8491d0 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -619,7 +619,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	spin_lock_init(&local->rx_path_lock);
 	spin_lock_init(&local->queue_stop_reason_lock);
 
-	INIT_LIST_HEAD(&local->active_txqs);
+	INIT_LIST_HEAD(&local->active_txqs_new);
+	INIT_LIST_HEAD(&local->active_txqs_old);
 	spin_lock_init(&local->active_txq_lock);
 
 	INIT_LIST_HEAD(&local->chanctx_list);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b3cff69bfd66..808f41fb536a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1630,6 +1630,14 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 	if (ieee80211_vif_is_mesh(&rx->sdata->vif))
 		ieee80211_mps_rx_h_sta_process(sta, hdr);
 
+	/* airtime accounting */
+	if (status->airtime) {
+		spin_lock_bh(&sta->lock);
+		sta->airtime_stats.rx_airtime += status->airtime;
+		sta->airtime_deficit -= status->airtime;
+		spin_unlock_bh(&sta->lock);
+	}
+
 	/*
 	 * Drop (qos-)data::nullfunc frames silently, since they
 	 * are used only to control station power saving mode.
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index e0bcf16df494..ed5500e8aafb 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -425,6 +425,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	sta->cparams.interval = MS2TIME(100);
 	sta->cparams.ecn = true;
 
+	sta->airtime_deficit = IEEE80211_AIRTIME_QUANTUM;
+
 	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
 
 	return sta;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index cd53619435b6..e356f2f85e12 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -559,6 +559,13 @@ struct sta_info {
 	} tx_stats;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
 
+	/* Airtime stats and deficit, protected by lock */
+	struct {
+		u64 rx_airtime;
+		u64 tx_airtime;
+	} airtime_stats;
+	s64 airtime_deficit;
+
 	/*
 	 * Aggregation information, locked with lock.
 	 */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index da7427a41529..b044dbed2bb1 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -823,6 +823,14 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 				ieee80211_lost_packet(sta, info);
 			}
 		}
+
+		if (info->status.tx_time &&
+		    ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING)) {
+			spin_lock_bh(&sta->lock);
+			sta->airtime_stats.tx_airtime += info->status.tx_time;
+			sta->airtime_deficit -= info->status.tx_time;
+			spin_unlock_bh(&sta->lock);
+		}
 	}
 
 	/* SNMP counters
@@ -947,6 +955,14 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 			sta->status_stats.retry_failed++;
 		sta->status_stats.retry_count += retry_count;
 
+		if (info->status.tx_time &&
+		    ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING)) {
+			spin_lock_bh(&sta->lock);
+			sta->airtime_stats.tx_airtime += info->status.tx_time;
+			sta->airtime_deficit -= info->status.tx_time;
+			spin_unlock_bh(&sta->lock);
+		}
+
 		if (acked) {
 			sta->status_stats.last_ack = jiffies;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 842881ca8f20..18381581b5e9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3566,7 +3566,7 @@ bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
 	spin_lock_bh(&local->active_txq_lock);
 
 	if (list_empty(&txqi->schedule_order)) {
-		list_add_tail(&txqi->schedule_order, &local->active_txqs);
+		list_add_tail(&txqi->schedule_order, &local->active_txqs_new);
 		ret = true;
 	}
 
@@ -3580,14 +3580,35 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = NULL;
+	struct list_head *head;
 
 	spin_lock_bh(&local->active_txq_lock);
 
-	if (list_empty(&local->active_txqs))
-		goto out;
+begin:
+	head = &local->active_txqs_new;
+	if (list_empty(head)) {
+		head = &local->active_txqs_old;
+		if (list_empty(head))
+			goto out;
+	}
+
+	txqi = list_first_entry(head, struct txq_info, schedule_order);
+
+	if (txqi->txq.sta) {
+		struct sta_info *sta = container_of(txqi->txq.sta,
+						struct sta_info, sta);
+
+		spin_lock_bh(&sta->lock);
+		if (sta->airtime_deficit < 0) {
+			sta->airtime_deficit += IEEE80211_AIRTIME_QUANTUM;
+			list_move_tail(&txqi->schedule_order,
+				       &local->active_txqs_old);
+			spin_unlock_bh(&sta->lock);
+			goto begin;
+		}
+		spin_unlock_bh(&sta->lock);
+	}
 
-	txqi = list_first_entry(&local->active_txqs,
-				struct txq_info, schedule_order);
 	list_del_init(&txqi->schedule_order);
 
 out:
-- 
cgit v1.2.3


From a0b586fa75a69578ecf10b40582eed9b35de2432 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 8 Dec 2017 15:34:13 -0800
Subject: rtnetlink: fix typo in GSO max segments

Fixes: 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 412ebf0b09c6..c688dc564b11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2684,7 +2684,7 @@ struct net_device *rtnl_create_link(struct net *net,
 	if (tb[IFLA_GSO_MAX_SIZE])
 		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
 	if (tb[IFLA_GSO_MAX_SEGS])
-		dev->gso_max_size = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+		dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
 	return dev;
 }
-- 
cgit v1.2.3


From 97a6ec4ac021f7fbec05c15a3aa0c4aaf0461af5 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Mon, 4 Dec 2017 10:31:41 -0800
Subject: rhashtable: Change rhashtable_walk_start to return void

Most callers of rhashtable_walk_start don't care about a resize event
which is indicated by a return value of -EAGAIN. So calls to
rhashtable_walk_start are wrapped with code to ignore -EAGAIN. Something
like this is common:

       ret = rhashtable_walk_start(rhiter);
       if (ret && ret != -EAGAIN)
               goto out;

Since zero and -EAGAIN are the only possible return values from the
function this check is pointless. The condition never evaluates to true.

This patch changes rhashtable_walk_start to return void. This simplifies
code for the callers that ignore -EAGAIN. For the few cases where the
caller cares about the resize event, particularly where the table can be
walked in multiple parts for netlink or seq file dump, the function
rhashtable_walk_start_check has been added that returns -EAGAIN on a
resize event.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c       |  6 +---
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   |  7 ++---
 fs/gfs2/glock.c                                    |  7 ++---
 include/linux/rhashtable.h                         |  8 ++++-
 include/net/sctp/sctp.h                            |  2 +-
 lib/rhashtable.c                                   | 10 +++++--
 lib/test_rhashtable.c                              |  6 +---
 net/ipv6/ila/ila_xlat.c                            |  4 +--
 net/ipv6/seg6.c                                    |  4 +--
 net/mac80211/mesh_pathtbl.c                        | 34 +++++++---------------
 net/netfilter/nft_set_hash.c                       | 10 ++-----
 net/netlink/af_netlink.c                           |  5 ++--
 net/netlink/diag.c                                 |  8 ++---
 net/sctp/proc.c                                    |  6 +---
 net/sctp/socket.c                                  | 19 +++---------
 net/tipc/socket.c                                  |  6 ++--
 16 files changed, 48 insertions(+), 94 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 9807214da206..2ae5ed151369 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -1412,11 +1412,7 @@ bnxt_tc_flow_stats_batch_prep(struct bnxt *bp,
 	void *flow_node;
 	int rc, i;
 
-	rc = rhashtable_walk_start(iter);
-	if (rc && rc != -EAGAIN) {
-		i = 0;
-		goto done;
-	}
+	rhashtable_walk_start(iter);
 
 	rc = 0;
 	for (i = 0; i < BNXT_FLOW_STATS_BATCH_MAX; i++) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index a12b894f135d..9b9f3f99b39d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -763,9 +763,7 @@ static void ch_flower_stats_handler(struct work_struct *work)
 
 	rhashtable_walk_enter(&adap->flower_tbl, &iter);
 	do {
-		flower_entry = ERR_PTR(rhashtable_walk_start(&iter));
-		if (IS_ERR(flower_entry))
-			goto walk_stop;
+		rhashtable_walk_start(&iter);
 
 		while ((flower_entry = rhashtable_walk_next(&iter)) &&
 		       !IS_ERR(flower_entry)) {
@@ -784,8 +782,9 @@ static void ch_flower_stats_handler(struct work_struct *work)
 				spin_unlock(&flower_entry->lock);
 			}
 		}
-walk_stop:
+
 		rhashtable_walk_stop(&iter);
+
 	} while (flower_entry == ERR_PTR(-EAGAIN));
 	rhashtable_walk_exit(&iter);
 	mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 11066d8647d2..90af87ff29ba 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1549,16 +1549,13 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 	rhashtable_walk_enter(&gl_hash_table, &iter);
 
 	do {
-		gl = ERR_PTR(rhashtable_walk_start(&iter));
-		if (IS_ERR(gl))
-			goto walk_stop;
+		rhashtable_walk_start(&iter);
 
 		while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
 			if (gl->gl_name.ln_sbd == sdp &&
 			    lockref_get_not_dead(&gl->gl_lockref))
 				examiner(gl);
 
-walk_stop:
 		rhashtable_walk_stop(&iter);
 	} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
 
@@ -1947,7 +1944,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 	loff_t n = *pos;
 
 	rhashtable_walk_enter(&gl_hash_table, &gi->hti);
-	if (rhashtable_walk_start(&gi->hti) != 0)
+	if (rhashtable_walk_start_check(&gi->hti) != 0)
 		return NULL;
 
 	do {
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 361c08e35dbc..13ccc483738d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -378,7 +378,13 @@ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 void rhashtable_walk_enter(struct rhashtable *ht,
 			   struct rhashtable_iter *iter);
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
-int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
+
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+	(void)rhashtable_walk_start_check(iter);
+}
+
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 906a9c0efa71..6f79415f6634 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,7 +116,7 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
-int sctp_transport_walk_start(struct rhashtable_iter *iter);
+void sctp_transport_walk_start(struct rhashtable_iter *iter);
 void sctp_transport_walk_stop(struct rhashtable_iter *iter);
 struct sctp_transport *sctp_transport_get_next(struct net *net,
 			struct rhashtable_iter *iter);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ddd7dde87c3c..1935e86ed477 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -732,7 +732,7 @@ void rhashtable_walk_exit(struct rhashtable_iter *iter)
 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 
 /**
- * rhashtable_walk_start - Start a hash table walk
+ * rhashtable_walk_start_check - Start a hash table walk
  * @iter:	Hash table iterator
  *
  * Start a hash table walk at the current iterator position.  Note that we take
@@ -744,8 +744,12 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
  * Returns -EAGAIN if resize event occured.  Note that the iterator
  * will rewind back to the beginning and you may use it immediately
  * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
  */
-int rhashtable_walk_start(struct rhashtable_iter *iter)
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 	__acquires(RCU)
 {
 	struct rhashtable *ht = iter->ht;
@@ -764,7 +768,7 @@ int rhashtable_walk_start(struct rhashtable_iter *iter)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(rhashtable_walk_start);
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
 
 /**
  * rhashtable_walk_next - Return the next object and advance the iterator
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 8e83cbdc049c..76d3667fdea2 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -162,11 +162,7 @@ static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 		return;
 	}
 
-	err = rhashtable_walk_start(&hti);
-	if (err && err != -EAGAIN) {
-		pr_warn("Test failed: iterator failed: %d\n", err);
-		return;
-	}
+	rhashtable_walk_start(&hti);
 
 	while ((pos = rhashtable_walk_next(&hti))) {
 		if (PTR_ERR(pos) == -EAGAIN) {
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 6eb5e68f112a..44c39c5f0638 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct ila_map *ila;
 	int ret;
 
-	ret = rhashtable_walk_start(rhiter);
-	if (ret && ret != -EAGAIN)
-		goto done;
+	rhashtable_walk_start(rhiter);
 
 	for (;;) {
 		ila = rhashtable_walk_next(rhiter);
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index c81407770956..7f5621d09571 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
 	struct seg6_hmac_info *hinfo;
 	int ret;
 
-	ret = rhashtable_walk_start(iter);
-	if (ret && ret != -EAGAIN)
-		goto done;
+	rhashtable_walk_start(iter);
 
 	for (;;) {
 		hinfo = rhashtable_walk_next(iter);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 86c8dfef56a4..a5125624a76d 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -257,9 +257,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
 	if (ret)
 		return NULL;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto err;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -269,7 +267,6 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
 		if (i++ == idx)
 			break;
 	}
-err:
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 
@@ -513,9 +510,7 @@ void mesh_plink_broken(struct sta_info *sta)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -535,7 +530,6 @@ void mesh_plink_broken(struct sta_info *sta)
 				WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
 		}
 	}
-out:
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -584,9 +578,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -597,7 +589,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 		if (rcu_access_pointer(mpath->next_hop) == sta)
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -614,9 +606,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -627,7 +617,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
 		if (ether_addr_equal(mpath->mpp, proxy))
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -642,9 +632,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -653,7 +641,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
 			break;
 		__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -873,9 +861,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -887,7 +873,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
 		     time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index f8166c1d5430..3f1624ee056f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -251,11 +251,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 	if (err)
 		return;
 
-	err = rhashtable_walk_start(&hti);
-	if (err && err != -EAGAIN) {
-		iter->err = err;
-		goto out;
-	}
+	rhashtable_walk_start(&hti);
 
 	while ((he = rhashtable_walk_next(&hti))) {
 		if (IS_ERR(he)) {
@@ -306,9 +302,7 @@ static void nft_rhash_gc(struct work_struct *work)
 	if (err)
 		goto schedule;
 
-	err = rhashtable_walk_start(&hti);
-	if (err && err != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&hti);
 
 	while ((he = rhashtable_walk_next(&hti))) {
 		if (IS_ERR(he)) {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b9e0ee4e22f5..ab325d4d6fef 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2478,8 +2478,9 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
 		return err;
 	}
 
-	err = rhashtable_walk_start(&iter->hti);
-	return err == -EAGAIN ? 0 : err;
+	rhashtable_walk_start(&iter->hti);
+
+	return 0;
 }
 
 static void netlink_walk_stop(struct nl_seq_iter *iter)
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8faa20b4d457..7dda33b9b784 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -115,11 +115,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	if (!s_num)
 		rhashtable_walk_enter(&tbl->hash, hti);
 
-	ret = rhashtable_walk_start(hti);
-	if (ret == -EAGAIN)
-		ret = 0;
-	if (ret)
-		goto stop;
+	rhashtable_walk_start(hti);
 
 	while ((nlsk = rhashtable_walk_next(hti))) {
 		if (IS_ERR(nlsk)) {
@@ -146,8 +142,8 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		}
 	}
 
-stop:
 	rhashtable_walk_stop(hti);
+
 	if (ret)
 		goto done;
 
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 26b4be6b4172..4545bc2aff84 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -288,12 +288,8 @@ struct sctp_ht_iter {
 static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct sctp_ht_iter *iter = seq->private;
-	int err = sctp_transport_walk_start(&iter->hti);
 
-	if (err) {
-		iter->start_fail = 1;
-		return ERR_PTR(err);
-	}
+	sctp_transport_walk_start(&iter->hti);
 
 	iter->start_fail = 0;
 	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eb17a911aa29..3e55daa37e66 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4676,20 +4676,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
 
 /* use callback to avoid exporting the core structure */
-int sctp_transport_walk_start(struct rhashtable_iter *iter)
+void sctp_transport_walk_start(struct rhashtable_iter *iter)
 {
-	int err;
-
 	rhltable_walk_enter(&sctp_transport_hashtable, iter);
 
-	err = rhashtable_walk_start(iter);
-	if (err && err != -EAGAIN) {
-		rhashtable_walk_stop(iter);
-		rhashtable_walk_exit(iter);
-		return err;
-	}
-
-	return 0;
+	rhashtable_walk_start(iter);
 }
 
 void sctp_transport_walk_stop(struct rhashtable_iter *iter)
@@ -4780,12 +4771,10 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
 			    struct net *net, int *pos, void *p) {
 	struct rhashtable_iter hti;
 	struct sctp_transport *tsp;
-	int ret;
+	int ret = 0;
 
 again:
-	ret = sctp_transport_walk_start(&hti);
-	if (ret)
-		return ret;
+	sctp_transport_walk_start(&hti);
 
 	tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
 	for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5d18c0caa92b..22c4fd8a9dfe 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2640,9 +2640,7 @@ void tipc_sk_reinit(struct net *net)
 	rhashtable_walk_enter(&tn->sk_rht, &iter);
 
 	do {
-		tsk = ERR_PTR(rhashtable_walk_start(&iter));
-		if (IS_ERR(tsk))
-			goto walk_stop;
+		rhashtable_walk_start(&iter);
 
 		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
@@ -2651,7 +2649,7 @@ void tipc_sk_reinit(struct net *net)
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-walk_stop:
+
 		rhashtable_walk_stop(&iter);
 	} while (tsk == ERR_PTR(-EAGAIN));
 }
-- 
cgit v1.2.3


From 25e3f70fcbc2e40cfe4bc0d3f3a4e1f6d4835d49 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 6 Dec 2017 15:03:19 -0800
Subject: netlink: make netlink tap per netns

nlmon device is not supposed to capture netlink events from
other netns, so instead of filtering events, we can simply
make netlink tap itself per netns.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 66 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ab325d4d6fef..1dda94c9695a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -65,6 +65,7 @@
 #include <linux/net_namespace.h>
 
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/sock.h>
 #include <net/scm.h>
 #include <net/netlink.h>
@@ -145,8 +146,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
 
 static BLOCKING_NOTIFIER_HEAD(netlink_chain);
 
-static DEFINE_SPINLOCK(netlink_tap_lock);
-static struct list_head netlink_tap_all __read_mostly;
 
 static const struct rhashtable_params netlink_rhashtable_params;
 
@@ -173,14 +172,24 @@ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
 	return new;
 }
 
+static unsigned int netlink_tap_net_id;
+
+struct netlink_tap_net {
+	struct list_head netlink_tap_all;
+	spinlock_t netlink_tap_lock;
+};
+
 int netlink_add_tap(struct netlink_tap *nt)
 {
+	struct net *net = dev_net(nt->dev);
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
 	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
 		return -EINVAL;
 
-	spin_lock(&netlink_tap_lock);
-	list_add_rcu(&nt->list, &netlink_tap_all);
-	spin_unlock(&netlink_tap_lock);
+	spin_lock(&nn->netlink_tap_lock);
+	list_add_rcu(&nt->list, &nn->netlink_tap_all);
+	spin_unlock(&nn->netlink_tap_lock);
 
 	__module_get(nt->module);
 
@@ -190,12 +199,14 @@ EXPORT_SYMBOL_GPL(netlink_add_tap);
 
 static int __netlink_remove_tap(struct netlink_tap *nt)
 {
+	struct net *net = dev_net(nt->dev);
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
 	bool found = false;
 	struct netlink_tap *tmp;
 
-	spin_lock(&netlink_tap_lock);
+	spin_lock(&nn->netlink_tap_lock);
 
-	list_for_each_entry(tmp, &netlink_tap_all, list) {
+	list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
 		if (nt == tmp) {
 			list_del_rcu(&nt->list);
 			found = true;
@@ -205,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
 
 	pr_warn("__netlink_remove_tap: %p not found\n", nt);
 out:
-	spin_unlock(&netlink_tap_lock);
+	spin_unlock(&nn->netlink_tap_lock);
 
 	if (found)
 		module_put(nt->module);
@@ -224,6 +235,26 @@ int netlink_remove_tap(struct netlink_tap *nt)
 }
 EXPORT_SYMBOL_GPL(netlink_remove_tap);
 
+static __net_init int netlink_tap_init_net(struct net *net)
+{
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
+	INIT_LIST_HEAD(&nn->netlink_tap_all);
+	spin_lock_init(&nn->netlink_tap_lock);
+	return 0;
+}
+
+static void __net_exit netlink_tap_exit_net(struct net *net)
+{
+}
+
+static struct pernet_operations netlink_tap_net_ops = {
+	.init = netlink_tap_init_net,
+	.exit = netlink_tap_exit_net,
+	.id   = &netlink_tap_net_id,
+	.size = sizeof(struct netlink_tap_net),
+};
+
 static bool netlink_filter_tap(const struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
@@ -274,7 +305,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 	return ret;
 }
 
-static void __netlink_deliver_tap(struct sk_buff *skb)
+static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
 {
 	int ret;
 	struct netlink_tap *tmp;
@@ -282,19 +313,21 @@ static void __netlink_deliver_tap(struct sk_buff *skb)
 	if (!netlink_filter_tap(skb))
 		return;
 
-	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+	list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
 		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
 		if (unlikely(ret))
 			break;
 	}
 }
 
-static void netlink_deliver_tap(struct sk_buff *skb)
+static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
 {
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
 	rcu_read_lock();
 
-	if (unlikely(!list_empty(&netlink_tap_all)))
-		__netlink_deliver_tap(skb);
+	if (unlikely(!list_empty(&nn->netlink_tap_all)))
+		__netlink_deliver_tap(skb, nn);
 
 	rcu_read_unlock();
 }
@@ -303,7 +336,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
 				       struct sk_buff *skb)
 {
 	if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
-		netlink_deliver_tap(skb);
+		netlink_deliver_tap(sock_net(dst), skb);
 }
 
 static void netlink_overrun(struct sock *sk)
@@ -1213,7 +1246,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
 	int len = skb->len;
 
-	netlink_deliver_tap(skb);
+	netlink_deliver_tap(sock_net(sk), skb);
 
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
@@ -2731,12 +2764,11 @@ static int __init netlink_proto_init(void)
 		}
 	}
 
-	INIT_LIST_HEAD(&netlink_tap_all);
-
 	netlink_add_usersock_entry();
 
 	sock_register(&netlink_family_ops);
 	register_pernet_subsys(&netlink_net_ops);
+	register_pernet_subsys(&netlink_tap_net_ops);
 	/* The netlink device handler may be needed early. */
 	rtnetlink_init();
 out:
-- 
cgit v1.2.3


From b1042d356305cd183459dea145fd59c0f7491e00 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 6 Dec 2017 15:03:20 -0800
Subject: netlink: convert netlink tap spinlock to mutex

Both netlink_add_tap() and netlink_remove_tap() are
called in process context, so there is no need for a spinlock.

Note, in fact, currently we always hold RTNL when calling
these two functions, so we don't need any other lock at
all, but keeping this lock doesn't harm anything.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1dda94c9695a..b0fe1fb12b99 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -176,7 +176,7 @@ static unsigned int netlink_tap_net_id;
 
 struct netlink_tap_net {
 	struct list_head netlink_tap_all;
-	spinlock_t netlink_tap_lock;
+	struct mutex netlink_tap_lock;
 };
 
 int netlink_add_tap(struct netlink_tap *nt)
@@ -187,9 +187,9 @@ int netlink_add_tap(struct netlink_tap *nt)
 	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
 		return -EINVAL;
 
-	spin_lock(&nn->netlink_tap_lock);
+	mutex_lock(&nn->netlink_tap_lock);
 	list_add_rcu(&nt->list, &nn->netlink_tap_all);
-	spin_unlock(&nn->netlink_tap_lock);
+	mutex_unlock(&nn->netlink_tap_lock);
 
 	__module_get(nt->module);
 
@@ -204,7 +204,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
 	bool found = false;
 	struct netlink_tap *tmp;
 
-	spin_lock(&nn->netlink_tap_lock);
+	mutex_lock(&nn->netlink_tap_lock);
 
 	list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
 		if (nt == tmp) {
@@ -216,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
 
 	pr_warn("__netlink_remove_tap: %p not found\n", nt);
 out:
-	spin_unlock(&nn->netlink_tap_lock);
+	mutex_unlock(&nn->netlink_tap_lock);
 
 	if (found)
 		module_put(nt->module);
@@ -240,7 +240,7 @@ static __net_init int netlink_tap_init_net(struct net *net)
 	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
 
 	INIT_LIST_HEAD(&nn->netlink_tap_all);
-	spin_lock_init(&nn->netlink_tap_lock);
+	mutex_init(&nn->netlink_tap_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 772a58693fc3116d05b7969223a80a6376e639eb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:03:58 +0800
Subject: sctp: add stream interleave enable members and sockopt

This patch adds intl_enable in asoc and netns, and strm_interleave in
sctp_sock to indicate if stream interleave is enabled and supported.

netns intl_enable will be set via procfs, but that is not added until
all of the stream interleave code is completely implemented; asoc
intl_enable will be set during the 4-way handshake.

sp strm_interleave can be set by sockopt SCTP_INTERLEAVING_SUPPORTED
which is also added in this patch. This socket option is defined in
section 4.3.1 of RFC8260.

Note that strm_interleave can only be set by sockopt when both netns
intl_enable and sp frag_interleave are set.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/sctp.h   |  5 ++-
 include/net/sctp/structs.h |  2 ++
 include/uapi/linux/sctp.h  |  1 +
 net/sctp/socket.c          | 88 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 94 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index ebc813277662..0db7fb3e4e15 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -122,9 +122,12 @@ struct netns_sctp {
 	/* Flag to indicate if PR-CONFIG is enabled. */
 	int reconf_enable;
 
-	/* Flag to idicate if SCTP-AUTH is enabled */
+	/* Flag to indicate if SCTP-AUTH is enabled */
 	int auth_enable;
 
+	/* Flag to indicate if stream interleave is enabled */
+	int intl_enable;
+
 	/*
 	 * Policy to control SCTP IPv4 address scoping
 	 * 0   - Disable IPv4 address scoping
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f8f93da5dc2..7030cbe11f45 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,6 +217,7 @@ struct sctp_sock {
 		disable_fragments:1,
 		v4mapped:1,
 		frag_interleave:1,
+		strm_interleave:1,
 		recvrcvinfo:1,
 		recvnxtinfo:1,
 		data_ready_signalled:1;
@@ -1940,6 +1941,7 @@ struct sctp_association {
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
 	     force_delay:1,
+	     intl_enable:1,
 	     prsctp_enable:1,
 	     reconf_enable:1;
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index d9adab32dbee..6ed934c65a5f 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -125,6 +125,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
 #define SCTP_STREAM_SCHEDULER	123
 #define SCTP_STREAM_SCHEDULER_VALUE	124
+#define SCTP_INTERLEAVING_SUPPORTED	125
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3e55daa37e66..306c737bde87 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3350,7 +3350,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
 	if (get_user(val, (int __user *)optval))
 		return -EFAULT;
 
-	sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1;
+	sctp_sk(sk)->frag_interleave = !!val;
+
+	if (!sctp_sk(sk)->frag_interleave)
+		sctp_sk(sk)->strm_interleave = 0;
 
 	return 0;
 }
@@ -4019,6 +4022,40 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_interleaving_supported(struct sock *sk,
+						  char __user *optval,
+						  unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct sctp_assoc_value params;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_id)
+		goto out;
+
+	if (!net->sctp.intl_enable || !sp->frag_interleave) {
+		retval = -EPERM;
+		goto out;
+	}
+
+	sp->strm_interleave = !!params.assoc_value;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4206,6 +4243,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_STREAM_SCHEDULER_VALUE:
 		retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_setsockopt_interleaving_supported(sk, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6969,6 +7010,47 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
+						  char __user *optval,
+						  int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->intl_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->strm_interleave;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7159,6 +7241,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_scheduler_value(sk, len, optval,
 							 optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 96b120b3c1397c90b64d1f4b2300fb7ce4aa8a68 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:03:59 +0800
Subject: sctp: add asoc intl_enable negotiation during 4 shakehands

asoc intl_enable will be set when the local sp strm_interleave is set
and there's an I-DATA chunk in the init and init_ack extensions, as
described in section 2.2.1 of RFC8260.

asoc intl_enable indicates all data will be sent as I-DATA chunks.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  3 +++
 net/sctp/sm_make_chunk.c | 18 ++++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index da803dfc7a39..6d2bd64b6ffe 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -102,6 +102,9 @@ enum sctp_cid {
 	/* AUTH Extension Section 4.1 */
 	SCTP_CID_AUTH			= 0x0F,
 
+	/* sctp ndata 5.1. I-DATA */
+	SCTP_CID_I_DATA			= 0x40,
+
 	/* PR-SCTP Sec 3.2 */
 	SCTP_CID_FWD_TSN		= 0xC0,
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9bf575f2e8ed..da33c8550170 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	struct sctp_inithdr init;
 	union sctp_params addrs;
 	struct sctp_sock *sp;
-	__u8 extensions[4];
+	__u8 extensions[5];
 	size_t chunksize;
 	__be16 types[2];
 	int num_ext = 0;
@@ -278,6 +278,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
+	if (sp->strm_interleave) {
+		extensions[num_ext] = SCTP_CID_I_DATA;
+		num_ext += 1;
+	}
+
 	chunksize += vparam_len;
 
 	/* Account for AUTH related parameters */
@@ -392,7 +397,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	struct sctp_inithdr initack;
 	union sctp_params addrs;
 	struct sctp_sock *sp;
-	__u8 extensions[4];
+	__u8 extensions[5];
 	size_t chunksize;
 	int num_ext = 0;
 	int cookie_len;
@@ -442,6 +447,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
+	if (asoc->intl_enable) {
+		extensions[num_ext] = SCTP_CID_I_DATA;
+		num_ext += 1;
+	}
+
 	if (asoc->peer.auth_capable) {
 		auth_random = (struct sctp_paramhdr *)asoc->c.auth_random;
 		chunksize += ntohs(auth_random->length);
@@ -2032,6 +2042,10 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 			if (net->sctp.addip_enable)
 				asoc->peer.asconf_capable = 1;
 			break;
+		case SCTP_CID_I_DATA:
+			if (sctp_sk(asoc->base.sk)->strm_interleave)
+				asoc->intl_enable = 1;
+			break;
 		default:
 			break;
 		}
-- 
cgit v1.2.3


From ad05a7a05ede28ba6dd935d9e932264a22518b1f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:00 +0800
Subject: sctp: add basic structures and make chunk function for idata

sctp_idatahdr and sctp_idata_chunk are used to define and parse
I-DATA chunk format, and sctp_make_idata is a function to build
the chunk.

The I-DATA Chunk Format is defined in section 2.1 of RFC8260.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       | 17 +++++++++++++++++
 include/net/sctp/sm.h      |  2 ++
 include/net/sctp/structs.h |  1 +
 net/sctp/sm_make_chunk.c   |  6 ++++++
 4 files changed, 26 insertions(+)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 6d2bd64b6ffe..38e2cf66195f 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -243,6 +243,23 @@ struct sctp_data_chunk {
 	struct sctp_datahdr data_hdr;
 };
 
+struct sctp_idatahdr {
+	__be32 tsn;
+	__be16 stream;
+	__be16 reserved;
+	__be32 mid;
+	union {
+		__u32 ppid;
+		__be32 fsn;
+	};
+	__u8 payload[0];
+};
+
+struct sctp_idata_chunk {
+	struct sctp_chunkhdr chunk_hdr;
+	struct sctp_idatahdr data_hdr;
+};
+
 /* DATA Chuck Specific Flags */
 enum {
 	SCTP_DATA_MIDDLE_FRAG	= 0x00,
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 70fb397f65b0..5389ae034cfa 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -197,6 +197,8 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
 struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 				 const __u32 lowest_tsn,
 				 const struct sctp_chunk *chunk);
+struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
+				   __u8 flags, int paylen, gfp_t gfp);
 struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
 					    int len, const __u8 flags,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 7030cbe11f45..7026a8039367 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -575,6 +575,7 @@ struct sctp_chunk {
 		struct sctp_addiphdr *addip_hdr;
 		struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 		struct sctp_authhdr *auth_hdr;
+		struct sctp_idatahdr *idata_hdr;
 	} subh;
 
 	__u8 *chunk_end;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index da33c8550170..b969397fb773 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1425,6 +1425,12 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
 	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
 }
 
+struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
+				   __u8 flags, int paylen, gfp_t gfp)
+{
+	return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp);
+}
+
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
 					    __u8 type, __u8 flags, int paylen,
 					    gfp_t gfp)
-- 
cgit v1.2.3


From 0c3f6f655487d12c7a0c16914c98c599043e88d3 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:01 +0800
Subject: sctp: implement make_datafrag for sctp_stream_interleave

To avoid hundreds of checks for the different processing of I-DATA
chunks, struct sctp_stream_interleave is defined as a group of functions
used to replace the code in places that need to do a different job
depending on whether the asoc intl_enable is set.

With these ops, it only needs to initialize asoc->stream.si with
sctp_stream_interleave_0 for normal data if asoc intl_enable is 0,
or sctp_stream_interleave_1 for idata if asoc intl_enable is set in
sctp_stream_init.

After that, the members in asoc->stream.si can be used directly in
some special places without checking asoc intl_enable.

make_datafrag is the first member of sctp_stream_interleave; it's
used to make data or idata frags and is called in sctp_datamsg_from_user.
The old function sctp_make_datafrag_empty needs to be adjusted a bit
to fit into these ops.

Note that as idata and data chunks have different lengths, it also
defines data_chunk_len for sctp_stream_interleave to describe the
chunk size.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h                |  5 +--
 include/net/sctp/stream_interleave.h | 44 ++++++++++++++++++++
 include/net/sctp/structs.h           | 12 ++++++
 net/sctp/Makefile                    |  2 +-
 net/sctp/chunk.c                     |  6 +--
 net/sctp/sm_make_chunk.c             | 21 ++++------
 net/sctp/stream.c                    |  1 +
 net/sctp/stream_interleave.c         | 79 ++++++++++++++++++++++++++++++++++++
 8 files changed, 149 insertions(+), 21 deletions(-)
 create mode 100644 include/net/sctp/stream_interleave.h
 create mode 100644 net/sctp/stream_interleave.c

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 5389ae034cfa..f950186aae34 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -199,10 +199,9 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 				 const struct sctp_chunk *chunk);
 struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
 				   __u8 flags, int paylen, gfp_t gfp);
-struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
-					    int len, const __u8 flags,
-					    __u16 ssn, gfp_t gfp);
+					    int len, __u8 flags, gfp_t gfp);
 struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
 				  const __u32 lowest_tsn);
 struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc);
diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
new file mode 100644
index 000000000000..7b9fa8dbe620
--- /dev/null
+++ b/include/net/sctp/stream_interleave.h
@@ -0,0 +1,44 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * These are definitions used by stream interleave, defined in RFC 8260
+ * (draft ndata: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11)
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation  is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresses:
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *   Xin Long <lucien.xin@gmail.com>
+ */
+
+#ifndef __sctp_stream_interleave_h__
+#define __sctp_stream_interleave_h__
+
+struct sctp_stream_interleave {
+	__u16	data_chunk_len;
+	/* (I-)DATA process */
+	struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc,
+					    const struct sctp_sndrcvinfo *sinfo,
+					    int len, __u8 flags, gfp_t gfp);
+};
+
+void sctp_stream_interleave_init(struct sctp_stream *stream);
+
+#endif /* __sctp_stream_interleave_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 7026a8039367..96cc898787f1 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -89,6 +89,7 @@ struct sctp_stream;
 #include <net/sctp/tsnmap.h>
 #include <net/sctp/ulpevent.h>
 #include <net/sctp/ulpqueue.h>
+#include <net/sctp/stream_interleave.h>
 
 /* Structures useful for managing bind/connect. */
 
@@ -1389,11 +1390,22 @@ struct sctp_stream {
 			struct sctp_stream_out_ext *rr_next;
 		};
 	};
+	struct sctp_stream_interleave *si;
 };
 
 #define SCTP_STREAM_CLOSED		0x00
 #define SCTP_STREAM_OPEN		0x01
 
+static inline __u16 sctp_datachk_len(const struct sctp_stream *stream)
+{
+	return stream->si->data_chunk_len;
+}
+
+static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream)
+{
+	return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr);
+}
+
 /* SCTP_GET_ASSOC_STATS counters */
 struct sctp_priv_assoc_stats {
 	/* Maximum observed rto in the association during subsequent
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 1ca84a288443..54bd9c1a8aa1 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -14,7 +14,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
-	  stream_sched_rr.o
+	  stream_sched_rr.o stream_interleave.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7f8baa48e7c2..62adaaacc4ab 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -191,7 +191,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	 */
 	max_data = asoc->pathmtu -
 		   sctp_sk(asoc->base.sk)->pf->af->net_header_len -
-		   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+		   sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream);
 	max_data = SCTP_TRUNC4(max_data);
 
 	/* If the the peer requested that we authenticate DATA chunks
@@ -264,8 +264,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 				frag |= SCTP_DATA_SACK_IMM;
 		}
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
-						 0, GFP_KERNEL);
+		chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag,
+						       GFP_KERNEL);
 		if (!chunk) {
 			err = -ENOMEM;
 			goto errout;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index b969397fb773..23a7313d7972 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -721,38 +721,31 @@ nodata:
 /* Make a DATA chunk for the given association from the provided
  * parameters.  However, do not populate the data payload.
  */
-struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
-					    int data_len, __u8 flags, __u16 ssn,
-					    gfp_t gfp)
+					    int len, __u8 flags, gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	struct sctp_datahdr dp;
-	int chunk_len;
 
 	/* We assign the TSN as LATE as possible, not here when
 	 * creating the chunk.
 	 */
-	dp.tsn = 0;
+	memset(&dp, 0, sizeof(dp));
+	dp.ppid = sinfo->sinfo_ppid;
 	dp.stream = htons(sinfo->sinfo_stream);
-	dp.ppid   = sinfo->sinfo_ppid;
 
 	/* Set the flags for an unordered send.  */
-	if (sinfo->sinfo_flags & SCTP_UNORDERED) {
+	if (sinfo->sinfo_flags & SCTP_UNORDERED)
 		flags |= SCTP_DATA_UNORDERED;
-		dp.ssn = 0;
-	} else
-		dp.ssn = htons(ssn);
 
-	chunk_len = sizeof(dp) + data_len;
-	retval = sctp_make_data(asoc, flags, chunk_len, gfp);
+	retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp);
 	if (!retval)
-		goto nodata;
+		return NULL;
 
 	retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
 	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
 
-nodata:
 	return retval;
 }
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 76ea66be0bbe..8370e6cfe897 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -167,6 +167,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	sched->init(stream);
 
 in:
+	sctp_stream_interleave_init(stream);
 	if (!incnt)
 		goto out;
 
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
new file mode 100644
index 000000000000..397c3c1789b3
--- /dev/null
+++ b/net/sctp/stream_interleave.c
@@ -0,0 +1,79 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement sctp stream interleave (I-DATA) handling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <linux/sctp.h>
+
+static struct sctp_chunk *sctp_make_idatafrag_empty(
+					const struct sctp_association *asoc,
+					const struct sctp_sndrcvinfo *sinfo,
+					int len, __u8 flags, gfp_t gfp)
+{
+	struct sctp_chunk *retval;
+	struct sctp_idatahdr dp;
+
+	memset(&dp, 0, sizeof(dp));
+	dp.stream = htons(sinfo->sinfo_stream);
+
+	if (sinfo->sinfo_flags & SCTP_UNORDERED)
+		flags |= SCTP_DATA_UNORDERED;
+
+	retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp);
+	if (!retval)
+		return NULL;
+
+	retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
+	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+
+	return retval;
+}
+
+static struct sctp_stream_interleave sctp_stream_interleave_0 = {
+	.data_chunk_len		= sizeof(struct sctp_data_chunk),
+	/* DATA process functions */
+	.make_datafrag		= sctp_make_datafrag_empty,
+};
+
+static struct sctp_stream_interleave sctp_stream_interleave_1 = {
+	.data_chunk_len		= sizeof(struct sctp_idata_chunk),
+	/* I-DATA process functions */
+	.make_datafrag		= sctp_make_idatafrag_empty,
+};
+
+void sctp_stream_interleave_init(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
+				       : &sctp_stream_interleave_0;
+}
-- 
cgit v1.2.3


From 668c9beb9020d5834ee9e43c208190a07d2b1928 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:02 +0800
Subject: sctp: implement assign_number for sctp_stream_interleave

assign_number is added as a member of sctp_stream_interleave, used
to assign ssn for data or mid (message id) for idata, called in
sctp_packet_append_data. sctp_chunk_assign_ssn is left as it is,
and sctp_chunk_assign_mid is added for sctp_stream_interleave_1.

This procedure is described in section 2.2.2 of RFC8260.

All uses of sizeof(struct sctp_data_chunk) in the tx path are replaced
with sctp_datachk_len, to make it right for idata as well.
sctp_chunk_is_data is also adjusted for SCTP_CID_I_DATA.

After this patch, idata can be built and sent in tx path.

Note that if sp strm_interleave is set, it has to wait_connect in
sctp_sendmsg, as asoc intl_enable is only known after the 4-way
handshake, and it is needed to decide whether to use data or idata
later. data and idata can't be mixed in one asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h         |  9 +++++----
 include/net/sctp/sctp.h              |  4 ++--
 include/net/sctp/sm.h                |  2 +-
 include/net/sctp/stream_interleave.h |  1 +
 include/net/sctp/structs.h           | 18 +++++++++++++++++-
 net/sctp/output.c                    |  5 +++--
 net/sctp/socket.c                    | 17 +++++++++++++++--
 net/sctp/stream_interleave.c         | 37 ++++++++++++++++++++++++++++++++++++
 net/sctp/ulpevent.c                  |  4 ++--
 9 files changed, 83 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index deaafa9b09cb..20ff237c5eb2 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -145,12 +145,13 @@ SCTP_SUBTYPE_CONSTRUCTOR(OTHER,		enum sctp_event_other,	other)
 SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE,	enum sctp_event_primitive, primitive)
 
 
-#define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA)
+#define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA || \
+			       a->chunk_hdr->type == SCTP_CID_I_DATA)
 
 /* Calculate the actual data size in a data chunk */
-#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end)\
-		       		- (unsigned long)(c->chunk_hdr)\
-				- sizeof(struct sctp_data_chunk)))
+#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end) - \
+				    (unsigned long)(c->chunk_hdr) - \
+				    sctp_datachk_len(&c->asoc->stream)))
 
 /* Internal error codes */
 enum sctp_ierror {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 6f79415f6634..20c0c1be2ca7 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -444,13 +444,13 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu)
 	int frag = pmtu;
 
 	frag -= sp->pf->af->net_header_len;
-	frag -= sizeof(struct sctphdr) + sizeof(struct sctp_data_chunk);
+	frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream);
 
 	if (asoc->user_frag)
 		frag = min_t(int, frag, asoc->user_frag);
 
 	frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN -
-					    sizeof(struct sctp_data_chunk)));
+					    sctp_datachk_len(&asoc->stream)));
 
 	return frag;
 }
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index f950186aae34..ca1db8997e5d 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -343,7 +343,7 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk)
 	__u16 size;
 
 	size = ntohs(chunk->chunk_hdr->length);
-	size -= sizeof(struct sctp_data_chunk);
+	size -= sctp_datahdr_len(&chunk->asoc->stream);
 
 	return size;
 }
diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 7b9fa8dbe620..99f399e7a871 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -37,6 +37,7 @@ struct sctp_stream_interleave {
 	struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
 					    int len, __u8 flags, gfp_t gfp);
+	void	(*assign_number)(struct sctp_chunk *chunk);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 96cc898787f1..fd93973bb667 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -399,6 +399,18 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new);
 #define sctp_ssn_skip(stream, type, sid, ssn) \
 	((stream)->type[sid].ssn = ssn + 1)
 
+/* What is the current MID number for this stream? */
+#define sctp_mid_peek(stream, type, sid) \
+	((stream)->type[sid].mid)
+
+/* Return the next MID number for this stream.  */
+#define sctp_mid_next(stream, type, sid) \
+	((stream)->type[sid].mid++)
+
+/* Skip over this mid and all below. */
+#define sctp_mid_skip(stream, type, sid, mid) \
+	((stream)->type[sid].mid = mid + 1)
+
 /*
  * Pointers to address related SCTP functions.
  * (i.e. things that depend on the address family.)
@@ -623,6 +635,7 @@ struct sctp_chunk {
 	__u16	rtt_in_progress:1,	/* This chunk used for RTT calc? */
 		has_tsn:1,		/* Does this chunk have a TSN yet? */
 		has_ssn:1,		/* Does this chunk have a SSN yet? */
+#define has_mid has_ssn
 		singleton:1,		/* Only chunk in the packet? */
 		end_of_packet:1,	/* Last chunk in the packet? */
 		ecn_ce_done:1,		/* Have we processed the ECN CE bit? */
@@ -1360,7 +1373,10 @@ struct sctp_stream_out_ext {
 };
 
 struct sctp_stream_out {
-	__u16	ssn;
+	union {
+		__u32 mid;
+		__u16 ssn;
+	};
 	__u8	state;
 	struct sctp_stream_out_ext *ext;
 };
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 4a865cd06d76..01a26ee051e3 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -313,6 +313,7 @@ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet,
 	/* We believe that this chunk is OK to add to the packet */
 	switch (chunk->chunk_hdr->type) {
 	case SCTP_CID_DATA:
+	case SCTP_CID_I_DATA:
 		/* Account for the data being in the packet */
 		sctp_packet_append_data(packet, chunk);
 		/* Disallow SACK bundling after DATA. */
@@ -724,7 +725,7 @@ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet,
 	 * or delay in hopes of bundling a full sized packet.
 	 */
 	if (chunk->skb->len + q->out_qlen > transport->pathmtu -
-		packet->overhead - sizeof(struct sctp_data_chunk) - 4)
+	    packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4)
 		/* Enough data queued to fill a packet */
 		return SCTP_XMIT_OK;
 
@@ -759,7 +760,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
 
 	asoc->peer.rwnd = rwnd;
 	sctp_chunk_assign_tsn(chunk);
-	sctp_chunk_assign_ssn(chunk);
+	asoc->stream.si->assign_number(chunk);
 }
 
 static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 306c737bde87..3654e1ede716 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2002,7 +2002,20 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		if (err < 0)
 			goto out_free;
 
-		wait_connect = true;
+		/* If stream interleave is enabled, wait_connect has to be
+		 * done earlier than data enqueue, as it needs to make data
+		 * or idata according to asoc->intl_enable which is set
+		 * after connection is done.
+		 */
+		if (sctp_sk(asoc->base.sk)->strm_interleave) {
+			timeo = sock_sndtimeo(sk, 0);
+			err = sctp_wait_for_connect(asoc, &timeo);
+			if (err)
+				goto out_unlock;
+		} else {
+			wait_connect = true;
+		}
+
 		pr_debug("%s: we associated primitively\n", __func__);
 	}
 
@@ -3180,7 +3193,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 		if (val == 0) {
 			val = asoc->pathmtu - sp->pf->af->net_header_len;
 			val -= sizeof(struct sctphdr) +
-			       sizeof(struct sctp_data_chunk);
+			       sctp_datachk_len(&asoc->stream);
 		}
 		asoc->user_frag = val;
 		asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 397c3c1789b3..3ac47e78c013 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -57,16 +57,53 @@ static struct sctp_chunk *sctp_make_idatafrag_empty(
 	return retval;
 }
 
+static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *lchunk;
+	__u32 cfsn = 0;
+	__u16 sid;
+
+	if (chunk->has_mid)
+		return;
+
+	sid = sctp_chunk_stream_no(chunk);
+	stream = &chunk->asoc->stream;
+
+	list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) {
+		struct sctp_idatahdr *hdr;
+
+		lchunk->has_mid = 1;
+
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+			continue;
+
+		hdr = lchunk->subh.idata_hdr;
+
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)
+			hdr->ppid = lchunk->sinfo.sinfo_ppid;
+		else
+			hdr->fsn = htonl(cfsn++);
+
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
+			hdr->mid = htonl(sctp_mid_next(stream, out, sid));
+		else
+			hdr->mid = htonl(sctp_mid_peek(stream, out, sid));
+	}
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
 	.make_datafrag		= sctp_make_datafrag_empty,
+	.assign_number		= sctp_chunk_assign_ssn,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.data_chunk_len		= sizeof(struct sctp_idata_chunk),
 	/* I-DATA process functions */
 	.make_datafrag		= sctp_make_idatafrag_empty,
+	.assign_number		= sctp_chunk_assign_mid,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 5447228bf1a0..650b634fb2a3 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -443,8 +443,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
 		goto fail;
 
 	/* Pull off the common chunk header and DATA header.  */
-	skb_pull(skb, sizeof(struct sctp_data_chunk));
-	len -= sizeof(struct sctp_data_chunk);
+	skb_pull(skb, sctp_datachk_len(&asoc->stream));
+	len -= sctp_datachk_len(&asoc->stream);
 
 	/* Embed the event fields inside the cloned skb.  */
 	event = sctp_skb2event(skb);
-- 
cgit v1.2.3


From 9d4ceaf154a947e69648041bcb11a24a7a40c380 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:03 +0800
Subject: sctp: implement validate_data for sctp_stream_interleave

validate_data is added as a member of sctp_stream_interleave. It is
called in sctp_eat_data to validate the ssn and chunk type for data,
or the mid (message id) and chunk type for idata.

If this check fails, an abort packet will be sent, as described in
section 2.2.3 of RFC8260.

This patch also adds idata processing to the rx path. As Marcelo
pointed out, there is no need to add a separate event table for idata;
it can just share chunk_event_table with data. A data chunk is dropped
when idata is in use, and an idata chunk is dropped when data is in
use, by calling validate_data in sctp_eat_data.

As the previous patch did, this one also replaces
sizeof(struct sctp_data_chunk) with sctp_datachk_len in the rx path.

After this patch, idata can be accepted and delivered to the ulp
layer.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h                |  6 ++++++
 include/net/sctp/stream_interleave.h |  1 +
 include/net/sctp/structs.h           |  6 +++++-
 net/sctp/sm_statefuns.c              | 21 ++++++++-----------
 net/sctp/sm_statetable.c             |  3 +++
 net/sctp/stream_interleave.c         | 39 ++++++++++++++++++++++++++++++++++++
 6 files changed, 62 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ca1db8997e5d..0993b4953b3a 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -359,6 +359,12 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk)
 	 typecheck(__u32, b) && \
 	 ((__s32)((a) - (b)) <= 0))
 
+/* Compare two MIDs */
+#define MID_lt(a, b)	\
+	(typecheck(__u32, a) && \
+	 typecheck(__u32, b) && \
+	 ((__s32)((a) - (b)) < 0))
+
 /* Compare two SSNs */
 #define SSN_lt(a,b)		\
 	(typecheck(__u16, a) && \
diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 99f399e7a871..d8d1b51e1362 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -38,6 +38,7 @@ struct sctp_stream_interleave {
 					    const struct sctp_sndrcvinfo *sinfo,
 					    int len, __u8 flags, gfp_t gfp);
 	void	(*assign_number)(struct sctp_chunk *chunk);
+	bool	(*validate_data)(struct sctp_chunk *chunk);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index fd93973bb667..be4cc73f3106 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1382,7 +1382,11 @@ struct sctp_stream_out {
 };
 
 struct sctp_stream_in {
-	__u16	ssn;
+	union {
+		__u32 mid;
+		__u16 ssn;
+	};
+	__u32 fsn;
 };
 
 struct sctp_stream {
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8f8ccded13e4..c609c5409910 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3013,7 +3013,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3034,7 +3034,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
 	case SCTP_IERROR_PROTO_VIOLATION:
 		return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
 					       (u8 *)chunk->subh.data_hdr,
-					       sizeof(struct sctp_datahdr));
+					       sctp_datahdr_len(&asoc->stream));
 	default:
 		BUG();
 	}
@@ -3133,7 +3133,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3150,7 +3150,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
 	case SCTP_IERROR_PROTO_VIOLATION:
 		return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
 					       (u8 *)chunk->subh.data_hdr,
-					       sizeof(struct sctp_datahdr));
+					       sctp_datahdr_len(&asoc->stream));
 	default:
 		BUG();
 	}
@@ -6244,14 +6244,12 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	struct sctp_chunk *err;
 	enum sctp_verb deliver;
 	size_t datalen;
-	u8 ordered = 0;
-	u16 ssn, sid;
 	__u32 tsn;
 	int tmp;
 
 	data_hdr = (struct sctp_datahdr *)chunk->skb->data;
 	chunk->subh.data_hdr = data_hdr;
-	skb_pull(chunk->skb, sizeof(*data_hdr));
+	skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream));
 
 	tsn = ntohl(data_hdr->tsn);
 	pr_debug("%s: TSN 0x%x\n", __func__, tsn);
@@ -6299,7 +6297,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * Actually, allow a little bit of overflow (up to a MTU).
 	 */
 	datalen = ntohs(chunk->chunk_hdr->length);
-	datalen -= sizeof(struct sctp_data_chunk);
+	datalen -= sctp_datachk_len(&asoc->stream);
 
 	deliver = SCTP_CMD_CHUNK_ULP;
 
@@ -6394,7 +6392,6 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 		SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS);
 		if (chunk->asoc)
 			chunk->asoc->stats.iodchunks++;
-		ordered = 1;
 	}
 
 	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
@@ -6405,8 +6402,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
 	 * and discard the DATA chunk.
 	 */
-	sid = ntohs(data_hdr->stream);
-	if (sid >= asoc->stream.incnt) {
+	if (ntohs(data_hdr->stream) >= asoc->stream.incnt) {
 		/* Mark tsn as received even though we drop it */
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
 
@@ -6427,8 +6423,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * SSN is smaller then the next expected one.  If it is, it wrapped
 	 * and is invalid.
 	 */
-	ssn = ntohs(data_hdr->ssn);
-	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid)))
+	if (!asoc->stream.si->validate_data(chunk))
 		return SCTP_IERROR_PROTO_VIOLATION;
 
 	/* Send the data up to the user.  Note:  Schedule  the
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 79b6bee5b768..8c9bb4109f47 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -985,6 +985,9 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
 	if (state > SCTP_STATE_MAX)
 		return &bug;
 
+	if (net->sctp.intl_enable && cid == SCTP_CID_I_DATA)
+		cid = SCTP_CID_DATA;
+
 	if (cid <= SCTP_CID_BASE_MAX)
 		return &chunk_event_table[cid][state];
 
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 3ac47e78c013..3d8733be6f7a 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -92,11 +92,49 @@ static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
 	}
 }
 
+static bool sctp_validate_data(struct sctp_chunk *chunk)
+{
+	const struct sctp_stream *stream;
+	__u16 sid, ssn;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_DATA)
+		return false;
+
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+		return true;
+
+	stream = &chunk->asoc->stream;
+	sid = sctp_chunk_stream_no(chunk);
+	ssn = ntohs(chunk->subh.data_hdr->ssn);
+
+	return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid));
+}
+
+static bool sctp_validate_idata(struct sctp_chunk *chunk)
+{
+	struct sctp_stream *stream;
+	__u32 mid;
+	__u16 sid;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_I_DATA)
+		return false;
+
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+		return true;
+
+	stream = &chunk->asoc->stream;
+	sid = sctp_chunk_stream_no(chunk);
+	mid = ntohl(chunk->subh.idata_hdr->mid);
+
+	return !MID_lt(mid, sctp_mid_peek(stream, in, sid));
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
 	.make_datafrag		= sctp_make_datafrag_empty,
 	.assign_number		= sctp_chunk_assign_ssn,
+	.validate_data		= sctp_validate_data,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -104,6 +142,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	/* I-DATA process functions */
 	.make_datafrag		= sctp_make_idatafrag_empty,
 	.assign_number		= sctp_chunk_assign_mid,
+	.validate_data		= sctp_validate_idata,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From bd4d627dbd5adb8130d5c54a4135d89f45e41905 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:04 +0800
Subject: sctp: implement ulpevent_data for sctp_stream_interleave

ulpevent_data is added as a member of sctp_stream_interleave. It does
most of the processing in ulpq: converting a data or idata chunk into
an event, reassembling events in the reasm queue, putting them in the
lobby queue in the right order, and delivering them up to the user sk
rx queue.

This procedure is described in section 2.2.3 of RFC8260.

Most of the idata functions added here do processing similar to that
of the old functions for data. But since the details differ greatly
between the two, the old functions cannot be reused for idata.

The event->ssn and event->ppid assignments are moved from
sctp_ulpevent_make_rcvmsg into ulpevent_data, so that
sctp_ulpevent_make_rcvmsg can work for both data and idata.

Note that mid is added to sctp_ulpevent for idata, so __packed has to
be used when defining sctp_ulpevent; otherwise it would exceed the skb
cb area that holds a sctp_ulpevent for ulp layer processing.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |   2 +
 include/net/sctp/structs.h           |   3 +
 include/net/sctp/ulpevent.h          |  20 +-
 net/sctp/sm_sideeffect.c             |   5 +-
 net/sctp/stream_interleave.c         | 418 +++++++++++++++++++++++++++++++++++
 net/sctp/ulpevent.c                  |   2 -
 net/sctp/ulpqueue.c                  |  12 +-
 7 files changed, 451 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index d8d1b51e1362..02f60f541f1e 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -39,6 +39,8 @@ struct sctp_stream_interleave {
 					    int len, __u8 flags, gfp_t gfp);
 	void	(*assign_number)(struct sctp_chunk *chunk);
 	bool	(*validate_data)(struct sctp_chunk *chunk);
+	int	(*ulpevent_data)(struct sctp_ulpq *ulpq,
+				 struct sctp_chunk *chunk, gfp_t gfp);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index be4cc73f3106..73b315de2fef 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -411,6 +411,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new);
 #define sctp_mid_skip(stream, type, sid, mid) \
 	((stream)->type[sid].mid = mid + 1)
 
+#define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid])
+
 /*
  * Pointers to address related SCTP functions.
  * (i.e. things that depend on the address family.)
@@ -1387,6 +1389,7 @@ struct sctp_stream_in {
 		__u16 ssn;
 	};
 	__u32 fsn;
+	char pd_mode;
 };
 
 struct sctp_stream {
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 231dc42f1da6..ce4f2aa35d56 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -45,19 +45,29 @@
 /* A structure to carry information to the ULP (e.g. Sockets API) */
 /* Warning: This sits inside an skb.cb[] area.  Be very careful of
  * growing this structure as it is at the maximum limit now.
+ *
+ * sctp_ulpevent is saved in skb->cb (48 bytes), whose last 4 bytes
+ * have been taken by sock_skb_cb, so 'packed' has to be used here
+ * to make sctp_ulpevent fit into the remaining 44 bytes.
  */
 struct sctp_ulpevent {
 	struct sctp_association *asoc;
 	struct sctp_chunk *chunk;
 	unsigned int rmem_len;
-	__u32 ppid;
+	union {
+		__u32 mid;
+		__u16 ssn;
+	};
+	union {
+		__u32 ppid;
+		__u32 fsn;
+	};
 	__u32 tsn;
 	__u32 cumtsn;
 	__u16 stream;
-	__u16 ssn;
 	__u16 flags;
 	__u16 msg_flags;
-};
+} __packed;
 
 /* Retrieve the skb this event sits inside of. */
 static inline struct sk_buff *sctp_event2skb(const struct sctp_ulpevent *ev)
@@ -140,6 +150,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event(
 	const struct sctp_association *asoc, __u16 flags,
 	__u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp);
 
+struct sctp_ulpevent *sctp_make_reassembled_event(
+	struct net *net, struct sk_buff_head *queue,
+	struct sk_buff *f_frag, struct sk_buff *l_frag);
+
 void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
 				   struct msghdr *);
 void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index df94d77401e7..9d25efb26a39 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1483,8 +1483,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n",
 				 __func__, cmd->obj.chunk, &asoc->ulpq);
 
-			sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk,
-					    GFP_ATOMIC);
+			asoc->stream.si->ulpevent_data(&asoc->ulpq,
+						       cmd->obj.chunk,
+						       GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_EVENT_ULP:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 3d8733be6f7a..823831101dc4 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -29,8 +29,10 @@
  *    Xin Long <lucien.xin@gmail.com>
  */
 
+#include <net/busy_poll.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/ulpevent.h>
 #include <linux/sctp.h>
 
 static struct sctp_chunk *sctp_make_idatafrag_empty(
@@ -129,12 +131,427 @@ static bool sctp_validate_idata(struct sctp_chunk *chunk)
 	return !MID_lt(mid, sctp_mid_peek(stream, in, sid));
 }
 
+static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
+				  struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->reasm);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = sctp_skb2event(pos);
+
+	if (event->stream == cevent->stream &&
+	    event->mid == cevent->mid &&
+	    (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+	     (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+	      event->fsn > cevent->fsn))) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	if ((event->stream == cevent->stream &&
+	     MID_lt(cevent->mid, event->mid)) ||
+	    event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		cevent = sctp_skb2event(pos);
+
+		if (event->stream < cevent->stream ||
+		    (event->stream == cevent->stream &&
+		     MID_lt(event->mid, cevent->mid)))
+			break;
+
+		if (event->stream == cevent->stream &&
+		    event->mid == cevent->mid &&
+		    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+		    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+		     event->fsn < cevent->fsn))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sctp_stream_in *sin;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	int is_last = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+
+		if (cevent->stream > event->stream ||
+		    cevent->mid != sin->mid)
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			goto out;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = cevent->fsn + 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn++;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = 0;
+					is_last = 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn = 0;
+				is_last = 1;
+			}
+			goto out;
+		default:
+			goto out;
+		}
+	}
+
+out:
+	if (!first_frag)
+		return NULL;
+
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn = next_fsn;
+		if (is_last) {
+			retval->msg_flags |= MSG_EOR;
+			sin->pd_mode = 0;
+		}
+	}
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	struct sk_buff *pos, *first_frag = NULL;
+	struct sctp_ulpevent *retval = NULL;
+	struct sk_buff *pd_first = NULL;
+	struct sk_buff *pd_last = NULL;
+	struct sctp_stream_in *sin;
+	__u32 next_fsn = 0;
+	__u32 pd_point = 0;
+	__u32 pd_len = 0;
+	__u32 mid = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, event->mid))
+			continue;
+		if (MID_lt(event->mid, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (cevent->mid == sin->mid) {
+				pd_first = pos;
+				pd_last = pos;
+				pd_len = pos->len;
+			}
+
+			first_frag = pos;
+			next_fsn = 0;
+			mid = cevent->mid;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				if (pd_first) {
+					pd_last = pos;
+					pd_len += pos->len;
+				}
+			} else {
+				first_frag = NULL;
+			}
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn)
+				goto found;
+			else
+				first_frag = NULL;
+			break;
+		}
+	}
+
+	if (!pd_first)
+		goto out;
+
+	pd_point = sctp_sk(asoc->base.sk)->pd_point;
+	if (pd_point && pd_point <= pd_len) {
+		retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+						     &ulpq->reasm,
+						     pd_first, pd_last);
+		if (retval) {
+			sin->fsn = next_fsn;
+			sin->pd_mode = 1;
+		}
+	}
+	goto out;
+
+found:
+	retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+					     &ulpq->reasm,
+					     first_frag, pos);
+	if (retval)
+		retval->msg_flags |= MSG_EOR;
+
+out:
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
+					     struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *retval = NULL;
+	struct sctp_stream_in *sin;
+
+	if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+		event->msg_flags |= MSG_EOR;
+		return event;
+	}
+
+	sctp_intl_store_reasm(ulpq, event);
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+	if (sin->pd_mode && event->mid == sin->mid &&
+	    event->fsn == sin->fsn)
+		retval = sctp_intl_retrieve_partial(ulpq, event);
+
+	if (!retval)
+		retval = sctp_intl_retrieve_reassembled(ulpq, event);
+
+	return retval;
+}
+
+static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
+				    struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->lobby);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = (struct sctp_ulpevent *)pos->cb;
+	if (event->stream == cevent->stream &&
+	    MID_lt(cevent->mid, event->mid)) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	if (event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->lobby, pos) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+
+		if (cevent->stream > event->stream)
+			break;
+
+		if (cevent->stream == event->stream &&
+		    MID_lt(event->mid, cevent->mid))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+}
+
+static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
+				       struct sctp_ulpevent *event)
+{
+	struct sk_buff_head *event_list;
+	struct sctp_stream *stream;
+	struct sk_buff *pos, *tmp;
+	__u16 sid = event->stream;
+
+	stream  = &ulpq->asoc->stream;
+	event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev;
+
+	sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
+		struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb;
+
+		if (cevent->stream > sid)
+			break;
+
+		if (cevent->stream < sid)
+			continue;
+
+		if (cevent->mid != sctp_mid_peek(stream, in, sid))
+			break;
+
+		sctp_mid_next(stream, in, sid);
+
+		__skb_unlink(pos, &ulpq->lobby);
+
+		__skb_queue_tail(event_list, pos);
+	}
+}
+
+static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
+					     struct sctp_ulpevent *event)
+{
+	struct sctp_stream *stream;
+	__u16 sid;
+
+	if (event->msg_flags & SCTP_DATA_UNORDERED)
+		return event;
+
+	stream  = &ulpq->asoc->stream;
+	sid = event->stream;
+
+	if (event->mid != sctp_mid_peek(stream, in, sid)) {
+		sctp_intl_store_ordered(ulpq, event);
+		return NULL;
+	}
+
+	sctp_mid_next(stream, in, sid);
+
+	sctp_intl_retrieve_ordered(ulpq, event);
+
+	return event;
+}
+
+static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
+			      struct sctp_ulpevent *event)
+{
+	struct sk_buff *skb = sctp_event2skb(event);
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sk_buff_head *skb_list;
+
+	skb_list = (struct sk_buff_head *)skb->prev;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN &&
+	    (sk->sk_shutdown & SEND_SHUTDOWN ||
+	     !sctp_ulpevent_is_notification(event)))
+		goto out_free;
+
+	if (!sctp_ulpevent_is_notification(event)) {
+		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
+	}
+
+	if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+		goto out_free;
+
+	if (skb_list)
+		skb_queue_splice_tail_init(skb_list,
+					   &sk->sk_receive_queue);
+	else
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sp->data_ready_signalled) {
+		sp->data_ready_signalled = 1;
+		sk->sk_data_ready(sk);
+	}
+
+	return 1;
+
+out_free:
+	if (skb_list)
+		sctp_queue_purge_ulpevents(skb_list);
+	else
+		sctp_ulpevent_free(event);
+
+	return 0;
+}
+
+static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
+			       struct sctp_chunk *chunk, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sk_buff_head temp;
+	int event_eor = 0;
+
+	event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
+	if (!event)
+		return -ENOMEM;
+
+	event->mid = ntohl(chunk->subh.idata_hdr->mid);
+	if (event->msg_flags & SCTP_DATA_FIRST_FRAG)
+		event->ppid = chunk->subh.idata_hdr->ppid;
+	else
+		event->fsn = ntohl(chunk->subh.idata_hdr->fsn);
+
+	event = sctp_intl_reasm(ulpq, event);
+	if (event && event->msg_flags & MSG_EOR) {
+		skb_queue_head_init(&temp);
+		__skb_queue_tail(&temp, sctp_event2skb(event));
+
+		event = sctp_intl_order(ulpq, event);
+	}
+
+	if (event) {
+		event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
+		sctp_enqueue_event(ulpq, event);
+	}
+
+	return event_eor;
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
 	.make_datafrag		= sctp_make_datafrag_empty,
 	.assign_number		= sctp_chunk_assign_ssn,
 	.validate_data		= sctp_validate_data,
+	.ulpevent_data		= sctp_ulpq_tail_data,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -143,6 +560,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.make_datafrag		= sctp_make_idatafrag_empty,
 	.assign_number		= sctp_chunk_assign_mid,
 	.validate_data		= sctp_validate_idata,
+	.ulpevent_data		= sctp_ulpevent_idata,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 650b634fb2a3..d3218f3e9cf7 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -705,8 +705,6 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	sctp_ulpevent_receive_data(event, asoc);
 
 	event->stream = ntohs(chunk->subh.data_hdr->stream);
-	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
-	event->ppid = chunk->subh.data_hdr->ppid;
 	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
 		event->flags |= SCTP_UNORDERED;
 		event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index a71be33f3afe..0d07f2a6cb35 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -104,6 +104,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 	if (!event)
 		return -ENOMEM;
 
+	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
+	event->ppid = chunk->subh.data_hdr->ppid;
+
 	/* Do reassembly if needed.  */
 	event = sctp_ulpq_reasm(ulpq, event);
 
@@ -328,9 +331,10 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
  * payload was fragmented on the way and ip had to reassemble them.
  * We add the rest of skb's to the first skb's fraglist.
  */
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
-	struct sk_buff_head *queue, struct sk_buff *f_frag,
-	struct sk_buff *l_frag)
+struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
+						  struct sk_buff_head *queue,
+						  struct sk_buff *f_frag,
+						  struct sk_buff *l_frag)
 {
 	struct sk_buff *pos;
 	struct sk_buff *new = NULL;
@@ -853,7 +857,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	struct sctp_stream *stream;
 
 	/* Check if this message needs ordering.  */
-	if (SCTP_DATA_UNORDERED & event->msg_flags)
+	if (event->msg_flags & SCTP_DATA_UNORDERED)
 		return event;
 
 	/* Note: The stream ID must be verified before this routine.  */
-- 
cgit v1.2.3


From 9162e0ed9e238c1f1d738cb36ee59d96b097f8e1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:05 +0800
Subject: sctp: implement enqueue_event for sctp_stream_interleave

enqueue_event is added as a member of sctp_stream_interleave, used to
enqueue either data, idata or notification events into user socket rx
queue.

It replaces sctp_ulpq_tail_event used in the other places with
enqueue_event.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h | 2 ++
 net/sctp/associola.c                 | 2 +-
 net/sctp/chunk.c                     | 2 +-
 net/sctp/sm_sideeffect.c             | 9 +++++----
 net/sctp/socket.c                    | 2 +-
 net/sctp/stream_interleave.c         | 2 ++
 6 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 02f60f541f1e..a0f61bc08ae8 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -41,6 +41,8 @@ struct sctp_stream_interleave {
 	bool	(*validate_data)(struct sctp_chunk *chunk);
 	int	(*ulpevent_data)(struct sctp_ulpq *ulpq,
 				 struct sctp_chunk *chunk, gfp_t gfp);
+	int	(*enqueue_event)(struct sctp_ulpq *ulpq,
+				 struct sctp_ulpevent *event);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 69394f4d6091..837806dd5799 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -861,7 +861,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
 					0, spc_state, error, GFP_ATOMIC);
 		if (event)
-			sctp_ulpq_tail_event(&asoc->ulpq, event);
+			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
 	}
 
 	/* Select new active and retran paths. */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 62adaaacc4ab..991a530c6b31 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -124,7 +124,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 			ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
 							    error, GFP_ATOMIC);
 			if (ev)
-				sctp_ulpq_tail_event(&asoc->ulpq, ev);
+				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 		}
 
 		sctp_chunk_put(chunk);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9d25efb26a39..f4e5ecade936 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -972,7 +972,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 		if (!ev)
 			return;
 
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 
 		switch (err_hdr->cause) {
 		case SCTP_ERROR_UNKNOWN_CHUNK:
@@ -1058,7 +1058,7 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands,
 					    asoc->c.sinit_max_instreams,
 					    NULL, GFP_ATOMIC);
 	if (ev)
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 }
 
 /* Helper function to generate an adaptation indication event */
@@ -1070,7 +1070,7 @@ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands,
 	ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
 
 	if (ev)
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 }
 
 
@@ -1493,7 +1493,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n",
 				 __func__, cmd->obj.ulpevent, &asoc->ulpq);
 
-			sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent);
+			asoc->stream.si->enqueue_event(&asoc->ulpq,
+						       cmd->obj.ulpevent);
 			break;
 
 		case SCTP_CMD_REPLY:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3654e1ede716..c58a1fc02978 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2294,7 +2294,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
 			if (!event)
 				return -ENOMEM;
 
-			sctp_ulpq_tail_event(&asoc->ulpq, event);
+			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
 		}
 	}
 
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 823831101dc4..e85397200230 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -552,6 +552,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.assign_number		= sctp_chunk_assign_ssn,
 	.validate_data		= sctp_validate_data,
 	.ulpevent_data		= sctp_ulpq_tail_data,
+	.enqueue_event		= sctp_ulpq_tail_event,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -561,6 +562,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.assign_number		= sctp_chunk_assign_mid,
 	.validate_data		= sctp_validate_idata,
 	.ulpevent_data		= sctp_ulpevent_idata,
+	.enqueue_event		= sctp_enqueue_event,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From 94014e8d871ae43d834828710c098518be44b5d9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:06 +0800
Subject: sctp: implement renege_events for sctp_stream_interleave

renege_events is added as a member of sctp_stream_interleave, used to
renege some old data or idata in reasm or lobby queue properly to free
some memory for the new data when there's memory stress.

It defines sctp_renege_events for idata, and leaves sctp_ulpq_renege
as it is for data.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |   2 +
 include/net/sctp/ulpqueue.h          |   9 +--
 net/sctp/sm_sideeffect.c             |   5 +-
 net/sctp/stream_interleave.c         | 109 +++++++++++++++++++++++++++++++++++
 net/sctp/ulpqueue.c                  |   4 +-
 5 files changed, 119 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index a0f61bc08ae8..16a71cb2b098 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -43,6 +43,8 @@ struct sctp_stream_interleave {
 				 struct sctp_chunk *chunk, gfp_t gfp);
 	int	(*enqueue_event)(struct sctp_ulpq *ulpq,
 				 struct sctp_ulpevent *event);
+	void	(*renege_events)(struct sctp_ulpq *ulpq,
+				 struct sctp_chunk *chunk, gfp_t gfp);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index e0dce07b8794..eb98c7150a56 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -76,11 +76,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc);
 void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn);
 
 void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *, __u32);
-#endif /* __sctp_ulpqueue_h__ */
-
-
-
-
-
 
+__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
+			    struct sk_buff_head *list, __u16 needed);
 
+#endif /* __sctp_ulpqueue_h__ */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f4e5ecade936..2bec17ad7fc9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1735,8 +1735,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_RENEGE:
-			sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk,
-					 GFP_ATOMIC);
+			asoc->stream.si->renege_events(&asoc->ulpq,
+						       cmd->obj.chunk,
+						       GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_SETUP_T4:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index e85397200230..d62ad5c62092 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -545,6 +545,113 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
 	return event_eor;
 }
 
+static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
+{
+	struct sctp_stream_in *csin, *sin = NULL;
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	__u16 sid = 0;
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+		if (csin->pd_mode)
+			continue;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (first_frag)
+				goto out;
+			if (cevent->mid == csin->mid) {
+				first_frag = pos;
+				last_frag = pos;
+				next_fsn = 0;
+				sin = csin;
+				sid = cevent->stream;
+			}
+			break;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag)
+				break;
+			if (cevent->stream == sid &&
+			    cevent->mid == sin->mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				last_frag = pos;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag)
+				goto out;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!first_frag)
+		return NULL;
+
+out:
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn = next_fsn;
+		sin->pd_mode = 1;
+	}
+
+	return retval;
+}
+
+static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+
+	if (skb_queue_empty(&ulpq->reasm))
+		return;
+
+	do {
+		event = sctp_intl_retrieve_first(ulpq);
+		if (event)
+			sctp_enqueue_event(ulpq, event);
+	} while (event);
+}
+
+static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+			       gfp_t gfp)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	__u32 freed = 0;
+	__u16 needed;
+
+	if (chunk) {
+		needed = ntohs(chunk->chunk_hdr->length);
+		needed -= sizeof(struct sctp_idata_chunk);
+	} else {
+		needed = SCTP_DEFAULT_MAXWINDOW;
+	}
+
+	if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
+		freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
+		if (freed < needed)
+			freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm,
+						       needed);
+	}
+
+	if (chunk && freed >= needed)
+		if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0)
+			sctp_intl_start_pd(ulpq, gfp);
+
+	sk_mem_reclaim(asoc->base.sk);
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
@@ -553,6 +660,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.validate_data		= sctp_validate_data,
 	.ulpevent_data		= sctp_ulpq_tail_data,
 	.enqueue_event		= sctp_ulpq_tail_event,
+	.renege_events		= sctp_ulpq_renege,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -563,6 +671,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.validate_data		= sctp_validate_idata,
 	.ulpevent_data		= sctp_ulpevent_idata,
 	.enqueue_event		= sctp_enqueue_event,
+	.renege_events		= sctp_renege_events,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 0d07f2a6cb35..76ec5149a093 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -978,8 +978,8 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
 	sctp_ulpq_reap_ordered(ulpq, sid);
 }
 
-static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
-		struct sk_buff_head *list, __u16 needed)
+__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list,
+			    __u16 needed)
 {
 	__u16 freed = 0;
 	__u32 tsn, last_tsn;
-- 
cgit v1.2.3


From be4e0ce10dc64b9a8aae42ec3dbd906022f91ec5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:07 +0800
Subject: sctp: implement start_pd for sctp_stream_interleave

start_pd is added as a member of sctp_stream_interleave, used to
do partial_delivery for data or idata when datalen >= asoc->rwnd
in sctp_eat_data. The code was already added in the previous patches,
but it needs to be exposed as start_pd so that it can be used for the
SCTP_CMD_PART_DELIVER cmd as well.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h | 1 +
 net/sctp/sm_sideeffect.c             | 2 +-
 net/sctp/stream_interleave.c         | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 16a71cb2b098..317d9b3a5299 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -45,6 +45,7 @@ struct sctp_stream_interleave {
 				 struct sctp_ulpevent *event);
 	void	(*renege_events)(struct sctp_ulpq *ulpq,
 				 struct sctp_chunk *chunk, gfp_t gfp);
+	void	(*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 2bec17ad7fc9..36710549a4ca 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1731,7 +1731,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_PART_DELIVER:
-			sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_RENEGE:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index d62ad5c62092..4dce8d33c5ab 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -661,6 +661,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.ulpevent_data		= sctp_ulpq_tail_data,
 	.enqueue_event		= sctp_ulpq_tail_event,
 	.renege_events		= sctp_ulpq_renege,
+	.start_pd		= sctp_ulpq_partial_delivery,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -672,6 +673,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.ulpevent_data		= sctp_ulpevent_idata,
 	.enqueue_event		= sctp_enqueue_event,
 	.renege_events		= sctp_renege_events,
+	.start_pd		= sctp_intl_start_pd,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From 65f5e357839e40817aead853d7a7f61ff828b52b Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:08 +0800
Subject: sctp: implement abort_pd for sctp_stream_interleave

abort_pd is added as a member of sctp_stream_interleave, used to abort
partial delivery for data or idata, called in sctp_cmd_assoc_failed.

Since stream interleave allows partial delivery to be in progress on
several streams at the same time, sctp_intl_abort_pd for idata is
very different from the old function sctp_ulpq_abort_pd for data.

Note that sctp_ulpevent_make_pdapi gains per-stream support in this
patch by adding pdapi_stream and pdapi_seq to sctp_pdapi_event, as
described in section 6.1.7 of RFC6458.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  1 +
 include/net/sctp/ulpevent.h          |  3 +-
 include/uapi/linux/sctp.h            |  2 +
 net/sctp/sm_sideeffect.c             |  2 +-
 net/sctp/stream_interleave.c         | 99 ++++++++++++++++++++++++++++++++++++
 net/sctp/ulpevent.c                  |  9 ++--
 net/sctp/ulpqueue.c                  |  2 +-
 7 files changed, 112 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 317d9b3a5299..501b2be049a3 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -46,6 +46,7 @@ struct sctp_stream_interleave {
 	void	(*renege_events)(struct sctp_ulpq *ulpq,
 				 struct sctp_chunk *chunk, gfp_t gfp);
 	void	(*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
+	void	(*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index ce4f2aa35d56..51b4e0626c34 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -122,7 +122,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
 
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	const struct sctp_association *asoc,
-	__u32 indication, gfp_t gfp);
+	__u32 indication, __u32 sid, __u32 seq,
+	__u32 flags, gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
 	const struct sctp_association *asoc, gfp_t gfp);
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6ed934c65a5f..4c4db14786bd 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -460,6 +460,8 @@ struct sctp_pdapi_event {
 	__u32 pdapi_length;
 	__u32 pdapi_indication;
 	sctp_assoc_t pdapi_assoc_id;
+	__u32 pdapi_stream;
+	__u32 pdapi_seq;
 };
 
 enum { SCTP_PARTIAL_DELIVERY_ABORTED=0, };
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 36710549a4ca..8adde71fdb31 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
 	struct sctp_chunk *abort;
 
 	/* Cancel any partial delivery in progress. */
-	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+	asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC);
 
 	if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
 		event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 4dce8d33c5ab..d15645ea338b 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -652,6 +652,103 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 	sk_mem_reclaim(asoc->base.sk);
 }
 
+static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
+				      __u32 mid, __u16 flags, gfp_t gfp)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_ulpevent *ev = NULL;
+
+	if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+					&sctp_sk(sk)->subscribe))
+		return;
+
+	ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
+				      sid, mid, flags, gfp);
+	if (ev) {
+		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
+
+		if (!sctp_sk(sk)->data_ready_signalled) {
+			sctp_sk(sk)->data_ready_signalled = 1;
+			sk->sk_data_ready(sk);
+		}
+	}
+}
+
+static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	struct sctp_ulpevent *cevent, *event = NULL;
+	struct sk_buff_head *lobby = &ulpq->lobby;
+	struct sk_buff *pos, *tmp;
+	struct sk_buff_head temp;
+	__u16 csid;
+	__u32 cmid;
+
+	skb_queue_head_init(&temp);
+	sctp_skb_for_each(pos, lobby, tmp) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid > sid)
+			break;
+
+		if (csid < sid)
+			continue;
+
+		if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid)))
+			break;
+
+		__skb_unlink(pos, lobby);
+		if (!event)
+			event = sctp_skb2event(pos);
+
+		__skb_queue_tail(&temp, pos);
+	}
+
+	if (!event && pos != (struct sk_buff *)lobby) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) {
+			sctp_mid_next(stream, in, csid);
+			__skb_unlink(pos, lobby);
+			__skb_queue_tail(&temp, pos);
+			event = sctp_skb2event(pos);
+		}
+	}
+
+	if (event) {
+		sctp_intl_retrieve_ordered(ulpq, event);
+		sctp_enqueue_event(ulpq, event);
+	}
+}
+
+static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	__u16 sid;
+
+	for (sid = 0; sid < stream->incnt; sid++) {
+		struct sctp_stream_in *sin = &stream->in[sid];
+		__u32 mid;
+
+		if (sin->pd_mode) {
+			sin->pd_mode = 0;
+
+			mid = sin->mid;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp);
+			sctp_mid_skip(stream, in, sid, mid);
+
+			sctp_intl_reap_ordered(ulpq, sid);
+		}
+	}
+
+	/* intl abort pd happens only when all data needs to be cleaned */
+	sctp_ulpq_flush(ulpq);
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
@@ -662,6 +759,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.enqueue_event		= sctp_ulpq_tail_event,
 	.renege_events		= sctp_ulpq_renege,
 	.start_pd		= sctp_ulpq_partial_delivery,
+	.abort_pd		= sctp_ulpq_abort_pd,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -674,6 +772,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.enqueue_event		= sctp_enqueue_event,
 	.renege_events		= sctp_renege_events,
 	.start_pd		= sctp_intl_start_pd,
+	.abort_pd		= sctp_intl_abort_pd,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d3218f3e9cf7..84207ad33e8e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -730,8 +730,9 @@ fail:
  *   various events.
  */
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
-	const struct sctp_association *asoc, __u32 indication,
-	gfp_t gfp)
+					const struct sctp_association *asoc,
+					__u32 indication, __u32 sid, __u32 seq,
+					__u32 flags, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_pdapi_event *pd;
@@ -752,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	 *   Currently unused.
 	 */
 	pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
-	pd->pdapi_flags = 0;
+	pd->pdapi_flags = flags;
+	pd->pdapi_stream = sid;
+	pd->pdapi_seq = seq;
 
 	/* pdapi_length: 32 bits (unsigned integer)
 	 *
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 76ec5149a093..dd53daab4a25 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -1144,7 +1144,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 				       &sctp_sk(sk)->subscribe))
 		ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
 					      SCTP_PARTIAL_DELIVERY_ABORTED,
-					      gfp);
+					      0, 0, 0, gfp);
 	if (ev)
 		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
 
-- 
cgit v1.2.3


From 132282386f5d0eff7a84a119599216b5f9e9bfc6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:09 +0800
Subject: sctp: add support for the process of unordered idata

Unordered idata process is more complicated than unordered data:

 - It has to add mid into sctp_stream_out to save the next mid value,
   which is separated from ordered idata's.

 - To support pd for unordered idata, another mid and pd_mode need to
   be added to save the message id and pd state in sctp_stream_in.

 - To make unordered idata reasm easier, it adds a new event queue
   to save frags for idata.

The patch mostly adds reasm functions for unordered idata that are
similar to those for ordered idata, and also adjusts some other code
in assign_mid, abort_pd and ulpevent_data for idata.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h   |  14 +-
 include/net/sctp/ulpqueue.h  |   1 +
 net/sctp/socket.c            |  23 ++-
 net/sctp/stream_interleave.c | 377 ++++++++++++++++++++++++++++++++++++++++---
 net/sctp/ulpqueue.c          |   5 +
 5 files changed, 392 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 73b315de2fef..8ef638d966f1 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -413,6 +413,14 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new);
 
 #define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid])
 
+/* What is the current MID_uo number for this stream? */
+#define sctp_mid_uo_peek(stream, type, sid) \
+	((stream)->type[sid].mid_uo)
+
+/* Return the next MID_uo number for this stream.  */
+#define sctp_mid_uo_next(stream, type, sid) \
+	((stream)->type[sid].mid_uo++)
+
 /*
  * Pointers to address related SCTP functions.
  * (i.e. things that depend on the address family.)
@@ -1379,8 +1387,9 @@ struct sctp_stream_out {
 		__u32 mid;
 		__u16 ssn;
 	};
-	__u8	state;
+	__u32 mid_uo;
 	struct sctp_stream_out_ext *ext;
+	__u8 state;
 };
 
 struct sctp_stream_in {
@@ -1388,8 +1397,11 @@ struct sctp_stream_in {
 		__u32 mid;
 		__u16 ssn;
 	};
+	__u32 mid_uo;
 	__u32 fsn;
+	__u32 fsn_uo;
 	char pd_mode;
+	char pd_mode_uo;
 };
 
 struct sctp_stream {
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index eb98c7150a56..bb0ecba3db2b 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -45,6 +45,7 @@ struct sctp_ulpq {
 	char pd_mode;
 	struct sctp_association *asoc;
 	struct sk_buff_head reasm;
+	struct sk_buff_head reasm_uo;
 	struct sk_buff_head lobby;
 };
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c58a1fc02978..7eec0a0b7f79 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -201,6 +201,22 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
 		cb(chunk);
 }
 
+static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
+				 void (*cb)(struct sk_buff *, struct sock *))
+
+{
+	struct sk_buff *skb, *tmp;
+
+	sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp)
+		cb(skb, sk);
+
+	sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp)
+		cb(skb, sk);
+
+	sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp)
+		cb(skb, sk);
+}
+
 /* Verify that this is a valid address. */
 static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
 				   int len)
@@ -1554,6 +1570,7 @@ static void sctp_close(struct sock *sk, long timeout)
 
 		if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
 		    !skb_queue_empty(&asoc->ulpq.reasm) ||
+		    !skb_queue_empty(&asoc->ulpq.reasm_uo) ||
 		    (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
 			struct sctp_chunk *chunk;
 
@@ -8495,11 +8512,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 
 	}
 
-	sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp)
-		sctp_skb_set_owner_r_frag(skb, newsk);
-
-	sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp)
-		sctp_skb_set_owner_r_frag(skb, newsk);
+	sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag);
 
 	/* Set the type of socket to indicate that it is peeled off from the
 	 * original UDP-style socket or created with the accept() call on a
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index d15645ea338b..87b9417c9892 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -74,12 +74,10 @@ static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
 
 	list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) {
 		struct sctp_idatahdr *hdr;
+		__u32 mid;
 
 		lchunk->has_mid = 1;
 
-		if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-			continue;
-
 		hdr = lchunk->subh.idata_hdr;
 
 		if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)
@@ -87,10 +85,16 @@ static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
 		else
 			hdr->fsn = htonl(cfsn++);
 
-		if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
-			hdr->mid = htonl(sctp_mid_next(stream, out, sid));
-		else
-			hdr->mid = htonl(sctp_mid_peek(stream, out, sid));
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+			mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+				sctp_mid_uo_next(stream, out, sid) :
+				sctp_mid_uo_peek(stream, out, sid);
+		} else {
+			mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+				sctp_mid_next(stream, out, sid) :
+				sctp_mid_peek(stream, out, sid);
+		}
+		hdr->mid = htonl(mid);
 	}
 }
 
@@ -449,9 +453,6 @@ static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
 	struct sctp_stream *stream;
 	__u16 sid;
 
-	if (event->msg_flags & SCTP_DATA_UNORDERED)
-		return event;
-
 	stream  = &ulpq->asoc->stream;
 	sid = event->stream;
 
@@ -512,6 +513,317 @@ out_free:
 	return 0;
 }
 
+static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq,
+				     struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->reasm_uo);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = sctp_skb2event(pos);
+
+	if (event->stream == cevent->stream &&
+	    event->mid == cevent->mid &&
+	    (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+	     (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+	      event->fsn > cevent->fsn))) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	if ((event->stream == cevent->stream &&
+	     MID_lt(cevent->mid, event->mid)) ||
+	    event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		cevent = sctp_skb2event(pos);
+
+		if (event->stream < cevent->stream ||
+		    (event->stream == cevent->stream &&
+		     MID_lt(event->mid, cevent->mid)))
+			break;
+
+		if (event->stream == cevent->stream &&
+		    event->mid == cevent->mid &&
+		    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+		    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+		     event->fsn < cevent->fsn))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sctp_stream_in *sin;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	int is_last = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, sin->mid_uo))
+			continue;
+		if (MID_lt(sin->mid_uo, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			goto out;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn_uo) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = cevent->fsn + 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn++;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn_uo) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = 0;
+					is_last = 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn = 0;
+				is_last = 1;
+			}
+			goto out;
+		default:
+			goto out;
+		}
+	}
+
+out:
+	if (!first_frag)
+		return NULL;
+
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm_uo, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn_uo = next_fsn;
+		if (is_last) {
+			retval->msg_flags |= MSG_EOR;
+			sin->pd_mode_uo = 0;
+		}
+	}
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	struct sk_buff *pos, *first_frag = NULL;
+	struct sctp_ulpevent *retval = NULL;
+	struct sk_buff *pd_first = NULL;
+	struct sk_buff *pd_last = NULL;
+	struct sctp_stream_in *sin;
+	__u32 next_fsn = 0;
+	__u32 pd_point = 0;
+	__u32 pd_len = 0;
+	__u32 mid = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, event->mid))
+			continue;
+		if (MID_lt(event->mid, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (!sin->pd_mode_uo) {
+				sin->mid_uo = cevent->mid;
+				pd_first = pos;
+				pd_last = pos;
+				pd_len = pos->len;
+			}
+
+			first_frag = pos;
+			next_fsn = 0;
+			mid = cevent->mid;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				if (pd_first) {
+					pd_last = pos;
+					pd_len += pos->len;
+				}
+			} else {
+				first_frag = NULL;
+			}
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn)
+				goto found;
+			else
+				first_frag = NULL;
+			break;
+		}
+	}
+
+	if (!pd_first)
+		goto out;
+
+	pd_point = sctp_sk(asoc->base.sk)->pd_point;
+	if (pd_point && pd_point <= pd_len) {
+		retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+						     &ulpq->reasm_uo,
+						     pd_first, pd_last);
+		if (retval) {
+			sin->fsn_uo = next_fsn;
+			sin->pd_mode_uo = 1;
+		}
+	}
+	goto out;
+
+found:
+	retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+					     &ulpq->reasm_uo,
+					     first_frag, pos);
+	if (retval)
+		retval->msg_flags |= MSG_EOR;
+
+out:
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *retval = NULL;
+	struct sctp_stream_in *sin;
+
+	if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+		event->msg_flags |= MSG_EOR;
+		return event;
+	}
+
+	sctp_intl_store_reasm_uo(ulpq, event);
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+	if (sin->pd_mode_uo && event->mid == sin->mid_uo &&
+	    event->fsn == sin->fsn_uo)
+		retval = sctp_intl_retrieve_partial_uo(ulpq, event);
+
+	if (!retval)
+		retval = sctp_intl_retrieve_reassembled_uo(ulpq, event);
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
+{
+	struct sctp_stream_in *csin, *sin = NULL;
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	__u16 sid = 0;
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+		if (csin->pd_mode_uo)
+			continue;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (first_frag)
+				goto out;
+			first_frag = pos;
+			last_frag = pos;
+			next_fsn = 0;
+			sin = csin;
+			sid = cevent->stream;
+			sin->mid_uo = cevent->mid;
+			break;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag)
+				break;
+			if (cevent->stream == sid &&
+			    cevent->mid == sin->mid_uo &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				last_frag = pos;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag)
+				goto out;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!first_frag)
+		return NULL;
+
+out:
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm_uo, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn_uo = next_fsn;
+		sin->pd_mode_uo = 1;
+	}
+
+	return retval;
+}
+
 static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
 			       struct sctp_chunk *chunk, gfp_t gfp)
 {
@@ -529,12 +841,16 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
 	else
 		event->fsn = ntohl(chunk->subh.idata_hdr->fsn);
 
-	event = sctp_intl_reasm(ulpq, event);
-	if (event && event->msg_flags & MSG_EOR) {
-		skb_queue_head_init(&temp);
-		__skb_queue_tail(&temp, sctp_event2skb(event));
+	if (!(event->msg_flags & SCTP_DATA_UNORDERED)) {
+		event = sctp_intl_reasm(ulpq, event);
+		if (event && event->msg_flags & MSG_EOR) {
+			skb_queue_head_init(&temp);
+			__skb_queue_tail(&temp, sctp_event2skb(event));
 
-		event = sctp_intl_order(ulpq, event);
+			event = sctp_intl_order(ulpq, event);
+		}
+	} else {
+		event = sctp_intl_reasm_uo(ulpq, event);
 	}
 
 	if (event) {
@@ -614,14 +930,21 @@ static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 
-	if (skb_queue_empty(&ulpq->reasm))
-		return;
+	if (!skb_queue_empty(&ulpq->reasm)) {
+		do {
+			event = sctp_intl_retrieve_first(ulpq);
+			if (event)
+				sctp_enqueue_event(ulpq, event);
+		} while (event);
+	}
 
-	do {
-		event = sctp_intl_retrieve_first(ulpq);
-		if (event)
-			sctp_enqueue_event(ulpq, event);
-	} while (event);
+	if (!skb_queue_empty(&ulpq->reasm_uo)) {
+		do {
+			event = sctp_intl_retrieve_first_uo(ulpq);
+			if (event)
+				sctp_enqueue_event(ulpq, event);
+		} while (event);
+	}
 }
 
 static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
@@ -643,6 +966,9 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 		if (freed < needed)
 			freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm,
 						       needed);
+		if (freed < needed)
+			freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo,
+						       needed);
 	}
 
 	if (chunk && freed >= needed)
@@ -734,6 +1060,13 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 		struct sctp_stream_in *sin = &stream->in[sid];
 		__u32 mid;
 
+		if (sin->pd_mode_uo) {
+			sin->pd_mode_uo = 0;
+
+			mid = sin->mid_uo;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp);
+		}
+
 		if (sin->pd_mode) {
 			sin->pd_mode = 0;
 
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index dd53daab4a25..97fae53310e0 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -60,6 +60,7 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
 
 	ulpq->asoc = asoc;
 	skb_queue_head_init(&ulpq->reasm);
+	skb_queue_head_init(&ulpq->reasm_uo);
 	skb_queue_head_init(&ulpq->lobby);
 	ulpq->pd_mode  = 0;
 
@@ -83,6 +84,10 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
 		sctp_ulpevent_free(event);
 	}
 
+	while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) {
+		event = sctp_skb2event(skb);
+		sctp_ulpevent_free(event);
+	}
 }
 
 /* Dispose of a ulpqueue.  */
-- 
cgit v1.2.3


From 02db55718d53f9d426cee504c27fb768e9ed4ffe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 10 Dec 2017 17:55:02 -0800
Subject: tcp: do not overshoot window_clamp in tcp_rcv_space_adjust()

While rcvbuf is properly clamped by tcp_rmem[2], rcvwin
is left to a potentially too big value.

It has no serious effect, since :
1) tcp_grow_window() has very strict checks.
2) window_clamp can be mangled by user space to any value anyway.

tcp_init_buffer_space() and companions use tcp_full_space();
here we use tcp_win_from_space() to avoid reloading sk->sk_rcvbuf.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9550cc42de2d..746a6773c482 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -631,7 +631,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 			sk->sk_rcvbuf = rcvbuf;
 
 			/* Make the window clamp follow along.  */
-			tp->window_clamp = rcvwin;
+			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
 		}
 	}
 	tp->rcvq_space.space = copied;
-- 
cgit v1.2.3


From 607065bad9931e72207b0cac365d7d4abc06bd99 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 10 Dec 2017 17:55:03 -0800
Subject: tcp: avoid integer overflows in tcp_rcv_space_adjust()

When using large tcp_rmem[2] values (I did tests with 500 MB),
I noticed overflows while computing rcvwin.

Let's fix this before the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  2 +-
 net/ipv4/tcp_input.c | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca4a6361389b..4f93f0953c41 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -344,7 +344,7 @@ struct tcp_sock {
 
 /* Receiver queue space */
 	struct {
-		int	space;
+		u32	space;
 		u32	seq;
 		u64	time;
 	} rcvq_space;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 746a6773c482..2900e58738cd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -576,8 +576,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied;
 	int time;
-	int copied;
 
 	tcp_mstamp_refresh(tp);
 	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
@@ -600,12 +600,13 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int rcvwin, rcvmem, rcvbuf;
+		int rcvmem, rcvbuf;
+		u64 rcvwin;
 
 		/* minimal window to cope with packet losses, assuming
 		 * steady state. Add some cushion because of small variations.
 		 */
-		rcvwin = (copied << 1) + 16 * tp->advmss;
+		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
 
 		/* If rate increased by 25%,
 		 *	assume slow start, rcvwin = 3 * copied
@@ -625,8 +626,9 @@ void tcp_rcv_space_adjust(struct sock *sk)
 		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
 			rcvmem += 128;
 
-		rcvbuf = min(rcvwin / tp->advmss * rcvmem,
-			     sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+		do_div(rcvwin, tp->advmss);
+		rcvbuf = min_t(u64, rcvwin * rcvmem,
+			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 		if (rcvbuf > sk->sk_rcvbuf) {
 			sk->sk_rcvbuf = rcvbuf;
 
-- 
cgit v1.2.3


From c3916ad9320eed8eacd7c0b2cf7f881efceda892 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 10 Dec 2017 17:55:04 -0800
Subject: tcp: smoother receiver autotuning

Back in linux-3.13 (commit b0983d3c9b13 ("tcp: fix dynamic right sizing"))
I addressed the pressing issues we had with receiver autotuning.

But DRS suffers from extra latencies caused by rcv_rtt_est.rtt_us
drifts. One common problem happens during slow start, since the
apparent RTT measured by the receiver can be inflated by ~50%,
at the end of one packet train.

Also, a single drop can delay read() calls by one RTT, meaning
tcp_rcv_space_adjust() can be called one RTT too late.

By replacing the tri-modal heuristic with a continuous function,
we can offset the effects of not growing 'at the optimal time'.

The curve of the function matches prior behavior if the space
increased by 25% and 50% exactly.

The cost of the added multiply/divide is small, considering a TCP flow
typically would run this part of the code a few times in its life.

I tested this patch with 100 ms RTT / 1% loss link, 100 runs
of (netperf -l 5), and got an average throughput of 4600 Mbit
instead of 1700 Mbit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2900e58738cd..fefb46c16de7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -601,26 +601,17 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
 		int rcvmem, rcvbuf;
-		u64 rcvwin;
+		u64 rcvwin, grow;
 
 		/* minimal window to cope with packet losses, assuming
 		 * steady state. Add some cushion because of small variations.
 		 */
 		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
 
-		/* If rate increased by 25%,
-		 *	assume slow start, rcvwin = 3 * copied
-		 * If rate increased by 50%,
-		 *	assume sender can use 2x growth, rcvwin = 4 * copied
-		 */
-		if (copied >=
-		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
-			if (copied >=
-			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
-				rcvwin <<= 1;
-			else
-				rcvwin += (rcvwin >> 1);
-		}
+		/* Accommodate for sender rate increase (eg. slow start) */
+		grow = rcvwin * (copied - tp->rcvq_space.space);
+		do_div(grow, tp->rcvq_space.space);
+		rcvwin += (grow << 1);
 
 		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
-- 
cgit v1.2.3


From 1b259904a2d0ad8c57feb498932bed5171112af3 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 8 Nov 2017 08:03:04 +0100
Subject: Bluetooth: Use common error handling code in bt_init()

* Improve jump targets so that a bit of exception handling can be better
  reused at the end of this function.

* Adjust five condition checks.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/af_bluetooth.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 91e3ba280706..f044202346c6 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -766,43 +766,39 @@ static int __init bt_init(void)
 		return err;
 
 	err = sock_register(&bt_sock_family_ops);
-	if (err < 0) {
-		bt_sysfs_cleanup();
-		return err;
-	}
+	if (err)
+		goto cleanup_sysfs;
 
 	BT_INFO("HCI device and connection manager initialized");
 
 	err = hci_sock_init();
-	if (err < 0)
-		goto error;
+	if (err)
+		goto unregister_socket;
 
 	err = l2cap_init();
-	if (err < 0)
-		goto sock_err;
+	if (err)
+		goto cleanup_socket;
 
 	err = sco_init();
-	if (err < 0) {
-		l2cap_exit();
-		goto sock_err;
-	}
+	if (err)
+		goto cleanup_cap;
 
 	err = mgmt_init();
-	if (err < 0) {
-		sco_exit();
-		l2cap_exit();
-		goto sock_err;
-	}
+	if (err)
+		goto cleanup_sco;
 
 	return 0;
 
-sock_err:
+cleanup_sco:
+	sco_exit();
+cleanup_cap:
+	l2cap_exit();
+cleanup_socket:
 	hci_sock_cleanup();
-
-error:
+unregister_socket:
 	sock_unregister(PF_BLUETOOTH);
+cleanup_sysfs:
 	bt_sysfs_cleanup();
-
 	return err;
 }
 
-- 
cgit v1.2.3


From 8a950794484480641b3f3fceb89b7f6e2d1e1328 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 11 Dec 2017 12:10:33 +0200
Subject: Bluetooth: Utilize %*ph specifier

Instead of open coding byte-by-byte printing, re-use %*ph specifier.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_debugfs.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 63df63ebfb24..e204bfdb5ba2 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -106,21 +106,10 @@ static int features_show(struct seq_file *f, void *ptr)
 	u8 p;
 
 	hci_dev_lock(hdev);
-	for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) {
-		seq_printf(f, "%2u: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x "
-			   "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", p,
-			   hdev->features[p][0], hdev->features[p][1],
-			   hdev->features[p][2], hdev->features[p][3],
-			   hdev->features[p][4], hdev->features[p][5],
-			   hdev->features[p][6], hdev->features[p][7]);
-	}
+	for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++)
+		seq_printf(f, "%2u: %8ph\n", p, hdev->features[p]);
 	if (lmp_le_capable(hdev))
-		seq_printf(f, "LE: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x "
-			   "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n",
-			   hdev->le_features[0], hdev->le_features[1],
-			   hdev->le_features[2], hdev->le_features[3],
-			   hdev->le_features[4], hdev->le_features[5],
-			   hdev->le_features[6], hdev->le_features[7]);
+		seq_printf(f, "LE: %8ph\n", hdev->le_features);
 	hci_dev_unlock(hdev);
 
 	return 0;
-- 
cgit v1.2.3


From 94386b6a5b2c2102e832507ced90a14e6e3568eb Mon Sep 17 00:00:00 2001
From: Jaganath Kanakkassery <jaganath.k.os@gmail.com>
Date: Mon, 11 Dec 2017 20:26:47 +0530
Subject: Bluetooth: Remove redundant disable_advertising()

There is already a __hci_req_disable_advertising() function for disabling
advertising, so use it.

Signed-off-by: Jaganath Kanakkassery <jaganathx.kanakkassery@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index abc0f3224dd1..da59f82754bc 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1985,13 +1985,6 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
-static void disable_advertising(struct hci_request *req)
-{
-	u8 enable = 0x00;
-
-	hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-}
-
 static int active_scan(struct hci_request *req, unsigned long opt)
 {
 	uint16_t interval = opt;
@@ -2017,7 +2010,7 @@ static int active_scan(struct hci_request *req, unsigned long opt)
 		cancel_adv_timeout(hdev);
 		hci_dev_unlock(hdev);
 
-		disable_advertising(req);
+		__hci_req_disable_advertising(req);
 	}
 
 	/* If controller is scanning, it means the background scanning is
-- 
cgit v1.2.3


From 9e1e9f20ca96026c56ac613317ea4bf01c6c3385 Mon Sep 17 00:00:00 2001
From: Łukasz Rymanowski <lukasz.rymanowski@codecoup.pl>
Date: Fri, 8 Dec 2017 13:40:57 +0100
Subject: Bluetooth: Add support to advertise when connected
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far, the kernel did not allow advertising while a connection was
established. With this patch, the kernel allows it if the controller
supports it.

If controller supports non-connectable advertising when connected, then
only non-connectable advertising instances will be advertised.

Signed-off-by: Łukasz Rymanowski <lukasz.rymanowski@codecoup.pl>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 55 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index da59f82754bc..3394e6791673 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -919,6 +919,43 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
 	return true;
 }
 
+static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
+{
+	/* If there is no connection we are OK to advertise. */
+	if (hci_conn_num(hdev, LE_LINK) == 0)
+		return true;
+
+	/* Check le_states if there is any connection in slave role. */
+	if (hdev->conn_hash.le_num_slave > 0) {
+		/* Slave connection state and non connectable mode bit 20. */
+		if (!connectable && !(hdev->le_states[2] & 0x10))
+			return false;
+
+		/* Slave connection state and connectable mode bit 38
+		 * and scannable bit 21.
+		 */
+		if (connectable && (!(hdev->le_states[4] & 0x01) ||
+				    !(hdev->le_states[2] & 0x40)))
+			return false;
+	}
+
+	/* Check le_states if there is any connection in master role. */
+	if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_slave) {
+		/* Master connection state and non connectable mode bit 18. */
+		if (!connectable && !(hdev->le_states[2] & 0x02))
+			return false;
+
+		/* Master connection state and connectable mode bit 35 and
+		 * scannable 19.
+		 */
+		if (connectable && (!(hdev->le_states[4] & 0x10) ||
+				    !(hdev->le_states[2] & 0x08)))
+			return false;
+	}
+
+	return true;
+}
+
 void __hci_req_enable_advertising(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -927,7 +964,15 @@ void __hci_req_enable_advertising(struct hci_request *req)
 	bool connectable;
 	u32 flags;
 
-	if (hci_conn_num(hdev, LE_LINK) > 0)
+	flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
+
+	/* If the "connectable" instance flag was not set, then choose between
+	 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+	 */
+	connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+		      mgmt_get_connectable(hdev);
+
+	if (!is_advertising_allowed(hdev, connectable))
 		return;
 
 	if (hci_dev_test_flag(hdev, HCI_LE_ADV))
@@ -940,14 +985,6 @@ void __hci_req_enable_advertising(struct hci_request *req)
 	 */
 	hci_dev_clear_flag(hdev, HCI_LE_ADV);
 
-	flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
-
-	/* If the "connectable" instance flag was not set, then choose between
-	 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
-	 */
-	connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
-		      mgmt_get_connectable(hdev);
-
 	/* Set require_privacy to true only when non-connectable
 	 * advertising is used. In that case it is fine to use a
 	 * non-resolvable private address.
-- 
cgit v1.2.3


From 039af9c66b93154b493e3088a36b251b99c9b3c4 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 11 Dec 2017 15:35:03 -0800
Subject: net_sched: switch to exit_batch for action pernet ops

Since we now hold the RTNL lock in tc_action_net_exit(), it is good to
batch them to speed up tc action dismantle.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      | 13 ++++++++++---
 net/sched/act_bpf.c        |  8 +++-----
 net/sched/act_connmark.c   |  8 +++-----
 net/sched/act_csum.c       |  8 +++-----
 net/sched/act_gact.c       |  8 +++-----
 net/sched/act_ife.c        |  8 +++-----
 net/sched/act_ipt.c        | 16 ++++++----------
 net/sched/act_mirred.c     |  8 +++-----
 net/sched/act_nat.c        |  8 +++-----
 net/sched/act_pedit.c      |  8 +++-----
 net/sched/act_police.c     |  8 +++-----
 net/sched/act_sample.c     |  8 +++-----
 net/sched/act_simple.c     |  8 +++-----
 net/sched/act_skbedit.c    |  8 +++-----
 net/sched/act_skbmod.c     |  8 +++-----
 net/sched/act_tunnel_key.c |  8 +++-----
 net/sched/act_vlan.c       |  8 +++-----
 17 files changed, 61 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 02bf409140d0..6ed9692f20bd 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -120,12 +120,19 @@ int tc_action_net_init(struct tc_action_net *tn,
 void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
 			 struct tcf_idrinfo *idrinfo);
 
-static inline void tc_action_net_exit(struct tc_action_net *tn)
+static inline void tc_action_net_exit(struct list_head *net_list,
+				      unsigned int id)
 {
+	struct net *net;
+
 	rtnl_lock();
-	tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
+	list_for_each_entry(net, net_list, exit_list) {
+		struct tc_action_net *tn = net_generic(net, id);
+
+		tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
+		kfree(tn->idrinfo);
+	}
 	rtnl_unlock();
-	kfree(tn->idrinfo);
 }
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index e6c477fa9ca5..b3f2c15affa7 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -401,16 +401,14 @@ static __net_init int bpf_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_bpf_ops);
 }
 
-static void __net_exit bpf_exit_net(struct net *net)
+static void __net_exit bpf_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, bpf_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, bpf_net_id);
 }
 
 static struct pernet_operations bpf_net_ops = {
 	.init = bpf_init_net,
-	.exit = bpf_exit_net,
+	.exit_batch = bpf_exit_net,
 	.id   = &bpf_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 10b7a8855a6c..2b15ba84e0c8 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -209,16 +209,14 @@ static __net_init int connmark_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_connmark_ops);
 }
 
-static void __net_exit connmark_exit_net(struct net *net)
+static void __net_exit connmark_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, connmark_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, connmark_net_id);
 }
 
 static struct pernet_operations connmark_net_ops = {
 	.init = connmark_init_net,
-	.exit = connmark_exit_net,
+	.exit_batch = connmark_exit_net,
 	.id   = &connmark_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d836f998117b..af4b8ec60d9a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -635,16 +635,14 @@ static __net_init int csum_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_csum_ops);
 }
 
-static void __net_exit csum_exit_net(struct net *net)
+static void __net_exit csum_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, csum_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, csum_net_id);
 }
 
 static struct pernet_operations csum_net_ops = {
 	.init = csum_init_net,
-	.exit = csum_exit_net,
+	.exit_batch = csum_exit_net,
 	.id   = &csum_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48ef7fc3..9d632e92cad0 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -235,16 +235,14 @@ static __net_init int gact_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_gact_ops);
 }
 
-static void __net_exit gact_exit_net(struct net *net)
+static void __net_exit gact_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, gact_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, gact_net_id);
 }
 
 static struct pernet_operations gact_net_ops = {
 	.init = gact_init_net,
-	.exit = gact_exit_net,
+	.exit_batch = gact_exit_net,
 	.id   = &gact_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index dee9cf22686c..5954e992685a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -858,16 +858,14 @@ static __net_init int ife_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_ife_ops);
 }
 
-static void __net_exit ife_exit_net(struct net *net)
+static void __net_exit ife_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, ife_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, ife_net_id);
 }
 
 static struct pernet_operations ife_net_ops = {
 	.init = ife_init_net,
-	.exit = ife_exit_net,
+	.exit_batch = ife_exit_net,
 	.id   = &ife_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 2479b255dc1d..06e380ae0928 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -337,16 +337,14 @@ static __net_init int ipt_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_ipt_ops);
 }
 
-static void __net_exit ipt_exit_net(struct net *net)
+static void __net_exit ipt_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, ipt_net_id);
 }
 
 static struct pernet_operations ipt_net_ops = {
 	.init = ipt_init_net,
-	.exit = ipt_exit_net,
+	.exit_batch = ipt_exit_net,
 	.id   = &ipt_net_id,
 	.size = sizeof(struct tc_action_net),
 };
@@ -387,16 +385,14 @@ static __net_init int xt_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_xt_ops);
 }
 
-static void __net_exit xt_exit_net(struct net *net)
+static void __net_exit xt_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, xt_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, xt_net_id);
 }
 
 static struct pernet_operations xt_net_ops = {
 	.init = xt_init_net,
-	.exit = xt_exit_net,
+	.exit_batch = xt_exit_net,
 	.id   = &xt_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index cee2d413bf57..37e5e4decbd6 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -334,16 +334,14 @@ static __net_init int mirred_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_mirred_ops);
 }
 
-static void __net_exit mirred_exit_net(struct net *net)
+static void __net_exit mirred_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, mirred_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, mirred_net_id);
 }
 
 static struct pernet_operations mirred_net_ops = {
 	.init = mirred_init_net,
-	.exit = mirred_exit_net,
+	.exit_batch = mirred_exit_net,
 	.id   = &mirred_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c365d01b99c8..98c6a4b2f523 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -310,16 +310,14 @@ static __net_init int nat_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_nat_ops);
 }
 
-static void __net_exit nat_exit_net(struct net *net)
+static void __net_exit nat_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, nat_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, nat_net_id);
 }
 
 static struct pernet_operations nat_net_ops = {
 	.init = nat_init_net,
-	.exit = nat_exit_net,
+	.exit_batch = nat_exit_net,
 	.id   = &nat_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index dba996bcd6dc..349beaffb29e 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -453,16 +453,14 @@ static __net_init int pedit_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_pedit_ops);
 }
 
-static void __net_exit pedit_exit_net(struct net *net)
+static void __net_exit pedit_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, pedit_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, pedit_net_id);
 }
 
 static struct pernet_operations pedit_net_ops = {
 	.init = pedit_init_net,
-	.exit = pedit_exit_net,
+	.exit_batch = pedit_exit_net,
 	.id   = &pedit_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 3bb2ebf9e9ae..bf483db993a1 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -334,16 +334,14 @@ static __net_init int police_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_police_ops);
 }
 
-static void __net_exit police_exit_net(struct net *net)
+static void __net_exit police_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, police_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, police_net_id);
 }
 
 static struct pernet_operations police_net_ops = {
 	.init = police_init_net,
-	.exit = police_exit_net,
+	.exit_batch = police_exit_net,
 	.id   = &police_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 859a93903339..1ba0df238756 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -236,16 +236,14 @@ static __net_init int sample_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_sample_ops);
 }
 
-static void __net_exit sample_exit_net(struct net *net)
+static void __net_exit sample_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, sample_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, sample_net_id);
 }
 
 static struct pernet_operations sample_net_ops = {
 	.init = sample_init_net,
-	.exit = sample_exit_net,
+	.exit_batch = sample_exit_net,
 	.id   = &sample_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index eda57b47a6b6..425eac11f6da 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -204,16 +204,14 @@ static __net_init int simp_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_simp_ops);
 }
 
-static void __net_exit simp_exit_net(struct net *net)
+static void __net_exit simp_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, simp_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, simp_net_id);
 }
 
 static struct pernet_operations simp_net_ops = {
 	.init = simp_init_net,
-	.exit = simp_exit_net,
+	.exit_batch = simp_exit_net,
 	.id   = &simp_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 59949d61f20d..5a3f691bb545 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -241,16 +241,14 @@ static __net_init int skbedit_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_skbedit_ops);
 }
 
-static void __net_exit skbedit_exit_net(struct net *net)
+static void __net_exit skbedit_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, skbedit_net_id);
 }
 
 static struct pernet_operations skbedit_net_ops = {
 	.init = skbedit_init_net,
-	.exit = skbedit_exit_net,
+	.exit_batch = skbedit_exit_net,
 	.id   = &skbedit_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index f090bba1a79e..fa975262dbac 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -266,16 +266,14 @@ static __net_init int skbmod_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_skbmod_ops);
 }
 
-static void __net_exit skbmod_exit_net(struct net *net)
+static void __net_exit skbmod_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, skbmod_net_id);
 }
 
 static struct pernet_operations skbmod_net_ops = {
 	.init = skbmod_init_net,
-	.exit = skbmod_exit_net,
+	.exit_batch = skbmod_exit_net,
 	.id   = &skbmod_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 57b63bdec3ae..0e23aac09ad6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -325,16 +325,14 @@ static __net_init int tunnel_key_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_tunnel_key_ops);
 }
 
-static void __net_exit tunnel_key_exit_net(struct net *net)
+static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, tunnel_key_net_id);
 }
 
 static struct pernet_operations tunnel_key_net_ops = {
 	.init = tunnel_key_init_net,
-	.exit = tunnel_key_exit_net,
+	.exit_batch = tunnel_key_exit_net,
 	.id   = &tunnel_key_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 41f0878ad26e..e1a1b3f3983a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -301,16 +301,14 @@ static __net_init int vlan_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_vlan_ops);
 }
 
-static void __net_exit vlan_exit_net(struct net *net)
+static void __net_exit vlan_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, vlan_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, vlan_net_id);
 }
 
 static struct pernet_operations vlan_net_ops = {
 	.init = vlan_init_net,
-	.exit = vlan_exit_net,
+	.exit_batch = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct tc_action_net),
 };
-- 
cgit v1.2.3


From b4f70c3d4ec32a2ff4c62e1e2da0da5f55fe12bd Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 11 Dec 2017 15:42:53 -0800
Subject: tcp: allow TLP in ECN CWR

This patch enables tail loss probe in cwnd reduction (CWR) state
to detect potential losses. Prior to this patch, since the sender
uses PRR to determine the cwnd in CWR state, the combination of
CWR+PRR plus tcp_tso_should_defer() could cause unnecessary stalls
upon losses: PRR makes cwnd so gentle that tcp_tso_should_defer()
defers sending to wait for more ACKs. The ACKs may not come due to
packet losses.

Disallowing TLP when there is unused cwnd had the primary effect
of disallowing TLP when there is TSO deferral, Nagle deferral,
or we hit the rwin limit. This is because basically every application
write() or incoming ACK will cause us to run tcp_write_xmit()
to see if we can send more, and then if we sent something we call
tcp_schedule_loss_probe() to see if we should schedule a TLP. At
that point, there are a few common reasons why some cwnd budget
could still be unused:

(a) rwin limit
(b) nagle check
(c) TSO deferral
(d) TSQ

For (d), after the next packet tx completion the TSQ mechanism
will allow us to send more packets, so we don't really need a
TLP (in practice it shouldn't matter whether we schedule one
or not). But for (a), (b), (c) the sender won't send any more
packets until it gets another ACK. But if the whole flight was
lost, or all the ACKs were lost, then we won't get any more ACKs,
and ideally we should schedule and send a TLP to get more feedback.
In particular, for a long time we have wanted some kind of timer for
TSO deferral, and at least this would give us some kind of timer.

Reported-by: Steve Ibanez <sibanez@stanford.edu>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Nandita Dukkipati <nanditad@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c7b506..04be9f833927 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2414,15 +2414,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 
 	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
-	 * in Open state, that are either limited by cwnd or application.
+	 * not in loss recovery, that are either limited by cwnd or application.
 	 */
 	if ((early_retrans != 3 && early_retrans != 4) ||
 	    !tp->packets_out || !tcp_is_sack(tp) ||
-	    icsk->icsk_ca_state != TCP_CA_Open)
-		return false;
-
-	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-	     !tcp_write_queue_empty(sk))
+	    (icsk->icsk_ca_state != TCP_CA_Open &&
+	     icsk->icsk_ca_state != TCP_CA_CWR))
 		return false;
 
 	/* Probe timeout is 2*rtt. Add minimum RTO to account
-- 
cgit v1.2.3


From 22b371cbb949e1c8ee4accfead5ee9f3e7f0c114 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Nov 2017 23:15:46 +0200
Subject: Bluetooth: introduce DEFINE_SHOW_ATTRIBUTE() macro

This macro deduplicates a lot of similar code across the hci_debugfs.c
module. It is intended to be moved to seq_file.h eventually.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_debugfs.c | 184 +++++---------------------------------------
 1 file changed, 18 insertions(+), 166 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index e204bfdb5ba2..57403bd567d0 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -88,6 +88,9 @@ static int __name ## _show(struct seq_file *f, void *ptr)		      \
 	return 0;							      \
 }									      \
 									      \
+DEFINE_SHOW_ATTRIBUTE(__name)
+
+#define DEFINE_SHOW_ATTRIBUTE(__name)					      \
 static int __name ## _open(struct inode *inode, struct file *file)	      \
 {									      \
 	return single_open(file, __name ## _show, inode->i_private);	      \
@@ -115,17 +118,7 @@ static int features_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int features_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, features_show, inode->i_private);
-}
-
-static const struct file_operations features_fops = {
-	.open		= features_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(features);
 
 static int device_id_show(struct seq_file *f, void *ptr)
 {
@@ -139,17 +132,7 @@ static int device_id_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int device_id_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, device_id_show, inode->i_private);
-}
-
-static const struct file_operations device_id_fops = {
-	.open		= device_id_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(device_id);
 
 static int device_list_show(struct seq_file *f, void *ptr)
 {
@@ -169,17 +152,7 @@ static int device_list_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int device_list_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, device_list_show, inode->i_private);
-}
-
-static const struct file_operations device_list_fops = {
-	.open		= device_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(device_list);
 
 static int blacklist_show(struct seq_file *f, void *p)
 {
@@ -194,17 +167,7 @@ static int blacklist_show(struct seq_file *f, void *p)
 	return 0;
 }
 
-static int blacklist_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, blacklist_show, inode->i_private);
-}
-
-static const struct file_operations blacklist_fops = {
-	.open		= blacklist_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(blacklist);
 
 static int uuids_show(struct seq_file *f, void *p)
 {
@@ -229,17 +192,7 @@ static int uuids_show(struct seq_file *f, void *p)
        return 0;
 }
 
-static int uuids_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uuids_show, inode->i_private);
-}
-
-static const struct file_operations uuids_fops = {
-	.open		= uuids_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(uuids);
 
 static int remote_oob_show(struct seq_file *f, void *ptr)
 {
@@ -258,17 +211,7 @@ static int remote_oob_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int remote_oob_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, remote_oob_show, inode->i_private);
-}
-
-static const struct file_operations remote_oob_fops = {
-	.open		= remote_oob_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(remote_oob);
 
 static int conn_info_min_age_set(void *data, u64 val)
 {
@@ -432,17 +375,7 @@ static int inquiry_cache_show(struct seq_file *f, void *p)
 	return 0;
 }
 
-static int inquiry_cache_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, inquiry_cache_show, inode->i_private);
-}
-
-static const struct file_operations inquiry_cache_fops = {
-	.open		= inquiry_cache_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(inquiry_cache);
 
 static int link_keys_show(struct seq_file *f, void *ptr)
 {
@@ -458,17 +391,7 @@ static int link_keys_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int link_keys_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, link_keys_show, inode->i_private);
-}
-
-static const struct file_operations link_keys_fops = {
-	.open		= link_keys_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(link_keys);
 
 static int dev_class_show(struct seq_file *f, void *ptr)
 {
@@ -482,17 +405,7 @@ static int dev_class_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int dev_class_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, dev_class_show, inode->i_private);
-}
-
-static const struct file_operations dev_class_fops = {
-	.open		= dev_class_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(dev_class);
 
 static int voice_setting_get(void *data, u64 *val)
 {
@@ -681,17 +594,7 @@ static int identity_show(struct seq_file *f, void *p)
 	return 0;
 }
 
-static int identity_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, identity_show, inode->i_private);
-}
-
-static const struct file_operations identity_fops = {
-	.open		= identity_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(identity);
 
 static int rpa_timeout_set(void *data, u64 val)
 {
@@ -735,17 +638,7 @@ static int random_address_show(struct seq_file *f, void *p)
 	return 0;
 }
 
-static int random_address_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, random_address_show, inode->i_private);
-}
-
-static const struct file_operations random_address_fops = {
-	.open		= random_address_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(random_address);
 
 static int static_address_show(struct seq_file *f, void *p)
 {
@@ -758,17 +651,7 @@ static int static_address_show(struct seq_file *f, void *p)
 	return 0;
 }
 
-static int static_address_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, static_address_show, inode->i_private);
-}
-
-static const struct file_operations static_address_fops = {
-	.open		= static_address_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(static_address);
 
 static ssize_t force_static_address_read(struct file *file,
 					 char __user *user_buf,
@@ -830,17 +713,7 @@ static int white_list_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int white_list_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, white_list_show, inode->i_private);
-}
-
-static const struct file_operations white_list_fops = {
-	.open		= white_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(white_list);
 
 static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
 {
@@ -858,18 +731,7 @@ static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int identity_resolving_keys_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, identity_resolving_keys_show,
-			   inode->i_private);
-}
-
-static const struct file_operations identity_resolving_keys_fops = {
-	.open		= identity_resolving_keys_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(identity_resolving_keys);
 
 static int long_term_keys_show(struct seq_file *f, void *ptr)
 {
@@ -887,17 +749,7 @@ static int long_term_keys_show(struct seq_file *f, void *ptr)
 	return 0;
 }
 
-static int long_term_keys_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, long_term_keys_show, inode->i_private);
-}
-
-static const struct file_operations long_term_keys_fops = {
-	.open		= long_term_keys_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(long_term_keys);
 
 static int conn_min_interval_set(void *data, u64 val)
 {
-- 
cgit v1.2.3


From ec94c2696f0bcd5ae92a553244e4ac30d2171a2d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Dec 2017 21:25:12 -0800
Subject: tcp/dccp: avoid one atomic operation for timewait hashdance

First, rename __inet_twsk_hashdance() to inet_twsk_hashdance().

Then, remove one inet_twsk_put() by setting tw_refcnt to 3 instead
of 4, while adding a prominent warning that we no longer have the right
to access tw after inet_twsk_hashdance().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h |  4 ++--
 net/dccp/minisocks.c             |  7 ++++---
 net/ipv4/inet_timewait_sock.c    | 27 +++++++++++++--------------
 net/ipv4/tcp_minisocks.c         |  7 ++++---
 4 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 1356fa6a7566..899495589a7e 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -93,8 +93,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 					   struct inet_timewait_death_row *dr,
 					   const int state);
 
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
-			   struct inet_hashinfo *hashinfo);
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+			 struct inet_hashinfo *hashinfo);
 
 void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo,
 			  bool rearm);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 178bb9833311..37ccbe62eb1a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -63,9 +63,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 		 */
 		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
-		/* Linkage updates. */
-		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
-		inet_twsk_put(tw);
+		/* Linkage updates.
+		 * Note that access to tw after this point is illegal.
+		 */
+		inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
 		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index b563e0c46bac..277ff69a312d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
  * Essentially we whip up a timewait bucket, copy the relevant info into it
  * from the SK, and mess with hash chains and list linkage.
  */
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 			   struct inet_hashinfo *hashinfo)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	spin_lock(lock);
 
-	/*
-	 * Step 2: Hash TW into tcp ehash chain.
-	 * Notes :
-	 * - tw_refcnt is set to 4 because :
-	 * - We have one reference from bhash chain.
-	 * - We have one reference from ehash chain.
-	 * - We have one reference from timer.
-	 * - One reference for ourself (our caller will release it).
-	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
-	 * committed into memory all tw fields.
-	 */
-	refcount_set(&tw->tw_refcnt, 4);
 	inet_twsk_add_node_rcu(tw, &ehead->chain);
 
 	/* Step 3: Remove SK from hash chain */
@@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
 	spin_unlock(lock);
+
+	/* tw_refcnt is set to 3 because we have :
+	 * - one reference for bhash chain.
+	 * - one reference for ehash chain.
+	 * - one reference for timer.
+	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
+	 * committed into memory all tw fields.
+	 * Also note that after this point, we lost our implicit reference
+	 * so we are not allowed to use tw anymore.
+	 */
+	refcount_set(&tw->tw_refcnt, 3);
 }
-EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
 
 static void tw_timer_handler(struct timer_list *t)
 {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b079b619b60c..a8384b0c11f8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 */
 		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
-		/* Linkage updates. */
-		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
-		inet_twsk_put(tw);
+		/* Linkage updates.
+		 * Note that access to tw after this point is illegal.
+		 */
+		inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
-- 
cgit v1.2.3


From eb7935830d00b9e0c4ca11382143ea2320eb45c2 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 12 Dec 2017 16:02:50 +0200
Subject: net: bridge: use rhashtable for fdbs

Before this patch the bridge used a fixed 256 element hash table which
was fine for small use cases (in my tests it starts to degrade
above 1000 entries), but it wasn't enough for medium or large
scale deployments. Modern setups have thousands of participants in a
single bridge; even just enabling vlans and adding a few thousand vlan
entries will cause a few thousand fdbs to be automatically inserted per
participating port. So we need to scale the fdb table considerably to
cope with modern workloads, and this patch converts it to use a
rhashtable for its operations thus improving the bridge scalability.
Tests show the following results (10 runs each): at up to 1000 entries
rhashtable is ~3% slower, at 2000 rhashtable is 30% faster, at 3000 it
is 2 times faster and at 30000 it is 50 times faster.
Obviously this happens because of the properties of the two constructs
and is expected, rhashtable keeps pretty much a constant time even with
10000000 entries (tested), while the fixed hash table struggles
considerably even above 10000.
As a side effect this also reduces the net_bridge struct size from 3248
bytes to 1344 bytes. Also note that the key struct is 8 bytes.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/bridge.h |   4 +-
 net/bridge/br_device.c        |  10 ++
 net/bridge/br_fdb.c           | 392 ++++++++++++++++++++----------------------
 net/bridge/br_private.h       |  16 +-
 net/bridge/br_switchdev.c     |   8 +-
 5 files changed, 211 insertions(+), 219 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
index 1bee3e7fdf32..8ea966448b58 100644
--- a/include/trace/events/bridge.h
+++ b/include/trace/events/bridge.h
@@ -82,8 +82,8 @@ TRACE_EVENT(fdb_delete,
 	TP_fast_assign(
 		__assign_str(br_dev, br->dev->name);
 		__assign_str(dev, f->dst ? f->dst->dev->name : "null");
-		memcpy(__entry->addr, f->addr.addr, ETH_ALEN);
-		__entry->vid = f->vlan_id;
+		memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN);
+		__entry->vid = f->key.vlan_id;
 	),
 
 	TP_printk("br_dev %s dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u",
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index af5b8c87f590..1285ca30ab0a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -125,9 +125,16 @@ static int br_dev_init(struct net_device *dev)
 	if (!br->stats)
 		return -ENOMEM;
 
+	err = br_fdb_hash_init(br);
+	if (err) {
+		free_percpu(br->stats);
+		return err;
+	}
+
 	err = br_vlan_init(br);
 	if (err) {
 		free_percpu(br->stats);
+		br_fdb_hash_fini(br);
 		return err;
 	}
 
@@ -135,6 +142,7 @@ static int br_dev_init(struct net_device *dev)
 	if (err) {
 		free_percpu(br->stats);
 		br_vlan_flush(br);
+		br_fdb_hash_fini(br);
 	}
 	br_set_lockdep_class(dev);
 
@@ -148,6 +156,7 @@ static void br_dev_uninit(struct net_device *dev)
 	br_multicast_dev_del(br);
 	br_multicast_uninit_stats(br);
 	br_vlan_flush(br);
+	br_fdb_hash_fini(br);
 	free_percpu(br->stats);
 }
 
@@ -416,6 +425,7 @@ void br_dev_setup(struct net_device *dev)
 	br->dev = dev;
 	spin_lock_init(&br->lock);
 	INIT_LIST_HEAD(&br->port_list);
+	INIT_HLIST_HEAD(&br->fdb_list);
 	spin_lock_init(&br->hash_lock);
 
 	br->bridge_id.prio[0] = 0x80;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 4ea5c8bbe286..dc87fbc9a23b 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,14 +28,20 @@
 #include <trace/events/bridge.h>
 #include "br_private.h"
 
+static const struct rhashtable_params br_fdb_rht_params = {
+	.head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
+	.key_offset = offsetof(struct net_bridge_fdb_entry, key),
+	.key_len = sizeof(struct net_bridge_fdb_key),
+	.automatic_shrinking = true,
+	.locks_mul = 1,
+};
+
 static struct kmem_cache *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *, int);
 
-static u32 fdb_salt __read_mostly;
-
 int __init br_fdb_init(void)
 {
 	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
@@ -45,7 +51,6 @@ int __init br_fdb_init(void)
 	if (!br_fdb_cache)
 		return -ENOMEM;
 
-	get_random_bytes(&fdb_salt, sizeof(fdb_salt));
 	return 0;
 }
 
@@ -54,6 +59,15 @@ void br_fdb_fini(void)
 	kmem_cache_destroy(br_fdb_cache);
 }
 
+int br_fdb_hash_init(struct net_bridge *br)
+{
+	return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params);
+}
+
+void br_fdb_hash_fini(struct net_bridge *br)
+{
+	rhashtable_destroy(&br->fdb_hash_tbl);
+}
 
 /* if topology_changing then use forward_delay (default 15 sec)
  * otherwise keep longer (default 5 minutes)
@@ -70,13 +84,6 @@ static inline int has_expired(const struct net_bridge *br,
 		time_before_eq(fdb->updated + hold_time(br), jiffies);
 }
 
-static inline int br_mac_hash(const unsigned char *mac, __u16 vid)
-{
-	/* use 1 byte of OUI and 3 bytes of NIC */
-	u32 key = get_unaligned((u32 *)(mac + 2));
-	return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1);
-}
-
 static void fdb_rcu_free(struct rcu_head *head)
 {
 	struct net_bridge_fdb_entry *ent
@@ -84,19 +91,18 @@ static void fdb_rcu_free(struct rcu_head *head)
 	kmem_cache_free(br_fdb_cache, ent);
 }
 
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl,
 						 const unsigned char *addr,
 						 __u16 vid)
 {
-	struct net_bridge_fdb_entry *f;
+	struct net_bridge_fdb_key key;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	hlist_for_each_entry_rcu(f, head, hlist)
-		if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
-			break;
+	key.vlan_id = vid;
+	memcpy(key.addr.addr, addr, sizeof(key.addr.addr));
 
-	return f;
+	return rhashtable_lookup(tbl, &key, br_fdb_rht_params);
 }
 
 /* requires bridge hash_lock */
@@ -104,13 +110,12 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
 						const unsigned char *addr,
 						__u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
 	lockdep_assert_held_once(&br->hash_lock);
 
 	rcu_read_lock();
-	fdb = fdb_find_rcu(head, addr, vid);
+	fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 	rcu_read_unlock();
 
 	return fdb;
@@ -120,9 +125,7 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
 					     const unsigned char *addr,
 					     __u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
-
-	return fdb_find_rcu(head, addr, vid);
+	return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 }
 
 /* When a static FDB entry is added, the mac address from the entry is
@@ -175,9 +178,11 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	trace_fdb_delete(br, f);
 
 	if (f->is_static)
-		fdb_del_hw_addr(br, f->addr.addr);
+		fdb_del_hw_addr(br, f->key.addr.addr);
 
-	hlist_del_init_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->fdb_node);
+	rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
+			       br_fdb_rht_params);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -187,11 +192,11 @@ static void fdb_delete_local(struct net_bridge *br,
 			     const struct net_bridge_port *p,
 			     struct net_bridge_fdb_entry *f)
 {
-	const unsigned char *addr = f->addr.addr;
+	const unsigned char *addr = f->key.addr.addr;
 	struct net_bridge_vlan_group *vg;
 	const struct net_bridge_vlan *v;
 	struct net_bridge_port *op;
-	u16 vid = f->vlan_id;
+	u16 vid = f->key.vlan_id;
 
 	/* Maybe another port has same hw addr? */
 	list_for_each_entry(op, &br->port_list, list) {
@@ -233,31 +238,23 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 {
 	struct net_bridge_vlan_group *vg;
+	struct net_bridge_fdb_entry *f;
 	struct net_bridge *br = p->br;
 	struct net_bridge_vlan *v;
-	int i;
 
 	spin_lock_bh(&br->hash_lock);
-
 	vg = nbp_vlan_group(p);
-	/* Search all chains since old address/hash is unknown */
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct hlist_node *h;
-		hlist_for_each(h, &br->hash[i]) {
-			struct net_bridge_fdb_entry *f;
-
-			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
-			if (f->dst == p && f->is_local && !f->added_by_user) {
-				/* delete old one */
-				fdb_delete_local(br, p, f);
-
-				/* if this port has no vlan information
-				 * configured, we can safely be done at
-				 * this point.
-				 */
-				if (!vg || !vg->num_vlans)
-					goto insert;
-			}
+	hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
+		if (f->dst == p && f->is_local && !f->added_by_user) {
+			/* delete old one */
+			fdb_delete_local(br, p, f);
+
+			/* if this port has no vlan information
+			 * configured, we can safely be done at
+			 * this point.
+			 */
+			if (!vg || !vg->num_vlans)
+				goto insert;
 		}
 	}
 
@@ -316,35 +313,32 @@ void br_fdb_cleanup(struct work_struct *work)
 {
 	struct net_bridge *br = container_of(work, struct net_bridge,
 					     gc_work.work);
+	struct net_bridge_fdb_entry *f = NULL;
 	unsigned long delay = hold_time(br);
 	unsigned long work_delay = delay;
 	unsigned long now = jiffies;
-	int i;
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-		struct hlist_node *n;
+	/* this part is tricky, in order to avoid blocking learning and
+	 * consequently forwarding, we rely on rcu to delete objects with
+	 * delayed freeing allowing us to continue traversing
+	 */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		unsigned long this_timer;
 
-		if (!br->hash[i].first)
+		if (f->is_static || f->added_by_external_learn)
 			continue;
-
-		spin_lock_bh(&br->hash_lock);
-		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
-			unsigned long this_timer;
-
-			if (f->is_static)
-				continue;
-			if (f->added_by_external_learn)
-				continue;
-			this_timer = f->updated + delay;
-			if (time_after(this_timer, now))
-				work_delay = min(work_delay, this_timer - now);
-			else
+		this_timer = f->updated + delay;
+		if (time_after(this_timer, now)) {
+			work_delay = min(work_delay, this_timer - now);
+		} else {
+			spin_lock_bh(&br->hash_lock);
+			if (!hlist_unhashed(&f->fdb_node))
 				fdb_delete(br, f);
+			spin_unlock_bh(&br->hash_lock);
 		}
-		spin_unlock_bh(&br->hash_lock);
-		cond_resched();
 	}
+	rcu_read_unlock();
 
 	/* Cleanup minimum 10 milliseconds apart */
 	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
@@ -354,16 +348,13 @@ void br_fdb_cleanup(struct work_struct *work)
 /* Completely flush all dynamic entries in forwarding database.*/
 void br_fdb_flush(struct net_bridge *br)
 {
-	int i;
+	struct net_bridge_fdb_entry *f;
+	struct hlist_node *tmp;
 
 	spin_lock_bh(&br->hash_lock);
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-		struct hlist_node *n;
-		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
-			if (!f->is_static)
-				fdb_delete(br, f);
-		}
+	hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+		if (!f->is_static)
+			fdb_delete(br, f);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -377,27 +368,22 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			   u16 vid,
 			   int do_all)
 {
-	int i;
+	struct net_bridge_fdb_entry *f;
+	struct hlist_node *tmp;
 
 	spin_lock_bh(&br->hash_lock);
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct hlist_node *h, *g;
+	hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+		if (f->dst != p)
+			continue;
 
-		hlist_for_each_safe(h, g, &br->hash[i]) {
-			struct net_bridge_fdb_entry *f
-				= hlist_entry(h, struct net_bridge_fdb_entry, hlist);
-			if (f->dst != p)
+		if (!do_all)
+			if (f->is_static || (vid && f->key.vlan_id != vid))
 				continue;
 
-			if (!do_all)
-				if (f->is_static || (vid && f->vlan_id != vid))
-					continue;
-
-			if (f->is_local)
-				fdb_delete_local(br, p, f);
-			else
-				fdb_delete(br, f);
-		}
+		if (f->is_local)
+			fdb_delete_local(br, p, f);
+		else
+			fdb_delete(br, f);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -433,52 +419,48 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 		   unsigned long maxnum, unsigned long skip)
 {
-	struct __fdb_entry *fe = buf;
-	int i, num = 0;
 	struct net_bridge_fdb_entry *f;
+	struct __fdb_entry *fe = buf;
+	int num = 0;
 
 	memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
 
 	rcu_read_lock();
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
-			if (num >= maxnum)
-				goto out;
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		if (num >= maxnum)
+			break;
 
-			if (has_expired(br, f))
-				continue;
+		if (has_expired(br, f))
+			continue;
 
-			/* ignore pseudo entry for local MAC address */
-			if (!f->dst)
-				continue;
+		/* ignore pseudo entry for local MAC address */
+		if (!f->dst)
+			continue;
 
-			if (skip) {
-				--skip;
-				continue;
-			}
+		if (skip) {
+			--skip;
+			continue;
+		}
 
-			/* convert from internal format to API */
-			memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
+		/* convert from internal format to API */
+		memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN);
 
-			/* due to ABI compat need to split into hi/lo */
-			fe->port_no = f->dst->port_no;
-			fe->port_hi = f->dst->port_no >> 8;
+		/* due to ABI compat need to split into hi/lo */
+		fe->port_no = f->dst->port_no;
+		fe->port_hi = f->dst->port_no >> 8;
 
-			fe->is_local = f->is_local;
-			if (!f->is_static)
-				fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
-			++fe;
-			++num;
-		}
+		fe->is_local = f->is_local;
+		if (!f->is_static)
+			fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
+		++fe;
+		++num;
 	}
-
- out:
 	rcu_read_unlock();
 
 	return num;
 }
 
-static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
 					       struct net_bridge_port *source,
 					       const unsigned char *addr,
 					       __u16 vid,
@@ -489,16 +471,23 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 
 	fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
 	if (fdb) {
-		memcpy(fdb->addr.addr, addr, ETH_ALEN);
+		memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
 		fdb->dst = source;
-		fdb->vlan_id = vid;
+		fdb->key.vlan_id = vid;
 		fdb->is_local = is_local;
 		fdb->is_static = is_static;
 		fdb->added_by_user = 0;
 		fdb->added_by_external_learn = 0;
 		fdb->offloaded = 0;
 		fdb->updated = fdb->used = jiffies;
-		hlist_add_head_rcu(&fdb->hlist, head);
+		if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
+						  &fdb->rhnode,
+						  br_fdb_rht_params)) {
+			kmem_cache_free(br_fdb_cache, fdb);
+			fdb = NULL;
+		} else {
+			hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
+		}
 	}
 	return fdb;
 }
@@ -506,7 +495,6 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		  const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
 	if (!is_valid_ether_addr(addr))
@@ -524,7 +512,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		fdb_delete(br, fdb);
 	}
 
-	fdb = fdb_create(head, source, addr, vid, 1, 1);
+	fdb = fdb_create(br, source, addr, vid, 1, 1);
 	if (!fdb)
 		return -ENOMEM;
 
@@ -548,7 +536,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		   const unsigned char *addr, u16 vid, bool added_by_user)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 	bool fdb_modified = false;
 
@@ -561,7 +548,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 	      source->state == BR_STATE_FORWARDING))
 		return;
 
-	fdb = fdb_find_rcu(head, addr, vid);
+	fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 	if (likely(fdb)) {
 		/* attempt to update an entry for a local interface */
 		if (unlikely(fdb->is_local)) {
@@ -590,14 +577,13 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		}
 	} else {
 		spin_lock(&br->hash_lock);
-		if (likely(!fdb_find_rcu(head, addr, vid))) {
-			fdb = fdb_create(head, source, addr, vid, 0, 0);
-			if (fdb) {
-				if (unlikely(added_by_user))
-					fdb->added_by_user = 1;
-				trace_br_fdb_update(br, source, addr, vid, added_by_user);
-				fdb_notify(br, fdb, RTM_NEWNEIGH);
-			}
+		fdb = fdb_create(br, source, addr, vid, 0, 0);
+		if (fdb) {
+			if (unlikely(added_by_user))
+				fdb->added_by_user = 1;
+			trace_br_fdb_update(br, source, addr, vid,
+					    added_by_user);
+			fdb_notify(br, fdb, RTM_NEWNEIGH);
 		}
 		/* else  we lose race and someone else inserts
 		 * it first, don't bother updating
@@ -646,7 +632,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	if (fdb->added_by_external_learn)
 		ndm->ndm_flags |= NTF_EXT_LEARNED;
 
-	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
 		goto nla_put_failure;
 	if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
 		goto nla_put_failure;
@@ -657,7 +643,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
 
-	if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
+	if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16),
+					&fdb->key.vlan_id))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -711,54 +698,48 @@ int br_fdb_dump(struct sk_buff *skb,
 		int *idx)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_fdb_entry *f;
 	int err = 0;
-	int i;
 
 	if (!(dev->priv_flags & IFF_EBRIDGE))
-		goto out;
+		return err;
 
 	if (!filter_dev) {
 		err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
 		if (err < 0)
-			goto out;
+			return err;
 	}
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-
-		hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
-
-			if (*idx < cb->args[2])
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		if (*idx < cb->args[2])
+			goto skip;
+		if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
+			if (filter_dev != dev)
 				goto skip;
-
-			if (filter_dev &&
-			    (!f->dst || f->dst->dev != filter_dev)) {
-				if (filter_dev != dev)
-					goto skip;
-				/* !f->dst is a special case for bridge
-				 * It means the MAC belongs to the bridge
-				 * Therefore need a little more filtering
-				 * we only want to dump the !f->dst case
-				 */
-				if (f->dst)
-					goto skip;
-			}
-			if (!filter_dev && f->dst)
+			/* !f->dst is a special case for bridge
+			 * It means the MAC belongs to the bridge
+			 * Therefore need a little more filtering
+			 * we only want to dump the !f->dst case
+			 */
+			if (f->dst)
 				goto skip;
-
-			err = fdb_fill_info(skb, br, f,
-					    NETLINK_CB(cb->skb).portid,
-					    cb->nlh->nlmsg_seq,
-					    RTM_NEWNEIGH,
-					    NLM_F_MULTI);
-			if (err < 0)
-				goto out;
-skip:
-			*idx += 1;
 		}
+		if (!filter_dev && f->dst)
+			goto skip;
+
+		err = fdb_fill_info(skb, br, f,
+				    NETLINK_CB(cb->skb).portid,
+				    cb->nlh->nlmsg_seq,
+				    RTM_NEWNEIGH,
+				    NLM_F_MULTI);
+		if (err < 0)
+			break;
+skip:
+		*idx += 1;
 	}
+	rcu_read_unlock();
 
-out:
 	return err;
 }
 
@@ -766,7 +747,6 @@ out:
 static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 			 const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 	bool modified = false;
 
@@ -787,7 +767,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
 
-		fdb = fdb_create(head, source, addr, vid, 0, 0);
+		fdb = fdb_create(br, source, addr, vid, 0, 0);
 		if (!fdb)
 			return -ENOMEM;
 
@@ -1012,65 +992,60 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
 {
-	struct net_bridge_fdb_entry *fdb, *tmp;
-	int i;
+	struct net_bridge_fdb_entry *f, *tmp;
 	int err;
 
 	ASSERT_RTNL();
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry(fdb, &br->hash[i], hlist) {
-			/* We only care for static entries */
-			if (!fdb->is_static)
-				continue;
-
-			err = dev_uc_add(p->dev, fdb->addr.addr);
-			if (err)
-				goto rollback;
-		}
+	/* the key here is that static entries change only under rtnl */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!f->is_static)
+			continue;
+		err = dev_uc_add(p->dev, f->key.addr.addr);
+		if (err)
+			goto rollback;
 	}
-	return 0;
+done:
+	rcu_read_unlock();
 
-rollback:
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry(tmp, &br->hash[i], hlist) {
-			/* If we reached the fdb that failed, we can stop */
-			if (tmp == fdb)
-				break;
-
-			/* We only care for static entries */
-			if (!tmp->is_static)
-				continue;
+	return err;
 
-			dev_uc_del(p->dev, tmp->addr.addr);
-		}
+rollback:
+	hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!tmp->is_static)
+			continue;
+		if (tmp == f)
+			break;
+		dev_uc_del(p->dev, tmp->key.addr.addr);
 	}
-	return err;
+
+	goto done;
 }
 
 void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
 {
-	struct net_bridge_fdb_entry *fdb;
-	int i;
+	struct net_bridge_fdb_entry *f;
 
 	ASSERT_RTNL();
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
-			/* We only care for static entries */
-			if (!fdb->is_static)
-				continue;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!f->is_static)
+			continue;
 
-			dev_uc_del(p->dev, fdb->addr.addr);
-		}
+		dev_uc_del(p->dev, f->key.addr.addr);
 	}
+	rcu_read_unlock();
 }
 
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
 	struct net_bridge_fdb_entry *fdb;
-	struct hlist_head *head;
 	bool modified = false;
 	int err = 0;
 
@@ -1078,10 +1053,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 
 	spin_lock_bh(&br->hash_lock);
 
-	head = &br->hash[br_mac_hash(addr, vid)];
 	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb) {
-		fdb = fdb_create(head, p, addr, vid, 0, 0);
+		fdb = fdb_create(br, p, addr, vid, 0, 0);
 		if (!fdb) {
 			err = -ENOMEM;
 			goto err_unlock;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1312b8d20ec3..80559fd11b7e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -168,12 +168,17 @@ struct net_bridge_vlan_group {
 	u16				pvid;
 };
 
+struct net_bridge_fdb_key {
+	mac_addr addr;
+	u16 vlan_id;
+};
+
 struct net_bridge_fdb_entry {
-	struct hlist_node		hlist;
+	struct rhash_head		rhnode;
 	struct net_bridge_port		*dst;
 
-	mac_addr			addr;
-	__u16				vlan_id;
+	struct net_bridge_fdb_key	key;
+	struct hlist_node		fdb_node;
 	unsigned char			is_local:1,
 					is_static:1,
 					added_by_user:1,
@@ -315,7 +320,7 @@ struct net_bridge {
 	struct net_bridge_vlan_group	__rcu *vlgrp;
 #endif
 
-	struct hlist_head		hash[BR_HASH_SIZE];
+	struct rhashtable		fdb_hash_tbl;
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
 		struct rtable		fake_rtable;
@@ -405,6 +410,7 @@ struct net_bridge {
 	int offload_fwd_mark;
 #endif
 	bool				neigh_suppress_enabled;
+	struct hlist_head		fdb_list;
 };
 
 struct br_input_skb_cb {
@@ -515,6 +521,8 @@ static inline void br_netpoll_disable(struct net_bridge_port *p)
 /* br_fdb.c */
 int br_fdb_init(void);
 void br_fdb_fini(void);
+int br_fdb_hash_init(struct net_bridge *br);
+void br_fdb_hash_fini(struct net_bridge *br);
 void br_fdb_flush(struct net_bridge *br);
 void br_fdb_find_delete_local(struct net_bridge *br,
 			      const struct net_bridge_port *p,
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 9700e0f3307b..ee775f4ff76c 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -121,13 +121,13 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 
 	switch (type) {
 	case RTM_DELNEIGH:
-		br_switchdev_fdb_call_notifiers(false, fdb->addr.addr,
-						fdb->vlan_id,
+		br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
+						fdb->key.vlan_id,
 						fdb->dst->dev);
 		break;
 	case RTM_NEWNEIGH:
-		br_switchdev_fdb_call_notifiers(true, fdb->addr.addr,
-						fdb->vlan_id,
+		br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
+						fdb->key.vlan_id,
 						fdb->dst->dev);
 		break;
 	}
-- 
cgit v1.2.3


From 8d74e9f88d65af8bb2e095aff506aa6eac755ada Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 Dec 2017 11:39:04 -0500
Subject: net: avoid skb_warn_bad_offload on IS_ERR

skb_warn_bad_offload warns when packets enter the GSO stack that
require skb_checksum_help or vice versa. Do not warn on arbitrary
bad packets. Packet sockets can craft many. Syzkaller was able to
demonstrate another one with eth_type games.

In particular, suppress the warning when segmentation returns an
error, which is for reasons other than checksum offload.

See also commit 36c92474498a ("net: WARN if skb_checksum_help() is
called on skb requiring segmentation") for context on this warning.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8aa2f70995e8..b0eee49a2489 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 
 	segs = skb_mac_gso_segment(skb, features);
 
-	if (unlikely(skb_needs_check(skb, tx_path)))
+	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 		skb_warn_bad_offload(skb);
 
 	return segs;
-- 
cgit v1.2.3


From 7268586baa530312041e597b518b5c6a05110df1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 12 Dec 2017 13:10:40 -0800
Subject: tcp: pause Fast Open globally after third consecutive timeout

Prior to this patch, active Fast Open is paused on a specific
destination IP address if the previous connections to the
IP address have experienced recurring timeouts. But recent
experiments by Microsoft (https://goo.gl/cykmn7) and Mozilla
browsers indicate the issue is often caused by broken middle-boxes
sitting close to the client. Therefore it is a much better user
experience if Fast Open is disabled outright globally to avoid
experiencing further timeouts on connections toward other
destinations.

This patch changes the destination-IP disablement to global
disablement if a connection experiences recurring timeouts
or aborts due to timeout.  Repeated incidents would still
exponentially increase the pause time, starting from an hour.
This is extremely conservative but an unfortunate compromise to
minimize bad experience due to broken middle-boxes.

Reported-by: Dragana Damjanovic <ddamjanovic@mozilla.com>
Reported-by: Patrick McManus <mcmanus@ducksong.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  1 +
 include/net/tcp.h                      |  5 ++---
 net/ipv4/tcp_fastopen.c                | 30 ++++++++++++++++++++----------
 net/ipv4/tcp_metrics.c                 |  5 +----
 net/ipv4/tcp_timer.c                   | 17 +----------------
 5 files changed, 25 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 46c7e1085efc..3f2c40d8e6aa 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -606,6 +606,7 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER
 	This time period will grow exponentially when more blackhole issues
 	get detected right after Fastopen is re-enabled and will reset to
 	initial value when the blackhole issue goes away.
+	0 to disable the blackhole detection.
 	By default, it is set to 1hr.
 
 tcp_syn_retries - INTEGER
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c3744e52cd1..6939e69d3c37 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1507,8 +1507,7 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
 
 /* From tcp_fastopen.c */
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-			    struct tcp_fastopen_cookie *cookie, int *syn_loss,
-			    unsigned long *last_syn_loss);
+			    struct tcp_fastopen_cookie *cookie);
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
 			    u16 try_exp);
@@ -1546,7 +1545,7 @@ extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
 void tcp_fastopen_active_disable(struct sock *sk);
 bool tcp_fastopen_active_should_disable(struct sock *sk);
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
-void tcp_fastopen_active_timeout_reset(void);
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
 
 /* Latencies incurred by various limits for a sender. They are
  * chronograph-like stats that are mutually exclusive.
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 78c192ee03a4..018a48477355 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -379,18 +379,9 @@ fastopen:
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			       struct tcp_fastopen_cookie *cookie)
 {
-	unsigned long last_syn_loss = 0;
 	const struct dst_entry *dst;
-	int syn_loss = 0;
 
-	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
-
-	/* Recurring FO SYN losses: no cookie or data in SYN */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		cookie->len = -1;
-		return false;
-	}
+	tcp_fastopen_cache_get(sk, mss, cookie);
 
 	/* Firewall blackhole issue check */
 	if (tcp_fastopen_active_should_disable(sk)) {
@@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
  * following circumstances:
  *   1. client side TFO socket receives out of order FIN
  *   2. client side TFO socket receives out of order RST
+ *   3. client side TFO socket has timed out three times consecutively during
+ *      or after handshake
  * We disable active side TFO globally for 1hr at first. Then if it
  * happens again, we disable it for 2h, then 4h, 8h, ...
  * And we reset the timeout back to 1hr when we see a successful active
@@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 		dst_release(dst);
 	}
 }
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+	u32 timeouts = inet_csk(sk)->icsk_retransmits;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Broken middle-boxes may black-hole Fast Open connection during or
+	 * even after the handshake. Be extremely conservative and pause
+	 * Fast Open globally after hitting the third consecutive timeout or
+	 * exceeding the configured timeout limit.
+	 */
+	if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
+	    (timeouts == 2 || (timeouts < 2 && expired))) {
+		tcp_fastopen_active_disable(sk);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+	}
+}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7097f92d16e5..759e6bc8327b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 static DEFINE_SEQLOCK(fastopen_seqlock);
 
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-			    struct tcp_fastopen_cookie *cookie,
-			    int *syn_loss, unsigned long *last_syn_loss)
+			    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_metrics_block *tm;
 
@@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			*cookie = tfom->cookie;
 			if (cookie->len <= 0 && tfom->try_exp == 1)
 				cookie->exp = true;
-			*syn_loss = tfom->syn_loss;
-			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 		} while (read_seqretry(&fastopen_seqlock, seq));
 	}
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 16df6dd44b98..c9a63417af48 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk)
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits) {
 			dst_negative_advice(sk);
-			if (tp->syn_fastopen || tp->syn_data)
-				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-			if (tp->syn_data && icsk->icsk_retransmits == 1)
-				NET_INC_STATS(sock_net(sk),
-					      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		} else if (!tp->syn_data && !tp->syn_fastopen) {
 			sk_rethink_txhash(sk);
 		}
@@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk)
 		expired = icsk->icsk_retransmits >= retry_until;
 	} else {
 		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
-			/* Some middle-boxes may black-hole Fast Open _after_
-			 * the handshake. Therefore we conservatively disable
-			 * Fast Open on this path on recurring timeouts after
-			 * successful Fast Open.
-			 */
-			if (tp->syn_data_acked) {
-				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
-					NET_INC_STATS(sock_net(sk),
-						      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
-			}
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk)
 		expired = retransmits_timed_out(sk, retry_until,
 						icsk->icsk_user_timeout);
 	}
+	tcp_fastopen_active_detect_blackhole(sk, expired);
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
-- 
cgit v1.2.3


From f0af34317f4ba30794a19fd6ad7e30e801d53b07 Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Fri, 15 Dec 2017 12:47:01 +0800
Subject: net: dsa: mediatek: combine MediaTek tag with VLAN tag

In order to let the MT7530 switch properly recognize egress packets
that carry both the special tag and a VLAN tag, the information about
the special tag should be carried in the existing VLAN tag. On the other
hand, no extra handling is necessary for ingress packets when a VLAN tag
is present, since the VLAN tag can be placed after the special tag and
then parsed in the existing way.

Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_mtk.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 8475434af7d5..11535bc70743 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -13,10 +13,13 @@
  */
 
 #include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
 
 #include "dsa_priv.h"
 
 #define MTK_HDR_LEN		4
+#define MTK_HDR_XMIT_UNTAGGED		0
+#define MTK_HDR_XMIT_TAGGED_TPID_8100	1
 #define MTK_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
 #define MTK_HDR_XMIT_DP_BIT_MASK	GENMASK(5, 0)
 
@@ -25,20 +28,37 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	u8 *mtk_tag;
+	bool is_vlan_skb = true;
 
-	if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
-		return NULL;
-
-	skb_push(skb, MTK_HDR_LEN);
+	/* Build the special tag after the MAC Source Address. If VLAN header
+	 * is present, it's required that VLAN header and special tag is
+	 * being combined. Only in this way we can allow the switch can parse
+	 * the both special and VLAN tag at the same time and then look up VLAN
+	 * table with VID.
+	 */
+	if (!skb_vlan_tagged(skb)) {
+		if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
+			return NULL;
 
-	memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+		skb_push(skb, MTK_HDR_LEN);
+		memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+		is_vlan_skb = false;
+	}
 
-	/* Build the tag after the MAC Source Address */
 	mtk_tag = skb->data + 2 * ETH_ALEN;
-	mtk_tag[0] = 0;
+
+	/* Mark tag attribute on special tag insertion to notify hardware
+	 * whether that's a combined special tag with 802.1Q header.
+	 */
+	mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 :
+		     MTK_HDR_XMIT_UNTAGGED;
 	mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
-	mtk_tag[2] = 0;
-	mtk_tag[3] = 0;
+
+	/* Tag control information is kept for 802.1Q */
+	if (!is_vlan_skb) {
+		mtk_tag[2] = 0;
+		mtk_tag[3] = 0;
+	}
 
 	return skb;
 }
-- 
cgit v1.2.3


From 7db7d9f369a47e1a46f93c320b45cb89e81723e7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 15:05:11 +0100
Subject: batman-adv: Add SPDX license identifier above copyright header

The "Linux kernel licensing rules" require that each file has an SPDX
license identifier as its first line (and sometimes as its second line).

The FSFE REUSE practices [1] would also require the same tags but have no
restrictions on the placement in the source file. Following the "Linux
kernel licensing rules" therefore also fulfills the FSFE REUSE practices
requirements at the same time.

[1] https://reuse.software/practices/

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h        | 1 +
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.c              | 1 +
 net/batman-adv/bat_algo.h              | 1 +
 net/batman-adv/bat_iv_ogm.c            | 1 +
 net/batman-adv/bat_iv_ogm.h            | 1 +
 net/batman-adv/bat_v.c                 | 1 +
 net/batman-adv/bat_v.h                 | 1 +
 net/batman-adv/bat_v_elp.c             | 1 +
 net/batman-adv/bat_v_elp.h             | 1 +
 net/batman-adv/bat_v_ogm.c             | 1 +
 net/batman-adv/bat_v_ogm.h             | 1 +
 net/batman-adv/bitarray.c              | 1 +
 net/batman-adv/bitarray.h              | 1 +
 net/batman-adv/bridge_loop_avoidance.c | 1 +
 net/batman-adv/bridge_loop_avoidance.h | 1 +
 net/batman-adv/debugfs.c               | 1 +
 net/batman-adv/debugfs.h               | 1 +
 net/batman-adv/distributed-arp-table.c | 1 +
 net/batman-adv/distributed-arp-table.h | 1 +
 net/batman-adv/fragmentation.c         | 1 +
 net/batman-adv/fragmentation.h         | 1 +
 net/batman-adv/gateway_client.c        | 1 +
 net/batman-adv/gateway_client.h        | 1 +
 net/batman-adv/gateway_common.c        | 1 +
 net/batman-adv/gateway_common.h        | 1 +
 net/batman-adv/hard-interface.c        | 1 +
 net/batman-adv/hard-interface.h        | 1 +
 net/batman-adv/hash.c                  | 1 +
 net/batman-adv/hash.h                  | 1 +
 net/batman-adv/icmp_socket.c           | 1 +
 net/batman-adv/icmp_socket.h           | 1 +
 net/batman-adv/log.c                   | 1 +
 net/batman-adv/log.h                   | 1 +
 net/batman-adv/main.c                  | 1 +
 net/batman-adv/main.h                  | 1 +
 net/batman-adv/multicast.c             | 1 +
 net/batman-adv/multicast.h             | 1 +
 net/batman-adv/netlink.c               | 1 +
 net/batman-adv/netlink.h               | 1 +
 net/batman-adv/network-coding.c        | 1 +
 net/batman-adv/network-coding.h        | 1 +
 net/batman-adv/originator.c            | 1 +
 net/batman-adv/originator.h            | 1 +
 net/batman-adv/packet.h                | 1 +
 net/batman-adv/routing.c               | 1 +
 net/batman-adv/routing.h               | 1 +
 net/batman-adv/send.c                  | 1 +
 net/batman-adv/send.h                  | 1 +
 net/batman-adv/soft-interface.c        | 1 +
 net/batman-adv/soft-interface.h        | 1 +
 net/batman-adv/sysfs.c                 | 1 +
 net/batman-adv/sysfs.h                 | 1 +
 net/batman-adv/tp_meter.c              | 1 +
 net/batman-adv/tp_meter.h              | 1 +
 net/batman-adv/translation-table.c     | 1 +
 net/batman-adv/translation-table.h     | 1 +
 net/batman-adv/tvlv.c                  | 1 +
 net/batman-adv/tvlv.h                  | 1 +
 net/batman-adv/types.h                 | 1 +
 60 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index efd641c8a5d6..fb4533826163 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: MIT */
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 915987bc6d29..022f6e77307b 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,4 +1,4 @@
-#
+# SPDX-License-Identifier: GPL-2.0
 # Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 44fd073b7546..fa306b25a78b 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 29f6312f9bf1..029221615ba3 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 1b659ab652fb..bff5ec66a2e1 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index ae2ab526bdb1..9dc0dd5c83df 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 341ceab8338d..16709552c21e 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index dd7c4b647e6b..a17ab68bbce8 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 1de992c58b35..8375fd679db3 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 376ead280ab9..5e39d0588a48 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index c251445a42a0..22d2bafa814a 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 2068770b542d..6a4c14ccc3c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 2b070c7e31da..125817c389e5 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index cc262c9d97e0..8cb2c874f5d3 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index cdd8e8e4df0b..007147f3ed9e 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 234775748b8e..b568cec819c5 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index e32ad47c6efd..d94585dc2dbe 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 9c5d4a65b98c..90a08d35c501 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 760c0de72582..3c2faf773335 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index ec364a3c1c66..d81a05a6e6f9 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index a98cf1104a30..22ce4c0c86c3 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 1a2d6c308745..30ffa992fcfc 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 10d521f0b17f..e8db19940ab8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 3baa3d466e5e..981f58421a32 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 2c26039c23fc..a7039503d88e 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 0a6a97d201f2..7c298b05c1dc 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 4e3d5340ad96..2f067a507fd5 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 9f9890ff7a22..ac7311a91f9d 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index b5f7e13918ac..a6dbaf2e9fc9 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 0c905e91c5e2..81cf54eb2fad 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index bded31121d12..cc76f1365300 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index f3fec40aae86..84cddd01eeab 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 4ef4bde2cc2d..148e64e846d2 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 65ce97efa6b5..6744a64143c0 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 4daed7ad46f2..5ce2007ea11b 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index bb8003cf2296..4bdb39ab3b20 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index e553a8770a89..01546a42b7ad 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 2a78cddab0e9..51f273b5b77d 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index ab13b4d58733..ce424fe2f24e 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index f1cd8c5da966..0e7e57b69b54 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3604d7899e2c..5cfac6e56610 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index c66efb81d2f4..adaeafa4f71e 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 0a565d0422bb..0716daf5b9a7 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 40c7f039d5d7..b5d2164532c9 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 8e8a5db197cb..4eaf4b426726 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 40d9bf3e5bfe..86b0ea1e5c1c 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 5ede16c32f15..a1289bc5f115 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 7895323fd2a7..a6c53684ba70 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index a16b34f473ef..eb36820e41bc 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 9f673cdfecf8..e543024f98ef 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 639c3abb214a..075c5b5b2ce1 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index aa187fd42475..ab0b95f15b36 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index e487412e256b..0384cb6c406b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 15cd2139381e..601feb2c4ecf 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index a8ada5c123bd..c8b8f2cb2c2b 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8a3ce79b1307..281bd4cf7f90 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 411d586191da..8d9e3abec2c8 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 1d9e267caec9..67b2ba4b824b 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index 4d01400ada30..a74df33f446d 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index a62795868794..1df798b32077 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
-- 
cgit v1.2.3


From 9969ffa85ff28a56adc9c1570b5f956161f6e6a4 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 15:05:13 +0100
Subject: batman-adv: Add license header to Kconfig

The last remaining file without license notice and/or SPDX license
identifier under net/batman-adv/ is the Kconfig. It should have been
licensed under the same conditions as the rest of batman-adv and the
Makefile which uses the CONFIG_* variables from Kconfig.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/Kconfig | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index b73b96a2854b..c44f6515be5e 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -1,3 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+#
+# Marek Lindner, Simon Wunderlich
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
 #
 # B.A.T.M.A.N meshing protocol
 #
-- 
cgit v1.2.3


From b92b94ac732f5c83c60be2825d8b5cec4dc469d3 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 17:12:02 +0100
Subject: batman-adv: include gfp.h for GFP_* defines

The linux/gfp.h provides the GFP_ATOMIC and GFP_KERNEL defines. It should
therefore be included instead of linux/fs.h.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_iv_ogm.c            | 2 +-
 net/batman-adv/bat_v_elp.c             | 2 +-
 net/batman-adv/bat_v_ogm.c             | 2 +-
 net/batman-adv/bridge_loop_avoidance.c | 2 +-
 net/batman-adv/distributed-arp-table.c | 2 +-
 net/batman-adv/fragmentation.c         | 2 +-
 net/batman-adv/gateway_client.c        | 2 +-
 net/batman-adv/hard-interface.c        | 2 +-
 net/batman-adv/hash.c                  | 2 +-
 net/batman-adv/icmp_socket.c           | 1 +
 net/batman-adv/log.c                   | 1 +
 net/batman-adv/main.c                  | 2 +-
 net/batman-adv/multicast.c             | 2 +-
 net/batman-adv/netlink.c               | 2 +-
 net/batman-adv/network-coding.c        | 2 +-
 net/batman-adv/originator.c            | 2 +-
 net/batman-adv/send.c                  | 2 +-
 net/batman-adv/soft-interface.c        | 2 +-
 net/batman-adv/sysfs.c                 | 2 +-
 net/batman-adv/tp_meter.c              | 2 +-
 net/batman-adv/translation-table.c     | 2 +-
 net/batman-adv/tvlv.c                  | 2 +-
 22 files changed, 22 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index bff5ec66a2e1..27d9c8adf2ca 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -27,7 +27,7 @@
 #include <linux/cache.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 8375fd679db3..92b56bb5686d 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -25,7 +25,7 @@
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 22d2bafa814a..7055a9483788 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -23,7 +23,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 007147f3ed9e..ce124a21fce5 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -25,7 +25,7 @@
 #include <linux/crc16.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 3c2faf773335..f704bbc76e2a 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -24,7 +24,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 22ce4c0c86c3..741c6b91664e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -23,7 +23,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index e8db19940ab8..21db0165175b 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -23,7 +23,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/in.h>
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 2f067a507fd5..394e69b77535 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -23,7 +23,7 @@
 #include <linux/bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a6dbaf2e9fc9..2ce0d5673f40 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -19,7 +19,7 @@
 #include "hash.h"
 #include "main.h"
 
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/lockdep.h>
 #include <linux/slab.h>
 
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index cc76f1365300..71ba58cc51fa 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -27,6 +27,7 @@
 #include <linux/export.h>
 #include <linux/fcntl.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 148e64e846d2..6fbcdd40a332 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -25,6 +25,7 @@
 #include <linux/export.h>
 #include <linux/fcntl.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 5ce2007ea11b..0c3664aa6a58 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -23,8 +23,8 @@
 #include <linux/byteorder/generic.h>
 #include <linux/crc32c.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
 #include <linux/genetlink.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/init.h>
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 01546a42b7ad..d8617c2794db 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -25,7 +25,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/icmpv6.h>
 #include <linux/if_bridge.h>
 #include <linux/if_ether.h>
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index ce424fe2f24e..f7281685633c 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -24,8 +24,8 @@
 #include <linux/cache.h>
 #include <linux/errno.h>
 #include <linux/export.h>
-#include <linux/fs.h>
 #include <linux/genetlink.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 5cfac6e56610..d37fe5ed6117 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -26,7 +26,7 @@
 #include <linux/debugfs.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_packet.h>
 #include <linux/init.h>
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 0716daf5b9a7..007b6bd8df95 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -22,7 +22,7 @@
 #include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/kref.h>
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index a6c53684ba70..c53b11d41d8b 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -24,7 +24,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index e543024f98ef..ba8fd06eee7e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -27,7 +27,7 @@
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/jiffies.h>
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index ab0b95f15b36..374ff46feb8e 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -23,7 +23,7 @@
 #include <linux/compiler.h>
 #include <linux/device.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if.h>
 #include <linux/if_vlan.h>
 #include <linux/kernel.h>
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 601feb2c4ecf..e33f64f0fcb8 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -26,7 +26,7 @@
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 281bd4cf7f90..db40ca6243b3 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -28,7 +28,7 @@
 #include <linux/crc32c.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/init.h>
 #include <linux/jhash.h>
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 67b2ba4b824b..d956c2a0e9cb 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -20,7 +20,7 @@
 
 #include <linux/byteorder/generic.h>
 #include <linux/etherdevice.h>
-#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/kernel.h>
 #include <linux/kref.h>
-- 
cgit v1.2.3


From ecc36f5ee6fdc73237b0fe693ca752d4b16e65bc Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 17:12:03 +0100
Subject: batman-adv: include build_bug.h for BUILD_BUG_ON define

commit bc6245e5efd7 ("bug: split BUILD_BUG stuff out into
<linux/build_bug.h>") added a new header for BUILD_BUG_ON. It should
therefore be included instead of linux/bug.h.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.c              | 2 +-
 net/batman-adv/tp_meter.c          | 2 +-
 net/batman-adv/translation-table.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 0c3664aa6a58..6f6c500e8aa8 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -19,7 +19,7 @@
 #include "main.h"
 
 #include <linux/atomic.h>
-#include <linux/bug.h>
+#include <linux/build_bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/crc32c.h>
 #include <linux/errno.h>
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index e33f64f0fcb8..fe9eb2970ec9 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -20,7 +20,7 @@
 #include "main.h"
 
 #include <linux/atomic.h>
-#include <linux/bug.h>
+#include <linux/build_bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index db40ca6243b3..27a0c34a5ad0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -21,7 +21,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bitops.h>
-#include <linux/bug.h>
+#include <linux/build_bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
-- 
cgit v1.2.3


From 3a64469e4fddeeca8e6fba7ea4e9558a4af3a1c8 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 17:12:04 +0100
Subject: batman-adv: Include net.h for net_ratelimited_function

The linux/net.h provides the net_ratelimited_function. It should
therefore be included directly before it is used.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/network-coding.c    | 1 +
 net/batman-adv/translation-table.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index d37fe5ed6117..bd421408d9e7 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -36,6 +36,7 @@
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
 #include <linux/random.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 27a0c34a5ad0..b4b20ad1ed9a 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -37,6 +37,7 @@
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <linux/rculist.h>
-- 
cgit v1.2.3


From 8cfba951a0e65acb4ea9fe669a0aa09a44c5d1de Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 17:12:05 +0100
Subject: batman-adv: include kobject.h for kobject_* functions

The linux/kobject.h provides the kobject_* function declarations and should
therefore be included directly before they are used.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/sysfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 374ff46feb8e..227a072dc1d3 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -27,6 +27,7 @@
 #include <linux/if.h>
 #include <linux/if_vlan.h>
 #include <linux/kernel.h>
+#include <linux/kobject.h>
 #include <linux/kref.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
-- 
cgit v1.2.3


From eface060c2c354af1ad902929fc2cc9a426ad349 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 17:12:06 +0100
Subject: batman-adv: Remove unused sched.h include

The linux/wait.h include was removed with commit 421d988b2c08 ("batman-adv:
Consolidate logging related functions"). The previously required (but now
unused) linux/sched.h include can also be dropped now.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/debugfs.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index d94585dc2dbe..fddf16a3dc89 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
-#include <linux/sched.h> /* for linux/wait.h */
 #include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/stddef.h>
-- 
cgit v1.2.3


From 6a3038f07c59152fb640a3cd274afdf909640e4b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:46 +0100
Subject: batman-adv: Add missing kernel-doc to packet.h

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/packet.h | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 4eaf4b426726..dccbd4a6f019 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -22,6 +22,12 @@
 #include <asm/byteorder.h>
 #include <linux/types.h>
 
+/**
+ * batadv_tp_is_error() - Check throughput meter return code for error
+ * @n: throughput meter return code
+ *
+ * Return: 0 when no error was detected, != 0 otherwise
+ */
 #define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0)
 
 /**
@@ -89,7 +95,15 @@ enum batadv_iv_flags {
 	BATADV_DIRECTLINK          = BIT(2),
 };
 
-/* ICMP message types */
+/**
+ * enum batadv_icmp_packettype - ICMP message types
+ * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST
+ * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found
+ * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination
+ * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops
+ * @BATADV_PARAMETER_PROBLEM: return code for malformed messages
+ * @BATADV_TP: throughput meter packet
+ */
 enum batadv_icmp_packettype {
 	BATADV_ECHO_REPLY	       = 0,
 	BATADV_DESTINATION_UNREACHABLE = 3,
@@ -137,7 +151,14 @@ enum batadv_vlan_flags {
 	BATADV_VLAN_HAS_TAG	= BIT(15),
 };
 
-/* claim frame types for the bridge loop avoidance */
+/**
+ * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance
+ * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address
+ * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address
+ * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc
+ * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table
+ * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet
+ */
 enum batadv_bla_claimframe {
 	BATADV_CLAIM_TYPE_CLAIM		= 0x00,
 	BATADV_CLAIM_TYPE_UNCLAIM	= 0x01,
-- 
cgit v1.2.3


From 7e9a8c2ce7c5f8745c003e2ba4758c21c38a0419 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:47 +0100
Subject: batman-adv: Use parentheses in function kernel-doc

The documentation describing kernel-doc comments for functions ("How to
format kernel-doc comments") uses parentheses at the end of the function
name. Using this format allows a consistent style both when adding
documentation to a function and when referencing this function in a
different kernel-doc section.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_algo.c              |   7 +-
 net/batman-adv/bat_iv_ogm.c            |  80 +++++++-------
 net/batman-adv/bat_v.c                 |  48 ++++-----
 net/batman-adv/bat_v_elp.c             |  24 ++---
 net/batman-adv/bat_v_ogm.c             |  34 +++---
 net/batman-adv/bitarray.c              |   2 +-
 net/batman-adv/bitarray.h              |   2 +-
 net/batman-adv/bridge_loop_avoidance.c | 109 ++++++++++---------
 net/batman-adv/bridge_loop_avoidance.h |   4 +-
 net/batman-adv/debugfs.c               |  12 +--
 net/batman-adv/distributed-arp-table.c |  77 +++++++-------
 net/batman-adv/distributed-arp-table.h |   6 +-
 net/batman-adv/fragmentation.c         |  20 ++--
 net/batman-adv/fragmentation.h         |   2 +-
 net/batman-adv/gateway_client.c        |  24 +++--
 net/batman-adv/gateway_common.c        |  18 ++--
 net/batman-adv/hard-interface.c        |  26 ++---
 net/batman-adv/hard-interface.h        |   2 +-
 net/batman-adv/hash.h                  |   2 +-
 net/batman-adv/icmp_socket.c           |   4 +-
 net/batman-adv/main.c                  |  16 +--
 net/batman-adv/main.h                  |   6 +-
 net/batman-adv/multicast.c             |  78 +++++++-------
 net/batman-adv/netlink.c               |  22 ++--
 net/batman-adv/network-coding.c        | 120 +++++++++++----------
 net/batman-adv/originator.c            |  76 ++++++-------
 net/batman-adv/routing.c               |  24 ++---
 net/batman-adv/send.c                  |  45 ++++----
 net/batman-adv/send.h                  |   4 +-
 net/batman-adv/soft-interface.c        |  41 +++----
 net/batman-adv/sysfs.c                 |  20 ++--
 net/batman-adv/tp_meter.c              |  70 ++++++------
 net/batman-adv/translation-table.c     | 189 +++++++++++++++++----------------
 net/batman-adv/tvlv.c                  |  38 +++----
 34 files changed, 633 insertions(+), 619 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index fa306b25a78b..aed7ced059df 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -38,7 +38,8 @@ char batadv_routing_algo[20] = "BATMAN_IV";
 static struct hlist_head batadv_algo_list;
 
 /**
- * batadv_algo_init - Initialize batman-adv algorithm management data structures
+ * batadv_algo_init() - Initialize batman-adv algorithm management data
+ *  structures
  */
 void batadv_algo_init(void)
 {
@@ -149,7 +150,7 @@ module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
 		0644);
 
 /**
- * batadv_algo_dump_entry - fill in information about one supported routing
+ * batadv_algo_dump_entry() - fill in information about one supported routing
  *  algorithm
  * @msg: netlink message to be sent back
  * @portid: Port to reply to
@@ -180,7 +181,7 @@ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_algo_dump - fill in information about supported routing
+ * batadv_algo_dump() - fill in information about supported routing
  *  algorithms
  * @msg: netlink message to be sent back
  * @cb: Parameters to the netlink request
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 27d9c8adf2ca..1fc67aa8d7df 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -87,7 +87,7 @@ enum batadv_dup_status {
 };
 
 /**
- * batadv_ring_buffer_set - update the ring buffer with the given value
+ * batadv_ring_buffer_set() - update the ring buffer with the given value
  * @lq_recv: pointer to the ring buffer
  * @lq_index: index to store the value at
  * @value: value to store in the ring buffer
@@ -99,7 +99,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value)
 }
 
 /**
- * batadv_ring_buffer_avg - compute the average of all non-zero values stored
+ * batadv_ring_buffer_avg() - compute the average of all non-zero values stored
  * in the given ring buffer
  * @lq_recv: pointer to the ring buffer
  *
@@ -131,7 +131,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
 }
 
 /**
- * batadv_iv_ogm_orig_free - free the private resources allocated for this
+ * batadv_iv_ogm_orig_free() - free the private resources allocated for this
  *  orig_node
  * @orig_node: the orig_node for which the resources have to be free'd
  */
@@ -142,8 +142,8 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
 }
 
 /**
- * batadv_iv_ogm_orig_add_if - change the private structures of the orig_node to
- *  include the new hard-interface
+ * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node
+ *  to include the new hard-interface
  * @orig_node: the orig_node that has to be changed
  * @max_if_num: the current amount of interfaces
  *
@@ -187,7 +187,7 @@ unlock:
 }
 
 /**
- * batadv_iv_ogm_drop_bcast_own_entry - drop section of bcast_own
+ * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own
  * @orig_node: the orig_node that has to be changed
  * @max_if_num: the current amount of interfaces
  * @del_if_num: the index of the interface being removed
@@ -225,7 +225,7 @@ batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_iv_ogm_drop_bcast_own_sum_entry - drop section of bcast_own_sum
+ * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum
  * @orig_node: the orig_node that has to be changed
  * @max_if_num: the current amount of interfaces
  * @del_if_num: the index of the interface being removed
@@ -260,8 +260,8 @@ batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to
- *  exclude the removed interface
+ * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node
+ *  to exclude the removed interface
  * @orig_node: the orig_node that has to be changed
  * @max_if_num: the current amount of interfaces
  * @del_if_num: the index of the interface being removed
@@ -291,7 +291,8 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_iv_ogm_orig_get - retrieve or create (if does not exist) an originator
+ * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an
+ *  originator
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: mac address of the originator
  *
@@ -448,7 +449,7 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached
+ * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached
  * @buff_pos: current position in the skb
  * @packet_len: total length of the skb
  * @tvlv_len: tvlv length of the previously considered OGM
@@ -558,7 +559,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
 }
 
 /**
- * batadv_iv_ogm_can_aggregate - find out if an OGM can be aggregated on an
+ * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an
  *  existing forward packet
  * @new_bat_ogm_packet: OGM packet to be aggregated
  * @bat_priv: the bat priv with all the soft interface information
@@ -661,7 +662,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_aggregate_new - create a new aggregated packet and add this
+ * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this
  *  packet to it.
  * @packet_buff: pointer to the OGM
  * @packet_len: (total) length of the OGM
@@ -744,7 +745,7 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr,
 }
 
 /**
- * batadv_iv_ogm_queue_add - queue up an OGM for transmission
+ * batadv_iv_ogm_queue_add() - queue up an OGM for transmission
  * @bat_priv: the bat priv with all the soft interface information
  * @packet_buff: pointer to the OGM
  * @packet_len: (total) length of the OGM
@@ -870,8 +871,8 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_iv_ogm_slide_own_bcast_window - bitshift own OGM broadcast windows for
- * the given interface
+ * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows
+ *  for the given interface
  * @hard_iface: the interface for which the windows have to be shifted
  */
 static void
@@ -988,7 +989,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_orig_update - use OGM to update corresponding data in an
+ * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
  *  originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the orig node who originally emitted the ogm packet
@@ -1153,7 +1154,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_calc_tq - calculate tq for current received ogm packet
+ * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet
  * @orig_node: the orig node who originally emitted the ogm packet
  * @orig_neigh_node: the orig node struct of the neighbor who sent the packet
  * @batadv_ogm_packet: the ogm packet
@@ -1299,7 +1300,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_update_seqnos -  process a batman packet for all interfaces,
+ * batadv_iv_ogm_update_seqnos() -  process a batman packet for all interfaces,
  *  adjust the sequence number and find out whether it is a duplicate
  * @ethhdr: ethernet header of the packet
  * @batadv_ogm_packet: OGM packet to be considered
@@ -1402,7 +1403,8 @@ out:
 }
 
 /**
- * batadv_iv_ogm_process_per_outif - process a batman iv OGM for an outgoing if
+ * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing
+ *  interface
  * @skb: the skb containing the OGM
  * @ogm_offset: offset from skb->data to start of ogm header
  * @orig_node: the (cached) orig node for the originator of this OGM
@@ -1609,7 +1611,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_process - process an incoming batman iv OGM
+ * batadv_iv_ogm_process() - process an incoming batman iv OGM
  * @skb: the skb containing the OGM
  * @ogm_offset: offset to the OGM which should be processed (for aggregates)
  * @if_incoming: the interface where this packet was receved
@@ -1862,7 +1864,7 @@ free_skb:
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table
+ * batadv_iv_ogm_orig_print_neigh() - print neighbors for the originator table
  * @orig_node: the orig_node for which the neighbors are printed
  * @if_outgoing: outgoing interface for these entries
  * @seq: debugfs table seq_file struct
@@ -1891,7 +1893,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_iv_ogm_orig_print - print the originator table
+ * batadv_iv_ogm_orig_print() - print the originator table
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: debugfs table seq_file struct
  * @if_outgoing: the outgoing interface for which this should be printed
@@ -1961,7 +1963,7 @@ next:
 #endif
 
 /**
- * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a
+ * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a
  *  given outgoing interface.
  * @neigh_node: Neighbour of interest
  * @if_outgoing: Outgoing interface of interest
@@ -1987,7 +1989,7 @@ batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
 }
 
 /**
- * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a
+ * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a
  *  message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
@@ -2049,7 +2051,7 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message
+ * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2111,7 +2113,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a
+ * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a
  *  message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
@@ -2154,7 +2156,7 @@ batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_iv_ogm_orig_dump - Dump the originators into a message
+ * batadv_iv_ogm_orig_dump() - Dump the originators into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  * @bat_priv: The bat priv with all the soft interface information
@@ -2191,7 +2193,7 @@ batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_iv_hardif_neigh_print - print a single hop neighbour node
+ * batadv_iv_hardif_neigh_print() - print a single hop neighbour node
  * @seq: neighbour table seq_file struct
  * @hardif_neigh: hardif neighbour information
  */
@@ -2210,7 +2212,7 @@ batadv_iv_hardif_neigh_print(struct seq_file *seq,
 }
 
 /**
- * batadv_iv_ogm_neigh_print - print the single hop neighbour list
+ * batadv_iv_ogm_neigh_print() - print the single hop neighbour list
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: neighbour table seq_file struct
  */
@@ -2243,7 +2245,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
 #endif
 
 /**
- * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors
+ * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors
  * @neigh1: the first neighbor object of the comparison
  * @if_outgoing1: outgoing interface for the first neighbor
  * @neigh2: the second neighbor object of the comparison
@@ -2288,7 +2290,7 @@ out:
 }
 
 /**
- * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message
+ * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2327,7 +2329,7 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface
+ * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface
  *  into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
@@ -2366,7 +2368,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message
+ * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  * @bat_priv: The bat priv with all the soft interface information
@@ -2418,7 +2420,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
 }
 
 /**
- * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
+ * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors
  * @neigh1: the first neighbor object of the comparison
  * @if_outgoing1: outgoing interface for the first neighbor
  * @neigh2: the second neighbor object of the comparison
@@ -2444,7 +2446,7 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
 }
 
 /**
- * batadv_iv_ogm_neigh_is_sob - check if neigh1 is similarly good or better
+ * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better
  *  than neigh2 from the metric prospective
  * @neigh1: the first neighbor object of the comparison
  * @if_outgoing1: outgoing interface for the first neighbor
@@ -2479,7 +2481,7 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_iv_init_sel_class - initialize GW selection class
+ * batadv_iv_init_sel_class() - initialize GW selection class
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
@@ -2704,7 +2706,7 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
 #endif
 
 /**
- * batadv_iv_gw_dump_entry - Dump a gateway into a message
+ * batadv_iv_gw_dump_entry() - Dump a gateway into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2775,7 +2777,7 @@ out:
 }
 
 /**
- * batadv_iv_gw_dump - Dump gateways into a message
+ * batadv_iv_gw_dump() - Dump gateways into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  * @bat_priv: The bat priv with all the soft interface information
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 16709552c21e..14ec3677c391 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -100,7 +100,7 @@ static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_v_iface_update_mac - react to hard-interface MAC address change
+ * batadv_v_iface_update_mac() - react to hard-interface MAC address change
  * @hard_iface: the modified interface
  *
  * If the modified interface is the primary one, update the originator
@@ -131,7 +131,7 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_v_orig_print_neigh - print neighbors for the originator table
+ * batadv_v_orig_print_neigh() - print neighbors for the originator table
  * @orig_node: the orig_node for which the neighbors are printed
  * @if_outgoing: outgoing interface for these entries
  * @seq: debugfs table seq_file struct
@@ -161,7 +161,7 @@ batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_v_hardif_neigh_print - print a single ELP neighbour node
+ * batadv_v_hardif_neigh_print() - print a single ELP neighbour node
  * @seq: neighbour table seq_file struct
  * @hardif_neigh: hardif neighbour information
  */
@@ -182,7 +182,7 @@ batadv_v_hardif_neigh_print(struct seq_file *seq,
 }
 
 /**
- * batadv_v_neigh_print - print the single hop neighbour list
+ * batadv_v_neigh_print() - print the single hop neighbour list
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: neighbour table seq_file struct
  */
@@ -216,7 +216,7 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
 #endif
 
 /**
- * batadv_v_neigh_dump_neigh - Dump a neighbour into a message
+ * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -259,7 +259,7 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_v_neigh_dump_hardif - Dump the  neighbours of a hard interface  into
+ * batadv_v_neigh_dump_hardif() - Dump the  neighbours of a hard interface into
  *  a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
@@ -297,7 +297,7 @@ batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_v_neigh_dump - Dump the neighbours of a hard interface  into a
+ * batadv_v_neigh_dump() - Dump the neighbours of a hard interface  into a
  *  message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
@@ -349,7 +349,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_v_orig_print - print the originator table
+ * batadv_v_orig_print() - print the originator table
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: debugfs table seq_file struct
  * @if_outgoing: the outgoing interface for which this should be printed
@@ -417,8 +417,7 @@ next:
 #endif
 
 /**
- * batadv_v_orig_dump_subentry - Dump an originator subentry into a
- *  message
+ * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -484,7 +483,7 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_v_orig_dump_entry - Dump an originator entry into a message
+ * batadv_v_orig_dump_entry() - Dump an originator entry into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -537,8 +536,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_v_orig_dump_bucket - Dump an originator bucket into a
- *  message
+ * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -579,7 +577,7 @@ batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_v_orig_dump - Dump the originators into a message
+ * batadv_v_orig_dump() - Dump the originators into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  * @bat_priv: The bat priv with all the soft interface information
@@ -669,7 +667,7 @@ err_ifinfo1:
 }
 
 /**
- * batadv_v_init_sel_class - initialize GW selection class
+ * batadv_v_init_sel_class() - initialize GW selection class
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
@@ -705,7 +703,7 @@ static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff)
 }
 
 /**
- * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW
+ * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW
  * @gw_node: the GW to retrieve the metric for
  * @bw: the pointer where the metric will be stored. The metric is computed as
  *  the minimum between the GW advertised throughput and the path throughput to
@@ -748,7 +746,7 @@ out:
 }
 
 /**
- * batadv_v_gw_get_best_gw_node - retrieve the best GW node
+ * batadv_v_gw_get_best_gw_node() - retrieve the best GW node
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: the GW node having the best GW-metric, NULL if no GW is known
@@ -786,7 +784,7 @@ next:
 }
 
 /**
- * batadv_v_gw_is_eligible - check if a originator would be selected as GW
+ * batadv_v_gw_is_eligible() - check if a originator would be selected as GW
  * @bat_priv: the bat priv with all the soft interface information
  * @curr_gw_orig: originator representing the currently selected GW
  * @orig_node: the originator representing the new candidate
@@ -885,7 +883,7 @@ out:
 }
 
 /**
- * batadv_v_gw_print - print the gateway list
+ * batadv_v_gw_print() - print the gateway list
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: gateway table seq_file struct
  */
@@ -914,7 +912,7 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv,
 #endif
 
 /**
- * batadv_v_gw_dump_entry - Dump a gateway into a message
+ * batadv_v_gw_dump_entry() - Dump a gateway into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -1005,7 +1003,7 @@ out:
 }
 
 /**
- * batadv_v_gw_dump - Dump gateways into a message
+ * batadv_v_gw_dump() - Dump gateways into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  * @bat_priv: The bat priv with all the soft interface information
@@ -1075,7 +1073,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 };
 
 /**
- * batadv_v_hardif_init - initialize the algorithm specific fields in the
+ * batadv_v_hardif_init() - initialize the algorithm specific fields in the
  *  hard-interface object
  * @hard_iface: the hard-interface to initialize
  */
@@ -1089,7 +1087,7 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a
+ * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a
  *  mesh
  * @bat_priv: the object representing the mesh interface to initialise
  *
@@ -1107,7 +1105,7 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh
+ * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh
  * @bat_priv: the object representing the mesh interface to free
  */
 void batadv_v_mesh_free(struct batadv_priv *bat_priv)
@@ -1116,7 +1114,7 @@ void batadv_v_mesh_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_v_init - B.A.T.M.A.N. V initialization function
+ * batadv_v_init() - B.A.T.M.A.N. V initialization function
  *
  * Description: Takes care of initializing all the subcomponents.
  * It is invoked upon module load only.
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 92b56bb5686d..59ae96cef596 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -53,7 +53,7 @@
 #include "send.h"
 
 /**
- * batadv_v_elp_start_timer - restart timer for ELP periodic work
+ * batadv_v_elp_start_timer() - restart timer for ELP periodic work
  * @hard_iface: the interface for which the timer has to be reset
  */
 static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
@@ -68,7 +68,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_v_elp_get_throughput - get the throughput towards a neighbour
+ * batadv_v_elp_get_throughput() - get the throughput towards a neighbour
  * @neigh: the neighbour for which the throughput has to be obtained
  *
  * Return: The throughput towards the given neighbour in multiples of 100kpbs
@@ -154,8 +154,8 @@ default_throughput:
 }
 
 /**
- * batadv_v_elp_throughput_metric_update - worker updating the throughput metric
- *  of a single hop neighbour
+ * batadv_v_elp_throughput_metric_update() - worker updating the throughput
+ *  metric of a single hop neighbour
  * @work: the work queue item
  */
 void batadv_v_elp_throughput_metric_update(struct work_struct *work)
@@ -178,7 +178,7 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work)
 }
 
 /**
- * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour
+ * batadv_v_elp_wifi_neigh_probe() - send link probing packets to a neighbour
  * @neigh: the neighbour to probe
  *
  * Sends a predefined number of unicast wifi packets to a given neighbour in
@@ -241,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
 }
 
 /**
- * batadv_v_elp_periodic_work - ELP periodic task per interface
+ * batadv_v_elp_periodic_work() - ELP periodic task per interface
  * @work: work queue item
  *
  * Emits broadcast ELP message in regular intervals.
@@ -328,7 +328,7 @@ out:
 }
 
 /**
- * batadv_v_elp_iface_enable - setup the ELP interface private resources
+ * batadv_v_elp_iface_enable() - setup the ELP interface private resources
  * @hard_iface: interface for which the data has to be prepared
  *
  * Return: 0 on success or a -ENOMEM in case of failure.
@@ -376,7 +376,7 @@ out:
 }
 
 /**
- * batadv_v_elp_iface_disable - release ELP interface private resources
+ * batadv_v_elp_iface_disable() - release ELP interface private resources
  * @hard_iface: interface for which the resources have to be released
  */
 void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
@@ -388,7 +388,7 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given
+ * batadv_v_elp_iface_activate() - update the ELP buffer belonging to the given
  *  hard-interface
  * @primary_iface: the new primary interface
  * @hard_iface: interface holding the to-be-updated buffer
@@ -409,7 +409,7 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
 }
 
 /**
- * batadv_v_elp_primary_iface_set - change internal data to reflect the new
+ * batadv_v_elp_primary_iface_set() - change internal data to reflect the new
  *  primary interface
  * @primary_iface: the new primary interface
  */
@@ -429,7 +429,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
 }
 
 /**
- * batadv_v_elp_neigh_update - update an ELP neighbour node
+ * batadv_v_elp_neigh_update() - update an ELP neighbour node
  * @bat_priv: the bat priv with all the soft interface information
  * @neigh_addr: the neighbour interface address
  * @if_incoming: the interface the packet was received through
@@ -489,7 +489,7 @@ orig_free:
 }
 
 /**
- * batadv_v_elp_packet_recv - main ELP packet handler
+ * batadv_v_elp_packet_recv() - main ELP packet handler
  * @skb: the received packet
  * @if_incoming: the interface this packet was received through
  *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 7055a9483788..e415974c540d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -52,7 +52,7 @@
 #include "tvlv.h"
 
 /**
- * batadv_v_ogm_orig_get - retrieve and possibly create an originator node
+ * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address of the originator
  *
@@ -89,7 +89,7 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_v_ogm_start_timer - restart the OGM sending timer
+ * batadv_v_ogm_start_timer() - restart the OGM sending timer
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
@@ -108,7 +108,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_v_ogm_send_to_if - send a batman ogm using a given interface
+ * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface
  * @skb: the OGM to send
  * @hard_iface: the interface to use to send the OGM
  */
@@ -128,7 +128,7 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
 }
 
 /**
- * batadv_v_ogm_send - periodic worker broadcasting the own OGM
+ * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
  * @work: work queue item
  */
 static void batadv_v_ogm_send(struct work_struct *work)
@@ -236,7 +236,7 @@ out:
 }
 
 /**
- * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V
+ * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V
  * @hard_iface: the interface to prepare
  *
  * Takes care of scheduling own OGM sending routine for this interface.
@@ -253,7 +253,7 @@ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_v_ogm_primary_iface_set - set a new primary interface
+ * batadv_v_ogm_primary_iface_set() - set a new primary interface
  * @primary_iface: the new primary interface
  */
 void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
@@ -269,8 +269,8 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
 }
 
 /**
- * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded
- *  with B.A.T.M.A.N. V OGMs
+ * batadv_v_forward_penalty() - apply a penalty to the throughput metric
+ *  forwarded with B.A.T.M.A.N. V OGMs
  * @bat_priv: the bat priv with all the soft interface information
  * @if_incoming: the interface where the OGM has been received
  * @if_outgoing: the interface where the OGM has to be forwarded to
@@ -315,7 +315,7 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_v_ogm_forward - check conditions and forward an OGM to the given
+ * batadv_v_ogm_forward() - check conditions and forward an OGM to the given
  *  outgoing interface
  * @bat_priv: the bat priv with all the soft interface information
  * @ogm_received: previously received OGM to be forwarded
@@ -406,7 +406,7 @@ out:
 }
 
 /**
- * batadv_v_ogm_metric_update - update route metric based on OGM
+ * batadv_v_ogm_metric_update() - update route metric based on OGM
  * @bat_priv: the bat priv with all the soft interface information
  * @ogm2: OGM2 structure
  * @orig_node: Originator structure for which the OGM has been received
@@ -491,7 +491,7 @@ out:
 }
 
 /**
- * batadv_v_ogm_route_update - update routes based on OGM
+ * batadv_v_ogm_route_update() - update routes based on OGM
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: the Ethernet header of the OGM2
  * @ogm2: OGM2 structure
@@ -591,7 +591,7 @@ out:
 }
 
 /**
- * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if
+ * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: the Ethernet header of the OGM2
  * @ogm2: OGM2 structure
@@ -640,7 +640,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated
+ * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated
  * @buff_pos: current position in the skb
  * @packet_len: total length of the skb
  * @tvlv_len: tvlv length of the previously considered OGM
@@ -660,7 +660,7 @@ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
 }
 
 /**
- * batadv_v_ogm_process - process an incoming batman v OGM
+ * batadv_v_ogm_process() - process an incoming batman v OGM
  * @skb: the skb containing the OGM
  * @ogm_offset: offset to the OGM which should be processed (for aggregates)
  * @if_incoming: the interface where this packet was receved
@@ -788,7 +788,7 @@ out:
 }
 
 /**
- * batadv_v_ogm_packet_recv - OGM2 receiving handler
+ * batadv_v_ogm_packet_recv() - OGM2 receiving handler
  * @skb: the received OGM
  * @if_incoming: the interface where this OGM has been received
  *
@@ -852,7 +852,7 @@ free_skb:
 }
 
 /**
- * batadv_v_ogm_init - initialise the OGM2 engine
+ * batadv_v_ogm_init() - initialise the OGM2 engine
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success or a negative error code in case of failure
@@ -885,7 +885,7 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_v_ogm_free - free OGM private resources
+ * batadv_v_ogm_free() - free OGM private resources
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_v_ogm_free(struct batadv_priv *bat_priv)
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 125817c389e5..bdc1ef06e05b 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -33,7 +33,7 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
 }
 
 /**
- * batadv_bit_get_packet - receive and process one packet within the sequence
+ * batadv_bit_get_packet() - receive and process one packet within the sequence
  *  number window
  * @priv: the bat priv with all the soft interface information
  * @seq_bits: pointer to the sequence number receive packet
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 8cb2c874f5d3..0508353fa28d 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -27,7 +27,7 @@
 #include <linux/types.h>
 
 /**
- * batadv_test_bit - check if bit is set in the current window
+ * batadv_test_bit() - check if bit is set in the current window
  *
  * @seq_bits: pointer to the sequence number receive packet
  * @last_seqno: latest sequence number in seq_bits
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index ce124a21fce5..e647450e5d0f 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -70,7 +70,7 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv,
 			 struct batadv_bla_backbone_gw *backbone_gw);
 
 /**
- * batadv_choose_claim - choose the right bucket for a claim.
+ * batadv_choose_claim() - choose the right bucket for a claim.
  * @data: data to hash
  * @size: size of the hash table
  *
@@ -88,7 +88,7 @@ static inline u32 batadv_choose_claim(const void *data, u32 size)
 }
 
 /**
- * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway.
+ * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway.
  * @data: data to hash
  * @size: size of the hash table
  *
@@ -106,7 +106,7 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
 }
 
 /**
- * batadv_compare_backbone_gw - compare address and vid of two backbone gws
+ * batadv_compare_backbone_gw() - compare address and vid of two backbone gws
  * @node: list node of the first entry to compare
  * @data2: pointer to the second backbone gateway
  *
@@ -130,7 +130,7 @@ static bool batadv_compare_backbone_gw(const struct hlist_node *node,
 }
 
 /**
- * batadv_compare_claim - compare address and vid of two claims
+ * batadv_compare_claim() - compare address and vid of two claims
  * @node: list node of the first entry to compare
  * @data2: pointer to the second claims
  *
@@ -154,7 +154,7 @@ static bool batadv_compare_claim(const struct hlist_node *node,
 }
 
 /**
- * batadv_backbone_gw_release - release backbone gw from lists and queue for
+ * batadv_backbone_gw_release() - release backbone gw from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the backbone gw
  */
@@ -169,7 +169,7 @@ static void batadv_backbone_gw_release(struct kref *ref)
 }
 
 /**
- * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly
+ * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly
  *  release it
  * @backbone_gw: backbone gateway to be free'd
  */
@@ -179,8 +179,8 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
 }
 
 /**
- * batadv_claim_release - release claim from lists and queue for free after rcu
- *  grace period
+ * batadv_claim_release() - release claim from lists and queue for free after
+ *  rcu grace period
  * @ref: kref pointer of the claim
  */
 static void batadv_claim_release(struct kref *ref)
@@ -205,8 +205,7 @@ static void batadv_claim_release(struct kref *ref)
 }
 
 /**
- * batadv_claim_put - decrement the claim refcounter and possibly
- *  release it
+ * batadv_claim_put() - decrement the claim refcounter and possibly release it
  * @claim: claim to be free'd
  */
 static void batadv_claim_put(struct batadv_bla_claim *claim)
@@ -215,7 +214,7 @@ static void batadv_claim_put(struct batadv_bla_claim *claim)
 }
 
 /**
- * batadv_claim_hash_find - looks for a claim in the claim hash
+ * batadv_claim_hash_find() - looks for a claim in the claim hash
  * @bat_priv: the bat priv with all the soft interface information
  * @data: search data (may be local/static data)
  *
@@ -254,7 +253,7 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_backbone_hash_find - looks for a backbone gateway in the hash
+ * batadv_backbone_hash_find() - looks for a backbone gateway in the hash
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address of the originator
  * @vid: the VLAN ID
@@ -298,7 +297,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
 }
 
 /**
- * batadv_bla_del_backbone_claims - delete all claims for a backbone
+ * batadv_bla_del_backbone_claims() - delete all claims for a backbone
  * @backbone_gw: backbone gateway where the claims should be removed
  */
 static void
@@ -338,7 +337,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
 }
 
 /**
- * batadv_bla_send_claim - sends a claim frame according to the provided info
+ * batadv_bla_send_claim() - sends a claim frame according to the provided info
  * @bat_priv: the bat priv with all the soft interface information
  * @mac: the mac address to be announced within the claim
  * @vid: the VLAN ID
@@ -458,7 +457,7 @@ out:
 }
 
 /**
- * batadv_bla_loopdetect_report - worker for reporting the loop
+ * batadv_bla_loopdetect_report() - worker for reporting the loop
  * @work: work queue item
  *
  * Throws an uevent, as the loopdetect check function can't do that itself
@@ -488,7 +487,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work)
 }
 
 /**
- * batadv_bla_get_backbone_gw - finds or creates a backbone gateway
+ * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the mac address of the originator
  * @vid: the VLAN ID
@@ -561,7 +560,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
 }
 
 /**
- * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN
+ * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the selected primary interface
  * @vid: VLAN identifier
@@ -587,7 +586,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_answer_request - answer a bla request by sending own claims
+ * batadv_bla_answer_request() - answer a bla request by sending own claims
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: interface where the request came on
  * @vid: the vid where the request came on
@@ -637,7 +636,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_send_request - send a request to repeat claims
+ * batadv_bla_send_request() - send a request to repeat claims
  * @backbone_gw: the backbone gateway from whom we are out of sync
  *
  * When the crc is wrong, ask the backbone gateway for a full table update.
@@ -664,7 +663,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
 }
 
 /**
- * batadv_bla_send_announce - Send an announcement frame
+ * batadv_bla_send_announce() - Send an announcement frame
  * @bat_priv: the bat priv with all the soft interface information
  * @backbone_gw: our backbone gateway which should be announced
  */
@@ -685,7 +684,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_add_claim - Adds a claim in the claim hash
+ * batadv_bla_add_claim() - Adds a claim in the claim hash
  * @bat_priv: the bat priv with all the soft interface information
  * @mac: the mac address of the claim
  * @vid: the VLAN ID of the frame
@@ -775,7 +774,7 @@ claim_free_ref:
 }
 
 /**
- * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of
+ * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of
  *  claim
  * @claim: claim whose backbone_gw should be returned
  *
@@ -795,7 +794,7 @@ batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim)
 }
 
 /**
- * batadv_bla_del_claim - delete a claim from the claim hash
+ * batadv_bla_del_claim() - delete a claim from the claim hash
  * @bat_priv: the bat priv with all the soft interface information
  * @mac: mac address of the claim to be removed
  * @vid: VLAN id for the claim to be removed
@@ -823,7 +822,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_handle_announce - check for ANNOUNCE frame
+ * batadv_handle_announce() - check for ANNOUNCE frame
  * @bat_priv: the bat priv with all the soft interface information
  * @an_addr: announcement mac address (ARP Sender HW address)
  * @backbone_addr: originator address of the sender (Ethernet source MAC)
@@ -881,7 +880,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
 }
 
 /**
- * batadv_handle_request - check for REQUEST frame
+ * batadv_handle_request() - check for REQUEST frame
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary hard interface of this batman soft interface
  * @backbone_addr: backbone address to be requested (ARP sender HW MAC)
@@ -914,7 +913,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_handle_unclaim - check for UNCLAIM frame
+ * batadv_handle_unclaim() - check for UNCLAIM frame
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary hard interface of this batman soft interface
  * @backbone_addr: originator address of the backbone (Ethernet source)
@@ -952,7 +951,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_handle_claim - check for CLAIM frame
+ * batadv_handle_claim() - check for CLAIM frame
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary hard interface of this batman soft interface
  * @backbone_addr: originator address of the backbone (Ethernet Source)
@@ -989,7 +988,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_check_claim_group - check for claim group membership
+ * batadv_check_claim_group() - check for claim group membership
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary interface of this batman interface
  * @hw_src: the Hardware source in the ARP Header
@@ -1064,7 +1063,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_process_claim - Check if this is a claim frame, and process it
+ * batadv_bla_process_claim() - Check if this is a claim frame, and process it
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary hard interface of this batman soft interface
  * @skb: the frame to be checked
@@ -1206,7 +1205,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or
+ * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or
  *  immediately
  * @bat_priv: the bat priv with all the soft interface information
  * @now: whether the whole hash shall be wiped now
@@ -1259,7 +1258,7 @@ purge_now:
 }
 
 /**
- * batadv_bla_purge_claims - Remove claims after a timeout or immediately
+ * batadv_bla_purge_claims() - Remove claims after a timeout or immediately
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the selected primary interface, may be NULL if now is set
  * @now: whether the whole hash shall be wiped now
@@ -1317,7 +1316,7 @@ skip:
 }
 
 /**
- * batadv_bla_update_orig_address - Update the backbone gateways when the own
+ * batadv_bla_update_orig_address() - Update the backbone gateways when the own
  *  originator address changes
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the new selected primary_if
@@ -1373,7 +1372,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_send_loopdetect - send a loopdetect frame
+ * batadv_bla_send_loopdetect() - send a loopdetect frame
  * @bat_priv: the bat priv with all the soft interface information
  * @backbone_gw: the backbone gateway for which a loop should be detected
  *
@@ -1393,7 +1392,7 @@ batadv_bla_send_loopdetect(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_status_update - purge bla interfaces if necessary
+ * batadv_bla_status_update() - purge bla interfaces if necessary
  * @net_dev: the soft interface net device
  */
 void batadv_bla_status_update(struct net_device *net_dev)
@@ -1413,7 +1412,7 @@ void batadv_bla_status_update(struct net_device *net_dev)
 }
 
 /**
- * batadv_bla_periodic_work - performs periodic bla work
+ * batadv_bla_periodic_work() - performs periodic bla work
  * @work: kernel work struct
  *
  * periodic work to do:
@@ -1518,7 +1517,7 @@ static struct lock_class_key batadv_claim_hash_lock_class_key;
 static struct lock_class_key batadv_backbone_hash_lock_class_key;
 
 /**
- * batadv_bla_init - initialize all bla structures
+ * batadv_bla_init() - initialize all bla structures
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success, < 0 on error.
@@ -1580,7 +1579,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup.
+ * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup.
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: contains the bcast_packet to be checked
  *
@@ -1653,7 +1652,7 @@ out:
 }
 
 /**
- * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for
+ * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for
  *  the VLAN identified by vid.
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: originator mac address
@@ -1693,7 +1692,7 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
 }
 
 /**
- * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN.
+ * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN
  * @skb: the frame to be checked
  * @orig_node: the orig_node of the frame
  * @hdr_size: maximum length of the frame
@@ -1727,7 +1726,7 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
 }
 
 /**
- * batadv_bla_free - free all bla structures
+ * batadv_bla_free() - free all bla structures
  * @bat_priv: the bat priv with all the soft interface information
  *
  * for softinterface free or module unload
@@ -1754,7 +1753,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_bla_loopdetect_check - check and handle a detected loop
+ * batadv_bla_loopdetect_check() - check and handle a detected loop
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the packet to check
  * @primary_if: interface where the request came on
@@ -1803,7 +1802,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
 }
 
 /**
- * batadv_bla_rx - check packets coming from the mesh.
+ * batadv_bla_rx() - check packets coming from the mesh.
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
@@ -1915,7 +1914,7 @@ out:
 }
 
 /**
- * batadv_bla_tx - check packets going into the mesh
+ * batadv_bla_tx() - check packets going into the mesh
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
@@ -2023,7 +2022,7 @@ out:
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file
+ * batadv_bla_claim_table_seq_print_text() - print the claim table in a seq file
  * @seq: seq file to print on
  * @offset: not used
  *
@@ -2085,7 +2084,7 @@ out:
 #endif
 
 /**
- * batadv_bla_claim_dump_entry - dump one entry of the claim table
+ * batadv_bla_claim_dump_entry() - dump one entry of the claim table
  * to a netlink socket
  * @msg: buffer for the message
  * @portid: netlink port
@@ -2144,7 +2143,7 @@ out:
 }
 
 /**
- * batadv_bla_claim_dump_bucket - dump one bucket of the claim table
+ * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table
  * to a netlink socket
  * @msg: buffer for the message
  * @portid: netlink port
@@ -2181,7 +2180,7 @@ unlock:
 }
 
 /**
- * batadv_bla_claim_dump - dump claim table to a netlink socket
+ * batadv_bla_claim_dump() - dump claim table to a netlink socket
  * @msg: buffer for the message
  * @cb: callback structure containing arguments
  *
@@ -2248,8 +2247,8 @@ out:
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
- *  file
+ * batadv_bla_backbone_table_seq_print_text() - print the backbone table in a
+ *  seq file
  * @seq: seq file to print on
  * @offset: not used
  *
@@ -2313,8 +2312,8 @@ out:
 #endif
 
 /**
- * batadv_bla_backbone_dump_entry - dump one entry of the backbone table
- * to a netlink socket
+ * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a
+ *  netlink socket
  * @msg: buffer for the message
  * @portid: netlink port
  * @seq: Sequence number of netlink message
@@ -2374,8 +2373,8 @@ out:
 }
 
 /**
- * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table
- * to a netlink socket
+ * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to
+ *  a netlink socket
  * @msg: buffer for the message
  * @portid: netlink port
  * @seq: Sequence number of netlink message
@@ -2411,7 +2410,7 @@ unlock:
 }
 
 /**
- * batadv_bla_backbone_dump - dump backbone table to a netlink socket
+ * batadv_bla_backbone_dump() - dump backbone table to a netlink socket
  * @msg: buffer for the message
  * @cb: callback structure containing arguments
  *
@@ -2478,7 +2477,7 @@ out:
 
 #ifdef CONFIG_BATMAN_ADV_DAT
 /**
- * batadv_bla_check_claim - check if address is claimed
+ * batadv_bla_check_claim() - check if address is claimed
  *
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: mac address of which the claim status is checked
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index b568cec819c5..b27571abcd2f 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -31,8 +31,8 @@ struct seq_file;
 struct sk_buff;
 
 /**
- * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
- *  frame sent by bridge loop avoidance
+ * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop
+ *  detect frame sent by bridge loop avoidance
  * @mac: mac address to check
  *
  * Return: true if the it looks like a loop detect frame
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index fddf16a3dc89..97d6eb45cbf2 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -66,8 +66,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file)
 }
 
 /**
- * batadv_originators_hardif_open - handles debugfs output for the
- *  originator table of an hard interface
+ * batadv_originators_hardif_open() - handles debugfs output for the originator
+ *  table of a hard interface
  * @inode: inode pointer to debugfs file
  * @file: pointer to the seq_file
  *
@@ -117,7 +117,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
 
 #ifdef CONFIG_BATMAN_ADV_DAT
 /**
- * batadv_dat_cache_open - Prepare file handler for reads from dat_chache
+ * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
  * @inode: inode which was opened
  * @file: file handle to be initialized
  *
@@ -154,7 +154,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
 
 #ifdef CONFIG_BATMAN_ADV_MCAST
 /**
- * batadv_mcast_flags_open - prepare file handler for reads from mcast_flags
+ * batadv_mcast_flags_open() - prepare file handler for reads from mcast_flags
  * @inode: inode which was opened
  * @file: file handle to be initialized
  *
@@ -296,7 +296,7 @@ void batadv_debugfs_destroy(void)
 }
 
 /**
- * batadv_debugfs_add_hardif - creates the base directory for a hard interface
+ * batadv_debugfs_add_hardif() - creates the base directory for a hard interface
  *  in debugfs.
  * @hard_iface: hard interface which should be added.
  *
@@ -338,7 +338,7 @@ out:
 }
 
 /**
- * batadv_debugfs_del_hardif - delete the base directory for a hard interface
+ * batadv_debugfs_del_hardif() - delete the base directory for a hard interface
  *  in debugfs.
  * @hard_iface: hard interface which is deleted.
  */
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index f704bbc76e2a..9703c791ffc5 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -56,7 +56,7 @@
 static void batadv_dat_purge(struct work_struct *work);
 
 /**
- * batadv_dat_start_timer - initialise the DAT periodic worker
+ * batadv_dat_start_timer() - initialise the DAT periodic worker
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
@@ -67,7 +67,7 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_dat_entry_release - release dat_entry from lists and queue for free
+ * batadv_dat_entry_release() - release dat_entry from lists and queue for free
  *  after rcu grace period
  * @ref: kref pointer of the dat_entry
  */
@@ -81,7 +81,7 @@ static void batadv_dat_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly
+ * batadv_dat_entry_put() - decrement the dat_entry refcounter and possibly
  *  release it
  * @dat_entry: dat_entry to be free'd
  */
@@ -91,7 +91,7 @@ static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry)
 }
 
 /**
- * batadv_dat_to_purge - check whether a dat_entry has to be purged or not
+ * batadv_dat_to_purge() - check whether a dat_entry has to be purged or not
  * @dat_entry: the entry to check
  *
  * Return: true if the entry has to be purged now, false otherwise.
@@ -103,7 +103,7 @@ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
 }
 
 /**
- * __batadv_dat_purge - delete entries from the DAT local storage
+ * __batadv_dat_purge() - delete entries from the DAT local storage
  * @bat_priv: the bat priv with all the soft interface information
  * @to_purge: function in charge to decide whether an entry has to be purged or
  *	      not. This function takes the dat_entry as argument and has to
@@ -146,8 +146,8 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_dat_purge - periodic task that deletes old entries from the local DAT
- * hash table
+ * batadv_dat_purge() - periodic task that deletes old entries from the local
+ *  DAT hash table
  * @work: kernel work struct
  */
 static void batadv_dat_purge(struct work_struct *work)
@@ -165,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work)
 }
 
 /**
- * batadv_compare_dat - comparing function used in the local DAT hash table
+ * batadv_compare_dat() - comparing function used in the local DAT hash table
  * @node: node in the local table
  * @data2: second object to compare the node to
  *
@@ -180,7 +180,7 @@ static bool batadv_compare_dat(const struct hlist_node *node, const void *data2)
 }
 
 /**
- * batadv_arp_hw_src - extract the hw_src field from an ARP packet
+ * batadv_arp_hw_src() - extract the hw_src field from an ARP packet
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
@@ -197,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
 }
 
 /**
- * batadv_arp_ip_src - extract the ip_src field from an ARP packet
+ * batadv_arp_ip_src() - extract the ip_src field from an ARP packet
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
@@ -209,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
 }
 
 /**
- * batadv_arp_hw_dst - extract the hw_dst field from an ARP packet
+ * batadv_arp_hw_dst() - extract the hw_dst field from an ARP packet
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
@@ -221,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
 }
 
 /**
- * batadv_arp_ip_dst - extract the ip_dst field from an ARP packet
+ * batadv_arp_ip_dst() - extract the ip_dst field from an ARP packet
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
@@ -233,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
 }
 
 /**
- * batadv_hash_dat - compute the hash value for an IP address
+ * batadv_hash_dat() - compute the hash value for an IP address
  * @data: data to hash
  * @size: size of the hash table
  *
@@ -268,7 +268,7 @@ static u32 batadv_hash_dat(const void *data, u32 size)
 }
 
 /**
- * batadv_dat_entry_hash_find - look for a given dat_entry in the local hash
+ * batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash
  * table
  * @bat_priv: the bat priv with all the soft interface information
  * @ip: search key
@@ -311,7 +311,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
 }
 
 /**
- * batadv_dat_entry_add - add a new dat entry or update it if already exists
+ * batadv_dat_entry_add() - add a new dat entry or update it if already exists
  * @bat_priv: the bat priv with all the soft interface information
  * @ip: ipv4 to add/edit
  * @mac_addr: mac address to assign to the given ipv4
@@ -368,7 +368,8 @@ out:
 #ifdef CONFIG_BATMAN_ADV_DEBUG
 
 /**
- * batadv_dbg_arp - print a debug message containing all the ARP packet details
+ * batadv_dbg_arp() - print a debug message containing all the ARP packet
+ *  details
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
@@ -449,7 +450,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
 #endif /* CONFIG_BATMAN_ADV_DEBUG */
 
 /**
- * batadv_is_orig_node_eligible - check whether a node can be a DHT candidate
+ * batadv_is_orig_node_eligible() - check whether a node can be a DHT candidate
  * @res: the array with the already selected candidates
  * @select: number of already selected candidates
  * @tmp_max: address of the currently evaluated node
@@ -503,7 +504,7 @@ out:
 }
 
 /**
- * batadv_choose_next_candidate - select the next DHT candidate
+ * batadv_choose_next_candidate() - select the next DHT candidate
  * @bat_priv: the bat priv with all the soft interface information
  * @cands: candidates array
  * @select: number of candidates already present in the array
@@ -567,8 +568,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_dat_select_candidates - select the nodes which the DHT message has to
- * be sent to
+ * batadv_dat_select_candidates() - select the nodes which the DHT message has
+ *  to be sent to
  * @bat_priv: the bat priv with all the soft interface information
  * @ip_dst: ipv4 to look up in the DHT
  * @vid: VLAN identifier
@@ -613,7 +614,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
 }
 
 /**
- * batadv_dat_send_data - send a payload to the selected candidates
+ * batadv_dat_send_data() - send a payload to the selected candidates
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @ip: the DHT key
@@ -689,7 +690,7 @@ out:
 }
 
 /**
- * batadv_dat_tvlv_container_update - update the dat tvlv container after dat
+ * batadv_dat_tvlv_container_update() - update the dat tvlv container after dat
  *  setting change
  * @bat_priv: the bat priv with all the soft interface information
  */
@@ -711,7 +712,7 @@ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_dat_status_update - update the dat tvlv container after dat
+ * batadv_dat_status_update() - update the dat tvlv container after dat
  *  setting change
  * @net_dev: the soft interface net device
  */
@@ -723,7 +724,7 @@ void batadv_dat_status_update(struct net_device *net_dev)
 }
 
 /**
- * batadv_dat_tvlv_ogm_handler_v1 - process incoming dat tvlv container
+ * batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node of the ogm
  * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -742,7 +743,7 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_dat_hash_free - free the local DAT hash table
+ * batadv_dat_hash_free() - free the local DAT hash table
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
@@ -758,7 +759,7 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_dat_init - initialise the DAT internals
+ * batadv_dat_init() - initialise the DAT internals
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 in case of success, a negative error code otherwise
@@ -783,7 +784,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_dat_free - free the DAT internals
+ * batadv_dat_free() - free the DAT internals
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_dat_free(struct batadv_priv *bat_priv)
@@ -798,7 +799,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_dat_cache_seq_print_text - print the local DAT hash table
+ * batadv_dat_cache_seq_print_text() - print the local DAT hash table
  * @seq: seq file to print on
  * @offset: not used
  *
@@ -851,7 +852,7 @@ out:
 #endif
 
 /**
- * batadv_arp_get_type - parse an ARP packet and gets the type
+ * batadv_arp_get_type() - parse an ARP packet and get the type
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to analyse
  * @hdr_size: size of the possible header before the ARP packet in the skb
@@ -925,7 +926,7 @@ out:
 }
 
 /**
- * batadv_dat_get_vid - extract the VLAN identifier from skb if any
+ * batadv_dat_get_vid() - extract the VLAN identifier from skb if any
  * @skb: the buffer containing the packet to extract the VID from
  * @hdr_size: the size of the batman-adv header encapsulating the packet
  *
@@ -951,7 +952,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
 }
 
 /**
- * batadv_dat_arp_create_reply - create an ARP Reply
+ * batadv_dat_arp_create_reply() - create an ARP Reply
  * @bat_priv: the bat priv with all the soft interface information
  * @ip_src: ARP sender IP
  * @ip_dst: ARP target IP
@@ -986,7 +987,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
 }
 
 /**
- * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
+ * batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to
  * answer using DAT
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
@@ -1084,7 +1085,7 @@ out:
 }
 
 /**
- * batadv_dat_snoop_incoming_arp_request - snoop the ARP request and try to
+ * batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to
  * answer using the local DAT storage
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
@@ -1154,7 +1155,7 @@ out:
 }
 
 /**
- * batadv_dat_snoop_outgoing_arp_reply - snoop the ARP reply and fill the DHT
+ * batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
  */
@@ -1194,8 +1195,8 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_dat_snoop_incoming_arp_reply - snoop the ARP reply and fill the local
- * DAT storage only
+ * batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the
+ *  local DAT storage only
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
  * @hdr_size: size of the encapsulation header
@@ -1283,8 +1284,8 @@ out:
 }
 
 /**
- * batadv_dat_drop_broadcast_packet - check if an ARP request has to be dropped
- * (because the node has already obtained the reply via DAT) or not
+ * batadv_dat_drop_broadcast_packet() - check if an ARP request has to be
+ *  dropped (because the node has already obtained the reply via DAT) or not
  * @bat_priv: the bat priv with all the soft interface information
  * @forw_packet: the broadcast packet
  *
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index d81a05a6e6f9..3d47bedaf661 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -49,7 +49,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
 				      struct batadv_forw_packet *forw_packet);
 
 /**
- * batadv_dat_init_orig_node_addr - assign a DAT address to the orig_node
+ * batadv_dat_init_orig_node_addr() - assign a DAT address to the orig_node
  * @orig_node: the node to assign the DAT address to
  */
 static inline void
@@ -62,7 +62,7 @@ batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
 }
 
 /**
- * batadv_dat_init_own_addr - assign a DAT address to the node itself
+ * batadv_dat_init_own_addr() - assign a DAT address to the node itself
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: a pointer to the primary interface
  */
@@ -83,7 +83,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv);
 int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
 
 /**
- * batadv_dat_inc_counter - increment the correct DAT packet counter
+ * batadv_dat_inc_counter() - increment the correct DAT packet counter
  * @bat_priv: the bat priv with all the soft interface information
  * @subtype: the 4addr subtype of the packet to be counted
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 741c6b91664e..36f8a84153bf 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -42,7 +42,7 @@
 #include "soft-interface.h"
 
 /**
- * batadv_frag_clear_chain - delete entries in the fragment buffer chain
+ * batadv_frag_clear_chain() - delete entries in the fragment buffer chain
  * @head: head of chain with entries.
  * @dropped: whether the chain is cleared because all fragments are dropped
  *
@@ -66,7 +66,7 @@ static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
 }
 
 /**
- * batadv_frag_purge_orig - free fragments associated to an orig
+ * batadv_frag_purge_orig() - free fragments associated to an orig
  * @orig_node: originator to free fragments from
  * @check_cb: optional function to tell if an entry should be purged
  */
@@ -90,7 +90,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_frag_size_limit - maximum possible size of packet to be fragmented
+ * batadv_frag_size_limit() - maximum possible size of packet to be fragmented
  *
  * Return: the maximum size of payload that can be fragmented.
  */
@@ -105,7 +105,7 @@ static int batadv_frag_size_limit(void)
 }
 
 /**
- * batadv_frag_init_chain - check and prepare fragment chain for new fragment
+ * batadv_frag_init_chain() - check and prepare fragment chain for new fragment
  * @chain: chain in fragments table to init
  * @seqno: sequence number of the received fragment
  *
@@ -135,7 +135,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
 }
 
 /**
- * batadv_frag_insert_packet - insert a fragment into a fragment chain
+ * batadv_frag_insert_packet() - insert a fragment into a fragment chain
  * @orig_node: originator that the fragment was received from
  * @skb: skb to insert
  * @chain_out: list head to attach complete chains of fragments to
@@ -249,7 +249,7 @@ err:
 }
 
 /**
- * batadv_frag_merge_packets - merge a chain of fragments
+ * batadv_frag_merge_packets() - merge a chain of fragments
  * @chain: head of chain with fragments
  *
  * Expand the first skb in the chain and copy the content of the remaining
@@ -307,7 +307,7 @@ free:
 }
 
 /**
- * batadv_frag_skb_buffer - buffer fragment for later merge
+ * batadv_frag_skb_buffer() - buffer fragment for later merge
  * @skb: skb to buffer
  * @orig_node_src: originator that the skb is received from
  *
@@ -347,7 +347,7 @@ out_err:
 }
 
 /**
- * batadv_frag_skb_fwd - forward fragments that would exceed MTU when merged
+ * batadv_frag_skb_fwd() - forward fragments that would exceed MTU when merged
  * @skb: skb to forward
  * @recv_if: interface that the skb is received on
  * @orig_node_src: originator that the skb is received from
@@ -401,7 +401,7 @@ out:
 }
 
 /**
- * batadv_frag_create - create a fragment from skb
+ * batadv_frag_create() - create a fragment from skb
  * @skb: skb to create fragment from
  * @frag_head: header to use in new fragment
  * @fragment_size: size of new fragment
@@ -439,7 +439,7 @@ err:
 }
 
 /**
- * batadv_frag_send_packet - create up to 16 fragments from the passed skb
+ * batadv_frag_send_packet() - create up to 16 fragments from the passed skb
  * @skb: skb to create fragments from
  * @orig_node: final destination of the created fragments
  * @neigh_node: next-hop of the created fragments
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 30ffa992fcfc..138b22a1836a 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -40,7 +40,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 			    struct batadv_neigh_node *neigh_node);
 
 /**
- * batadv_frag_check_entry - check if a list of fragments has timed out
+ * batadv_frag_check_entry() - check if a list of fragments has timed out
  * @frags_entry: table entry to check
  *
  * Return: true if the frags entry has timed out, false otherwise.
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 21db0165175b..6731f7dabeb9 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -69,8 +69,8 @@
 #define BATADV_DHCP_CHADDR_OFFSET	28
 
 /**
- * batadv_gw_node_release - release gw_node from lists and queue for free after
- *  rcu grace period
+ * batadv_gw_node_release() - release gw_node from lists and queue for free
+ *  after rcu grace period
  * @ref: kref pointer of the gw_node
  */
 static void batadv_gw_node_release(struct kref *ref)
@@ -84,7 +84,8 @@ static void batadv_gw_node_release(struct kref *ref)
 }
 
 /**
- * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it
+ * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release
+ *  it
  * @gw_node: gateway node to free
  */
 void batadv_gw_node_put(struct batadv_gw_node *gw_node)
@@ -156,7 +157,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_gw_reselect - force a gateway reselection
+ * batadv_gw_reselect() - force a gateway reselection
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Set a flag to remind the GW component to perform a new gateway reselection.
@@ -172,7 +173,7 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_gw_check_client_stop - check if client mode has been switched off
+ * batadv_gw_check_client_stop() - check if client mode has been switched off
  * @bat_priv: the bat priv with all the soft interface information
  *
  * This function assumes the caller has checked that the gw state *is actually
@@ -322,7 +323,7 @@ out:
 }
 
 /**
- * batadv_gw_node_add - add gateway node to list of available gateways
+ * batadv_gw_node_add() - add gateway node to list of available gateways
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: originator announcing gateway capabilities
  * @gateway: announced bandwidth information
@@ -365,7 +366,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_gw_node_get - retrieve gateway node from list of available gateways
+ * batadv_gw_node_get() - retrieve gateway node from list of available gateways
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: originator announcing gateway capabilities
  *
@@ -394,7 +395,7 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_gw_node_update - update list of available gateways with changed
+ * batadv_gw_node_update() - update list of available gateways with changed
  *  bandwidth information
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: originator announcing gateway capabilities
@@ -515,7 +516,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
 #endif
 
 /**
- * batadv_gw_dump - Dump gateways into a message
+ * batadv_gw_dump() - Dump gateways into a message
  * @msg: Netlink message to dump into
  * @cb: Control block containing additional options
  *
@@ -568,7 +569,7 @@ out:
 }
 
 /**
- * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message
+ * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message
  * @skb: the packet to check
  * @header_len: a pointer to the batman-adv header size
  * @chaddr: buffer where the client address will be stored. Valid
@@ -687,7 +688,8 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
 }
 
 /**
- * batadv_gw_out_of_range - check if the dhcp request destination is the best gw
+ * batadv_gw_out_of_range() - check if the dhcp request destination is the best
+ *  gateway
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the outgoing packet
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index a7039503d88e..1c58727835ca 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -34,8 +34,8 @@
 #include "tvlv.h"
 
 /**
- * batadv_parse_throughput - parse supplied string buffer to extract throughput
- *  information
+ * batadv_parse_throughput() - parse supplied string buffer to extract
+ *  throughput information
  * @net_dev: the soft interface net device
  * @buff: string buffer to parse
  * @description: text shown when throughput string cannot be parsed
@@ -101,8 +101,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
 }
 
 /**
- * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download
- *  and upload bandwidth information
+ * batadv_parse_gw_bandwidth() - parse supplied string buffer to extract
+ *  download and upload bandwidth information
  * @net_dev: the soft interface net device
  * @buff: string buffer to parse
  * @down: pointer holding the returned download bandwidth information
@@ -137,8 +137,8 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
 }
 
 /**
- * batadv_gw_tvlv_container_update - update the gw tvlv container after gateway
- *  setting change
+ * batadv_gw_tvlv_container_update() - update the gw tvlv container after
+ *  gateway setting change
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
@@ -208,7 +208,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
 }
 
 /**
- * batadv_gw_tvlv_ogm_handler_v1 - process incoming gateway tvlv container
+ * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node of the ogm
  * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -249,7 +249,7 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_gw_init - initialise the gateway handling internals
+ * batadv_gw_init() - initialise the gateway handling internals
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_gw_init(struct batadv_priv *bat_priv)
@@ -265,7 +265,7 @@ void batadv_gw_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_gw_free - free the gateway handling internals
+ * batadv_gw_free() - free the gateway handling internals
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_gw_free(struct batadv_priv *bat_priv)
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 394e69b77535..33425a022026 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -53,7 +53,7 @@
 #include "translation-table.h"
 
 /**
- * batadv_hardif_release - release hard interface from lists and queue for
+ * batadv_hardif_release() - release hard interface from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the hard interface
  */
@@ -87,7 +87,7 @@ out:
 }
 
 /**
- * batadv_getlink_net - return link net namespace (of use fallback)
+ * batadv_getlink_net() - return link net namespace (or use fallback)
  * @netdev: net_device to check
  * @fallback_net: return in case get_link_net is not available for @netdev
  *
@@ -106,7 +106,7 @@ static struct net *batadv_getlink_net(const struct net_device *netdev,
 }
 
 /**
- * batadv_mutual_parents - check if two devices are each others parent
+ * batadv_mutual_parents() - check if two devices are each other's parent
  * @dev1: 1st net dev
  * @net1: 1st devices netns
  * @dev2: 2nd net dev
@@ -139,7 +139,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1,
 }
 
 /**
- * batadv_is_on_batman_iface - check if a device is a batman iface descendant
+ * batadv_is_on_batman_iface() - check if a device is a batman iface descendant
  * @net_dev: the device to check
  *
  * If the user creates any virtual device on top of a batman-adv interface, it
@@ -203,7 +203,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev)
 }
 
 /**
- * batadv_get_real_netdevice - check if the given netdev struct is a virtual
+ * batadv_get_real_netdevice() - check if the given netdev struct is a virtual
  *  interface on top of another 'real' interface
  * @netdev: the device to check
  *
@@ -247,7 +247,7 @@ out:
 }
 
 /**
- * batadv_get_real_netdev - check if the given net_device struct is a virtual
+ * batadv_get_real_netdev() - check if the given net_device struct is a virtual
  *  interface on top of another 'real' interface
  * @net_device: the device to check
  *
@@ -266,7 +266,7 @@ struct net_device *batadv_get_real_netdev(struct net_device *net_device)
 }
 
 /**
- * batadv_is_wext_netdev - check if the given net_device struct is a
+ * batadv_is_wext_netdev() - check if the given net_device struct is a
  *  wext wifi interface
  * @net_device: the device to check
  *
@@ -290,7 +290,7 @@ static bool batadv_is_wext_netdev(struct net_device *net_device)
 }
 
 /**
- * batadv_is_cfg80211_netdev - check if the given net_device struct is a
+ * batadv_is_cfg80211_netdev() - check if the given net_device struct is a
  *  cfg80211 wifi interface
  * @net_device: the device to check
  *
@@ -310,7 +310,7 @@ static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
 }
 
 /**
- * batadv_wifi_flags_evaluate - calculate wifi flags for net_device
+ * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device
  * @net_device: the device to check
  *
  * Return: batadv_hard_iface_wifi_flags flags of the device
@@ -345,7 +345,7 @@ out:
 }
 
 /**
- * batadv_is_cfg80211_hardif - check if the given hardif is a cfg80211 wifi
+ * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi
  *  interface
  * @hard_iface: the device to check
  *
@@ -363,7 +363,7 @@ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_is_wifi_hardif - check if the given hardif is a wifi interface
+ * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface
  * @hard_iface: the device to check
  *
  * Return: true if the net device is a 802.11 wireless device, false otherwise.
@@ -377,7 +377,7 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_hardif_no_broadcast - check whether (re)broadcast is necessary
+ * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary
  * @if_outgoing: the outgoing interface checked and considered for (re)broadcast
  * @orig_addr: the originator of this packet
  * @orig_neigh: originator address of the forwarder we just got the packet from
@@ -668,7 +668,7 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
 }
 
 /**
- * batadv_master_del_slave - remove hard_iface from the current master interface
+ * batadv_master_del_slave() - remove hard_iface from the current master iface
  * @slave: the interface enslaved in another master
  * @master: the master from which slave has to be removed
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index ac7311a91f9d..a7f9036f0e3a 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -83,7 +83,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
 			       u8 *orig_addr, u8 *orig_neigh);
 
 /**
- * batadv_hardif_put - decrement the hard interface refcounter and possibly
+ * batadv_hardif_put() - decrement the hard interface refcounter and possibly
  *  release it
  * @hard_iface: the hard interface to free
  */
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 81cf54eb2fad..c92fde593959 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -63,7 +63,7 @@ void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
 void batadv_hash_destroy(struct batadv_hashtable *hash);
 
 /**
- *	batadv_hash_add - adds data to the hashtable
+ *	batadv_hash_add() - adds data to the hashtable
  *	@hash: storage hash table
  *	@compare: callback to determine if 2 hash elements are identical
  *	@choose: callback calculating the hash index
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 71ba58cc51fa..8af5d30e59b1 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -335,7 +335,7 @@ err:
 }
 
 /**
- * batadv_socket_add_packet - schedule an icmp packet to be sent to
+ * batadv_socket_add_packet() - schedule an icmp packet to be sent to
  *  userspace on an icmp socket.
  * @socket_client: the socket this packet belongs to
  * @icmph: pointer to the header of the icmp packet
@@ -392,7 +392,7 @@ static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
 }
 
 /**
- * batadv_socket_receive_packet - schedule an icmp packet to be received
+ * batadv_socket_receive_packet() - schedule an icmp packet to be received
  *  locally and sent to userspace.
  * @icmph: pointer to the header of the icmp packet
  * @icmp_len: total length of the icmp packet
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 6f6c500e8aa8..e6e1f5eae494 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -256,8 +256,8 @@ void batadv_mesh_free(struct net_device *soft_iface)
 }
 
 /**
- * batadv_is_my_mac - check if the given mac address belongs to any of the real
- * interfaces in the current mesh
+ * batadv_is_my_mac() - check if the given mac address belongs to any of the
+ *  real interfaces in the current mesh
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address to check
  *
@@ -287,7 +287,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_seq_print_text_primary_if_get - called from debugfs table printing
+ * batadv_seq_print_text_primary_if_get() - called from debugfs table printing
  *  function that requires the primary interface
  * @seq: debugfs table seq_file struct
  *
@@ -324,7 +324,7 @@ out:
 #endif
 
 /**
- * batadv_max_header_len - calculate maximum encapsulation overhead for a
+ * batadv_max_header_len() - calculate maximum encapsulation overhead for a
  *  payload packet
  *
  * Return: the maximum encapsulation overhead in bytes.
@@ -349,7 +349,7 @@ int batadv_max_header_len(void)
 }
 
 /**
- * batadv_skb_set_priority - sets skb priority according to packet content
+ * batadv_skb_set_priority() - sets skb priority according to packet content
  * @skb: the packet to be sent
  * @offset: offset to the packet content
  *
@@ -559,7 +559,7 @@ void batadv_recv_handler_unregister(u8 packet_type)
 }
 
 /**
- * batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in
+ * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in
  *  the header
  * @skb: skb pointing to fragmented socket buffers
  * @payload_ptr: Pointer to position inside the head buffer of the skb
@@ -592,7 +592,7 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
 }
 
 /**
- * batadv_get_vid - extract the VLAN identifier from skb if any
+ * batadv_get_vid() - extract the VLAN identifier from skb if any
  * @skb: the buffer containing the packet
  * @header_len: length of the batman header preceding the ethernet header
  *
@@ -619,7 +619,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
 }
 
 /**
- * batadv_vlan_ap_isola_get - return the AP isolation status for the given vlan
+ * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier for which the AP isolation attributed as to be
  *  looked up
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 4bdb39ab3b20..7f6a3123e1a4 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -203,7 +203,7 @@ struct seq_file;
 struct sk_buff;
 
 /**
- * batadv_print_vid - return printable version of vid information
+ * batadv_print_vid() - return printable version of vid information
  * @vid: the VLAN identifier
  *
  * Return: -1 when no VLAN is used, VLAN id otherwise
@@ -239,7 +239,7 @@ void batadv_recv_handler_unregister(u8 packet_type);
 __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
 
 /**
- * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses
+ * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses
  * @data1: Pointer to a six-byte array containing the Ethernet address
  * @data2: Pointer other six-byte array containing the Ethernet address
  *
@@ -253,7 +253,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2)
 }
 
 /**
- * batadv_has_timed_out - compares current time (jiffies) and timestamp +
+ * batadv_has_timed_out() - compares current time (jiffies) and timestamp +
  *  timeout
  * @timestamp:		base value to compare with (in jiffies)
  * @timeout:		added to base value before comparing (in milliseconds)
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index d8617c2794db..8a503c526b90 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -66,7 +66,7 @@
 static void batadv_mcast_mla_update(struct work_struct *work);
 
 /**
- * batadv_mcast_start_timer - schedule the multicast periodic worker
+ * batadv_mcast_start_timer() - schedule the multicast periodic worker
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
@@ -76,7 +76,7 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
+ * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists
  * @soft_iface: netdev struct of the mesh interface
  *
  * If the given soft interface has a bridge on top then the refcount
@@ -102,7 +102,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
 }
 
 /**
- * batadv_mcast_mla_softif_get - get softif multicast listeners
+ * batadv_mcast_mla_softif_get() - get softif multicast listeners
  * @dev: the device to collect multicast addresses from
  * @mcast_list: a list to put found addresses into
  *
@@ -148,7 +148,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
 }
 
 /**
- * batadv_mcast_mla_is_duplicate - check whether an address is in a list
+ * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
  * @mcast_addr: the multicast address to check
  * @mcast_list: the list with multicast addresses to search in
  *
@@ -168,7 +168,7 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
 }
 
 /**
- * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address
+ * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address
  * @dst: destination to write to - a multicast MAC address
  * @src: source to read from - a multicast IP address
  *
@@ -192,7 +192,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
 }
 
 /**
- * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners
+ * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners
  * @dev: a bridge slave whose bridge to collect multicast addresses from
  * @mcast_list: a list to put found addresses into
  *
@@ -245,7 +245,7 @@ out:
 }
 
 /**
- * batadv_mcast_mla_list_free - free a list of multicast addresses
+ * batadv_mcast_mla_list_free() - free a list of multicast addresses
  * @mcast_list: the list to free
  *
  * Removes and frees all items in the given mcast_list.
@@ -262,7 +262,7 @@ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
 }
 
 /**
- * batadv_mcast_mla_tt_retract - clean up multicast listener announcements
+ * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements
  * @bat_priv: the bat priv with all the soft interface information
  * @mcast_list: a list of addresses which should _not_ be removed
  *
@@ -298,7 +298,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_mla_tt_add - add multicast listener announcements
+ * batadv_mcast_mla_tt_add() - add multicast listener announcements
  * @bat_priv: the bat priv with all the soft interface information
  * @mcast_list: a list of addresses which are going to get added
  *
@@ -334,7 +334,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_has_bridge - check whether the soft-iface is bridged
+ * batadv_mcast_has_bridge() - check whether the soft-iface is bridged
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Checks whether there is a bridge on top of our soft interface.
@@ -355,7 +355,8 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_querier_log - debug output regarding the querier status on link
+ * batadv_mcast_querier_log() - debug output regarding the querier status on
+ *  link
  * @bat_priv: the bat priv with all the soft interface information
  * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD")
  * @old_state: the previous querier state on our link
@@ -406,7 +407,8 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
 }
 
 /**
- * batadv_mcast_bridge_log - debug output for topology changes in bridged setups
+ * batadv_mcast_bridge_log() - debug output for topology changes in bridged
+ *  setups
  * @bat_priv: the bat priv with all the soft interface information
  * @bridged: a flag about whether the soft interface is currently bridged or not
  * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier
@@ -445,7 +447,7 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged,
 }
 
 /**
- * batadv_mcast_flags_logs - output debug information about mcast flag changes
+ * batadv_mcast_flags_log() - output debug information about mcast flag changes
  * @bat_priv: the bat priv with all the soft interface information
  * @flags: flags indicating the new multicast state
  *
@@ -471,7 +473,7 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
 }
 
 /**
- * batadv_mcast_mla_tvlv_update - update multicast tvlv
+ * batadv_mcast_mla_tvlv_update() - update multicast tvlv
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Updates the own multicast tvlv with our current multicast related settings,
@@ -546,7 +548,7 @@ update:
 }
 
 /**
- * __batadv_mcast_mla_update - update the own MLAs
+ * __batadv_mcast_mla_update() - update the own MLAs
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Updates the own multicast listener announcements in the translation
@@ -583,7 +585,7 @@ out:
 }
 
 /**
- * batadv_mcast_mla_update - update the own MLAs
+ * batadv_mcast_mla_update() - update the own MLAs
  * @work: kernel work struct
  *
  * Updates the own multicast listener announcements in the translation
@@ -606,7 +608,7 @@ static void batadv_mcast_mla_update(struct work_struct *work)
 }
 
 /**
- * batadv_mcast_is_report_ipv4 - check for IGMP reports
+ * batadv_mcast_is_report_ipv4() - check for IGMP reports
  * @skb: the ethernet frame destined for the mesh
  *
  * This call might reallocate skb data.
@@ -631,7 +633,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
 }
 
 /**
- * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential
+ * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding
+ *  potential
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the IPv4 packet to check
  * @is_unsnoopable: stores whether the destination is snoopable
@@ -672,7 +675,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_is_report_ipv6 - check for MLD reports
+ * batadv_mcast_is_report_ipv6() - check for MLD reports
  * @skb: the ethernet frame destined for the mesh
  *
  * This call might reallocate skb data.
@@ -696,7 +699,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
 }
 
 /**
- * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential
+ * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding
+ *  potential
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the IPv6 packet to check
  * @is_unsnoopable: stores whether the destination is snoopable
@@ -737,7 +741,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_forw_mode_check - check for optimized forwarding potential
+ * batadv_mcast_forw_mode_check() - check for optimized forwarding potential
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the multicast frame to check
  * @is_unsnoopable: stores whether the destination is snoopable
@@ -775,7 +779,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast
+ * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast
  *  interest
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: ethernet header of a packet
@@ -799,7 +803,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_forw_tt_node_get - get a multicast tt node
+ * batadv_mcast_forw_tt_node_get() - get a multicast tt node
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: the ether header containing the multicast destination
  *
@@ -815,7 +819,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag
+ * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
@@ -842,7 +846,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag
+ * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
@@ -869,7 +873,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag
+ * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: an ethernet header to determine the protocol family from
  *
@@ -893,7 +897,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag
+ * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
@@ -920,7 +924,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_forw_mode - check on how to forward a multicast packet
+ * batadv_mcast_forw_mode() - check on how to forward a multicast packet
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: The multicast packet to check
  * @orig: an originator to be set to forward the skb to
@@ -974,7 +978,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
 }
 
 /**
- * batadv_mcast_want_unsnoop_update - update unsnoop counter and list
+ * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node which multicast state might have changed of
  * @mcast_flags: flags indicating the new multicast state
@@ -1019,7 +1023,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_want_ipv4_update - update want-all-ipv4 counter and list
+ * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node which multicast state might have changed of
  * @mcast_flags: flags indicating the new multicast state
@@ -1064,7 +1068,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_want_ipv6_update - update want-all-ipv6 counter and list
+ * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node which multicast state might have changed of
  * @mcast_flags: flags indicating the new multicast state
@@ -1109,7 +1113,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container
+ * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node of the ogm
  * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -1165,7 +1169,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_init - initialize the multicast optimizations structures
+ * batadv_mcast_init() - initialize the multicast optimizations structures
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_mcast_init(struct batadv_priv *bat_priv)
@@ -1180,7 +1184,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_mcast_flags_print_header - print own mcast flags to debugfs table
+ * batadv_mcast_flags_print_header() - print own mcast flags to debugfs table
  * @bat_priv: the bat priv with all the soft interface information
  * @seq: debugfs table seq_file struct
  *
@@ -1221,7 +1225,7 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes
+ * batadv_mcast_flags_seq_print_text() - print the mcast flags of other nodes
  * @seq: seq file to print on
  * @offset: not used
  *
@@ -1282,7 +1286,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
 #endif
 
 /**
- * batadv_mcast_free - free the multicast optimizations structures
+ * batadv_mcast_free() - free the multicast optimizations structures
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_mcast_free(struct batadv_priv *bat_priv)
@@ -1297,7 +1301,7 @@ void batadv_mcast_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_purge_orig - reset originator global mcast state modifications
+ * batadv_mcast_purge_orig() - reset originator global mcast state modifications
  * @orig: the originator which is going to get purged
  */
 void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index f7281685633c..103d4bdcdbdb 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -100,7 +100,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 };
 
 /**
- * batadv_netlink_get_ifindex - Extract an interface index from a message
+ * batadv_netlink_get_ifindex() - Extract an interface index from a message
  * @nlh: Message header
  * @attrtype: Attribute which holds an interface index
  *
@@ -115,7 +115,7 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
 }
 
 /**
- * batadv_netlink_mesh_info_put - fill in generic information about mesh
+ * batadv_netlink_mesh_info_put() - fill in generic information about mesh
  *  interface
  * @msg: netlink message to be sent back
  * @soft_iface: interface for which the data should be taken
@@ -170,7 +170,7 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
 }
 
 /**
- * batadv_netlink_get_mesh_info - handle incoming BATADV_CMD_GET_MESH_INFO
+ * batadv_netlink_get_mesh_info() - handle incoming BATADV_CMD_GET_MESH_INFO
  *  netlink request
  * @skb: received netlink message
  * @info: receiver information
@@ -231,7 +231,7 @@ batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info)
 }
 
 /**
- * batadv_netlink_tp_meter_put - Fill information of started tp_meter session
+ * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session
  * @msg: netlink message to be sent back
  * @cookie: tp meter session cookie
  *
@@ -247,7 +247,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie)
 }
 
 /**
- * batadv_netlink_tpmeter_notify - send tp_meter result via netlink to client
+ * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: destination of tp_meter session
  * @result: reason for tp meter session stop
@@ -310,7 +310,7 @@ err_genlmsg:
 }
 
 /**
- * batadv_netlink_tp_meter_start - Start a new tp_meter session
+ * batadv_netlink_tp_meter_start() - Start a new tp_meter session
  * @skb: received netlink message
  * @info: receiver information
  *
@@ -387,7 +387,7 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
 }
 
 /**
- * batadv_netlink_tp_meter_start - Cancel a running tp_meter session
+ * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session
  * @skb: received netlink message
  * @info: receiver information
  *
@@ -432,7 +432,7 @@ out:
 }
 
 /**
- * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message
+ * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -474,7 +474,7 @@ batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_netlink_dump_hardifs - Dump all hard interface into a messages
+ * batadv_netlink_dump_hardifs() - Dump all hard interfaces into a message
  * @msg: Netlink message to dump into
  * @cb: Parameters from query
  *
@@ -621,7 +621,7 @@ struct genl_family batadv_netlink_family __ro_after_init = {
 };
 
 /**
- * batadv_netlink_register - register batadv genl netlink family
+ * batadv_netlink_register() - register batadv genl netlink family
  */
 void __init batadv_netlink_register(void)
 {
@@ -633,7 +633,7 @@ void __init batadv_netlink_register(void)
 }
 
 /**
- * batadv_netlink_unregister - unregister batadv genl netlink family
+ * batadv_netlink_unregister() - unregister batadv genl netlink family
  */
 void batadv_netlink_unregister(void)
 {
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index bd421408d9e7..3758be7fd881 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -67,7 +67,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
 				       struct batadv_hard_iface *recv_if);
 
 /**
- * batadv_nc_init - one-time initialization for network coding
+ * batadv_nc_init() - one-time initialization for network coding
  *
  * Return: 0 on success or negative error number in case of failure
  */
@@ -83,7 +83,7 @@ int __init batadv_nc_init(void)
 }
 
 /**
- * batadv_nc_start_timer - initialise the nc periodic worker
+ * batadv_nc_start_timer() - initialise the nc periodic worker
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
@@ -93,7 +93,7 @@ static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_nc_tvlv_container_update - update the network coding tvlv container
+ * batadv_nc_tvlv_container_update() - update the network coding tvlv container
  *  after network coding setting change
  * @bat_priv: the bat priv with all the soft interface information
  */
@@ -115,7 +115,7 @@ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_nc_status_update - update the network coding tvlv container after
+ * batadv_nc_status_update() - update the network coding tvlv container after
  *  network coding setting change
  * @net_dev: the soft interface net device
  */
@@ -127,7 +127,7 @@ void batadv_nc_status_update(struct net_device *net_dev)
 }
 
 /**
- * batadv_nc_tvlv_ogm_handler_v1 - process incoming nc tvlv container
+ * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node of the ogm
  * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -146,7 +146,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_mesh_init - initialise coding hash table and start house keeping
+ * batadv_nc_mesh_init() - initialise coding hash table and start house keeping
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success or negative error number in case of failure
@@ -187,7 +187,7 @@ err:
 }
 
 /**
- * batadv_nc_init_bat_priv - initialise the nc specific bat_priv variables
+ * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
@@ -199,7 +199,7 @@ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_nc_init_orig - initialise the nc fields of an orig_node
+ * batadv_nc_init_orig() - initialise the nc fields of an orig_node
  * @orig_node: the orig_node which is going to be initialised
  */
 void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
@@ -211,8 +211,8 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
 }
 
 /**
- * batadv_nc_node_release - release nc_node from lists and queue for free after
- *  rcu grace period
+ * batadv_nc_node_release() - release nc_node from lists and queue for free
+ *  after rcu grace period
  * @ref: kref pointer of the nc_node
  */
 static void batadv_nc_node_release(struct kref *ref)
@@ -226,7 +226,7 @@ static void batadv_nc_node_release(struct kref *ref)
 }
 
 /**
- * batadv_nc_node_put - decrement the nc_node refcounter and possibly
+ * batadv_nc_node_put() - decrement the nc_node refcounter and possibly
  *  release it
  * @nc_node: nc_node to be free'd
  */
@@ -236,8 +236,8 @@ static void batadv_nc_node_put(struct batadv_nc_node *nc_node)
 }
 
 /**
- * batadv_nc_path_release - release nc_path from lists and queue for free after
- *  rcu grace period
+ * batadv_nc_path_release() - release nc_path from lists and queue for free
+ *  after rcu grace period
  * @ref: kref pointer of the nc_path
  */
 static void batadv_nc_path_release(struct kref *ref)
@@ -250,7 +250,7 @@ static void batadv_nc_path_release(struct kref *ref)
 }
 
 /**
- * batadv_nc_path_put - decrement the nc_path refcounter and possibly
+ * batadv_nc_path_put() - decrement the nc_path refcounter and possibly
  *  release it
  * @nc_path: nc_path to be free'd
  */
@@ -260,7 +260,7 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
 }
 
 /**
- * batadv_nc_packet_free - frees nc packet
+ * batadv_nc_packet_free() - frees nc packet
  * @nc_packet: the nc packet to free
  * @dropped: whether the packet is freed because is is dropped
  */
@@ -277,7 +277,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
 }
 
 /**
- * batadv_nc_to_purge_nc_node - checks whether an nc node has to be purged
+ * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_node: the nc node to check
  *
@@ -293,7 +293,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_to_purge_nc_path_coding - checks whether an nc path has timed out
+ * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path to check
  *
@@ -313,7 +313,8 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_to_purge_nc_path_decoding - checks whether an nc path has timed out
+ * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed
+ *  out
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path to check
  *
@@ -333,7 +334,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_purge_orig_nc_nodes - go through list of nc nodes and purge stale
+ * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale
  *  entries
  * @bat_priv: the bat priv with all the soft interface information
  * @list: list of nc nodes
@@ -371,7 +372,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_purge_orig - purges all nc node data attached of the given
+ * batadv_nc_purge_orig() - purges all nc node data attached of the given
  *  originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig_node with the nc node entries to be purged
@@ -397,8 +398,8 @@ void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_purge_orig_hash - traverse entire originator hash to check if they
- *  have timed out nc nodes
+ * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if
+ *  they have timed out nc nodes
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
@@ -424,7 +425,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_nc_purge_paths - traverse all nc paths part of the hash and remove
+ * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove
  *  unused ones
  * @bat_priv: the bat priv with all the soft interface information
  * @hash: hash table containing the nc paths to check
@@ -483,7 +484,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_hash_key_gen - computes the nc_path hash key
+ * batadv_nc_hash_key_gen() - computes the nc_path hash key
  * @key: buffer to hold the final hash key
  * @src: source ethernet mac address going into the hash key
  * @dst: destination ethernet mac address going into the hash key
@@ -496,7 +497,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
 }
 
 /**
- * batadv_nc_hash_choose - compute the hash value for an nc path
+ * batadv_nc_hash_choose() - compute the hash value for an nc path
  * @data: data to hash
  * @size: size of the hash table
  *
@@ -514,7 +515,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size)
 }
 
 /**
- * batadv_nc_hash_compare - comparing function used in the network coding hash
+ * batadv_nc_hash_compare() - comparing function used in the network coding hash
  *  tables
  * @node: node in the local table
  * @data2: second object to compare the node to
@@ -540,7 +541,7 @@ static bool batadv_nc_hash_compare(const struct hlist_node *node,
 }
 
 /**
- * batadv_nc_hash_find - search for an existing nc path and return it
+ * batadv_nc_hash_find() - search for an existing nc path and return it
  * @hash: hash table containing the nc path
  * @data: search key
  *
@@ -577,7 +578,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash,
 }
 
 /**
- * batadv_nc_send_packet - send non-coded packet and free nc_packet struct
+ * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct
  * @nc_packet: the nc packet to send
  */
 static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
@@ -588,7 +589,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
 }
 
 /**
- * batadv_nc_sniffed_purge - Checks timestamp of given sniffed nc_packet.
+ * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet.
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path the packet belongs to
  * @nc_packet: the nc packet to be checked
@@ -627,7 +628,7 @@ out:
 }
 
 /**
- * batadv_nc_fwd_flush - Checks the timestamp of the given nc packet.
+ * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet.
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path the packet belongs to
  * @nc_packet: the nc packet to be checked
@@ -665,8 +666,8 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_process_nc_paths - traverse given nc packet pool and free timed out
- *  nc packets
+ * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed
+ *  out nc packets
  * @bat_priv: the bat priv with all the soft interface information
  * @hash: to be processed hash table
  * @process_fn: Function called to process given nc packet. Should return true
@@ -711,7 +712,8 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_worker - periodic task for house keeping related to network coding
+ * batadv_nc_worker() - periodic task for house keeping related to network
+ *  coding
  * @work: kernel work struct
  */
 static void batadv_nc_worker(struct work_struct *work)
@@ -751,8 +753,8 @@ static void batadv_nc_worker(struct work_struct *work)
 }
 
 /**
- * batadv_can_nc_with_orig - checks whether the given orig node is suitable for
- *  coding or not
+ * batadv_can_nc_with_orig() - checks whether the given orig node is suitable
+ *  for coding or not
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: neighboring orig node which may be used as nc candidate
  * @ogm_packet: incoming ogm packet also used for the checks
@@ -792,7 +794,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_find_nc_node - search for an existing nc node and return it
+ * batadv_nc_find_nc_node() - search for an existing nc node and return it
  * @orig_node: orig node originating the ogm packet
  * @orig_neigh_node: neighboring orig node from which we received the ogm packet
  *  (can be equal to orig_node)
@@ -832,7 +834,7 @@ batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_nc_get_nc_node - retrieves an nc node or creates the entry if it was
+ * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was
  *  not found
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node originating the ogm packet
@@ -892,7 +894,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node
+ * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node
  *  structs (best called on incoming OGMs)
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node originating the ogm packet
@@ -947,7 +949,7 @@ out:
 }
 
 /**
- * batadv_nc_get_path - get existing nc_path or allocate a new one
+ * batadv_nc_get_path() - get existing nc_path or allocate a new one
  * @bat_priv: the bat priv with all the soft interface information
  * @hash: hash table containing the nc path
  * @src: ethernet source address - first half of the nc path search key
@@ -1008,7 +1010,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair
+ * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair
  *  selection of a receiver with slightly lower TQ than the other
  * @tq: to be weighted tq value
  *
@@ -1031,7 +1033,7 @@ static u8 batadv_nc_random_weight_tq(u8 tq)
 }
 
 /**
- * batadv_nc_memxor - XOR destination with source
+ * batadv_nc_memxor() - XOR destination with source
  * @dst: byte array to XOR into
  * @src: byte array to XOR from
  * @len: length of destination array
@@ -1045,7 +1047,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
 }
 
 /**
- * batadv_nc_code_packets - code a received unicast_packet with an nc packet
+ * batadv_nc_code_packets() - code a received unicast_packet with an nc packet
  *  into a coded_packet and send it
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: data skb to forward
@@ -1238,7 +1240,7 @@ out:
 }
 
 /**
- * batadv_nc_skb_coding_possible - true if a decoded skb is available at dst.
+ * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst.
  * @skb: data skb to forward
  * @dst: destination mac address of the other skb to code with
  * @src: source mac address of skb
@@ -1262,7 +1264,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
 }
 
 /**
- * batadv_nc_path_search - Find the coding path matching in_nc_node and
+ * batadv_nc_path_search() - Find the coding path matching in_nc_node and
  *  out_nc_node to retrieve a buffered packet that can be used for coding.
  * @bat_priv: the bat priv with all the soft interface information
  * @in_nc_node: pointer to skb next hop's neighbor nc node
@@ -1330,8 +1332,8 @@ batadv_nc_path_search(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_skb_src_search - Loops through the list of neighoring nodes of the
- *  skb's sender (may be equal to the originator).
+ * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of
+ *  the skb's sender (may be equal to the originator).
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: data skb to forward
  * @eth_dst: next hop mac address of skb
@@ -1376,7 +1378,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_skb_store_before_coding - set the ethernet src and dst of the
+ * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the
  *  unicast skb before it is stored for use in later decoding
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: data skb to store
@@ -1411,7 +1413,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_skb_dst_search - Loops through list of neighboring nodes to dst.
+ * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst.
  * @skb: data skb to forward
  * @neigh_node: next hop to forward packet to
  * @ethhdr: pointer to the ethernet header inside the skb
@@ -1469,7 +1471,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
 }
 
 /**
- * batadv_nc_skb_add_to_path - buffer skb for later encoding / decoding
+ * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding
  * @skb: skb to add to path
  * @nc_path: path to add skb to
  * @neigh_node: next hop to forward packet to
@@ -1504,7 +1506,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
 }
 
 /**
- * batadv_nc_skb_forward - try to code a packet or add it to the coding packet
+ * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet
  *  buffer
  * @skb: data skb to forward
  * @neigh_node: next hop to forward packet to
@@ -1561,8 +1563,8 @@ out:
 }
 
 /**
- * batadv_nc_skb_store_for_decoding - save a clone of the skb which can be used
- *  when decoding coded packets
+ * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be
+ *  used when decoding coded packets
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: data skb to store
  */
@@ -1622,7 +1624,7 @@ out:
 }
 
 /**
- * batadv_nc_skb_store_sniffed_unicast - check if a received unicast packet
+ * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet
  *  should be saved in the decoding buffer and, if so, store it there
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: unicast skb to store
@@ -1642,7 +1644,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_skb_decode_packet - decode given skb using the decode data stored
+ * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored
  *  in nc_packet
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: unicast skb to decode
@@ -1736,7 +1738,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
 }
 
 /**
- * batadv_nc_find_decoding_packet - search through buffered decoding data to
+ * batadv_nc_find_decoding_packet() - search through buffered decoding data to
  *  find the data needed to decode the coded packet
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: pointer to the ethernet header inside the coded packet
@@ -1801,7 +1803,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_nc_recv_coded_packet - try to decode coded packet and enqueue the
+ * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the
  *  resulting unicast packet
  * @skb: incoming coded packet
  * @recv_if: pointer to interface this packet was received on
@@ -1876,7 +1878,7 @@ free_skb:
 }
 
 /**
- * batadv_nc_mesh_free - clean up network coding memory
+ * batadv_nc_mesh_free() - clean up network coding memory
  * @bat_priv: the bat priv with all the soft interface information
  */
 void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
@@ -1893,7 +1895,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_nc_nodes_seq_print_text - print the nc node information
+ * batadv_nc_nodes_seq_print_text() - print the nc node information
  * @seq: seq file to print on
  * @offset: not used
  *
@@ -1956,7 +1958,7 @@ out:
 }
 
 /**
- * batadv_nc_init_debugfs - create nc folder and related files in debugfs
+ * batadv_nc_init_debugfs() - create nc folder and related files in debugfs
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success or negative error number in case of failure
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 007b6bd8df95..412a603b2fda 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -91,7 +91,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
 static void batadv_purge_orig(struct work_struct *work);
 
 /**
- * batadv_compare_orig - comparing function used in the originator hash table
+ * batadv_compare_orig() - comparing function used in the originator hash table
  * @node: node in the local table
  * @data2: second object to compare the node to
  *
@@ -106,7 +106,7 @@ bool batadv_compare_orig(const struct hlist_node *node, const void *data2)
 }
 
 /**
- * batadv_orig_node_vlan_get - get an orig_node_vlan object
+ * batadv_orig_node_vlan_get() - get an orig_node_vlan object
  * @orig_node: the originator serving the VLAN
  * @vid: the VLAN identifier
  *
@@ -137,7 +137,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_orig_node_vlan_new - search and possibly create an orig_node_vlan
+ * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan
  *  object
  * @orig_node: the originator serving the VLAN
  * @vid: the VLAN identifier
@@ -178,7 +178,7 @@ out:
 }
 
 /**
- * batadv_orig_node_vlan_release - release originator-vlan object from lists
+ * batadv_orig_node_vlan_release() - release originator-vlan object from lists
  *  and queue for free after rcu grace period
  * @ref: kref pointer of the originator-vlan object
  */
@@ -192,7 +192,7 @@ static void batadv_orig_node_vlan_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_node_vlan_put - decrement the refcounter and possibly release
+ * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release
  *  the originator-vlan object
  * @orig_vlan: the originator-vlan object to release
  */
@@ -226,7 +226,7 @@ err:
 }
 
 /**
- * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for
+ * batadv_neigh_ifinfo_release() - release neigh_ifinfo from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the neigh_ifinfo
  */
@@ -243,7 +243,7 @@ static void batadv_neigh_ifinfo_release(struct kref *ref)
 }
 
 /**
- * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release
+ * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release
  *  the neigh_ifinfo
  * @neigh_ifinfo: the neigh_ifinfo object to release
  */
@@ -253,7 +253,7 @@ void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
 }
 
 /**
- * batadv_hardif_neigh_release - release hardif neigh node from lists and
+ * batadv_hardif_neigh_release() - release hardif neigh node from lists and
  *  queue for free after rcu grace period
  * @ref: kref pointer of the neigh_node
  */
@@ -273,7 +273,7 @@ static void batadv_hardif_neigh_release(struct kref *ref)
 }
 
 /**
- * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter
+ * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter
  *  and possibly release it
  * @hardif_neigh: hardif neigh neighbor to free
  */
@@ -283,7 +283,7 @@ void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
 }
 
 /**
- * batadv_neigh_node_release - release neigh_node from lists and queue for
+ * batadv_neigh_node_release() - release neigh_node from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the neigh_node
  */
@@ -308,7 +308,7 @@ static void batadv_neigh_node_release(struct kref *ref)
 }
 
 /**
- * batadv_neigh_node_put - decrement the neighbors refcounter and possibly
+ * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly
  *  release it
  * @neigh_node: neigh neighbor to free
  */
@@ -318,7 +318,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
 }
 
 /**
- * batadv_orig_router_get - router to the originator depending on iface
+ * batadv_orig_router_get() - router to the originator depending on iface
  * @orig_node: the orig node for the router
  * @if_outgoing: the interface where the payload packet has been received or
  *  the OGM should be sent to
@@ -351,7 +351,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_orig_ifinfo_get - find the ifinfo from an orig_node
+ * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node
  * @orig_node: the orig node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
@@ -383,7 +383,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_orig_ifinfo_new - search and possibly create an orig_ifinfo object
+ * batadv_orig_ifinfo_new() - search and possibly create an orig_ifinfo object
  * @orig_node: the orig node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
@@ -429,7 +429,7 @@ out:
 }
 
 /**
- * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node
+ * batadv_neigh_ifinfo_get() - find the ifinfo from a neigh_node
  * @neigh: the neigh node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
@@ -462,7 +462,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
 }
 
 /**
- * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object
+ * batadv_neigh_ifinfo_new() - search and possibly create a neigh_ifinfo object
  * @neigh: the neigh node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
@@ -505,7 +505,7 @@ out:
 }
 
 /**
- * batadv_neigh_node_get - retrieve a neighbour from the list
+ * batadv_neigh_node_get() - retrieve a neighbour from the list
  * @orig_node: originator which the neighbour belongs to
  * @hard_iface: the interface where this neighbour is connected to
  * @addr: the address of the neighbour
@@ -542,7 +542,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_hardif_neigh_create - create a hardif neighbour node
+ * batadv_hardif_neigh_create() - create a hardif neighbour node
  * @hard_iface: the interface this neighbour is connected to
  * @neigh_addr: the interface address of the neighbour to retrieve
  * @orig_node: originator object representing the neighbour
@@ -588,7 +588,7 @@ out:
 }
 
 /**
- * batadv_hardif_neigh_get_or_create - retrieve or create a hardif neighbour
+ * batadv_hardif_neigh_get_or_create() - retrieve or create a hardif neighbour
  *  node
  * @hard_iface: the interface this neighbour is connected to
  * @neigh_addr: the interface address of the neighbour to retrieve
@@ -612,7 +612,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
 }
 
 /**
- * batadv_hardif_neigh_get - retrieve a hardif neighbour from the list
+ * batadv_hardif_neigh_get() - retrieve a hardif neighbour from the list
  * @hard_iface: the interface where this neighbour is connected to
  * @neigh_addr: the address of the neighbour
  *
@@ -644,7 +644,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
 }
 
 /**
- * batadv_neigh_node_create - create a neigh node object
+ * batadv_neigh_node_create() - create a neigh node object
  * @orig_node: originator object representing the neighbour
  * @hard_iface: the interface where the neighbour is connected to
  * @neigh_addr: the mac address of the neighbour interface
@@ -709,7 +709,7 @@ out:
 }
 
 /**
- * batadv_neigh_node_get_or_create - retrieve or create a neigh node object
+ * batadv_neigh_node_get_or_create() - retrieve or create a neigh node object
  * @orig_node: originator object representing the neighbour
  * @hard_iface: the interface where the neighbour is connected to
  * @neigh_addr: the mac address of the neighbour interface
@@ -733,7 +733,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list
+ * batadv_hardif_neigh_seq_print_text() - print the single hop neighbour list
  * @seq: neighbour table seq_file struct
  * @offset: not used
  *
@@ -768,8 +768,8 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
 #endif
 
 /**
- * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific
- *  outgoing interface
+ * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a
+ *  specific outgoing interface
  * @msg: message to dump into
  * @cb: parameters for the dump
  *
@@ -845,7 +845,7 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
 }
 
 /**
- * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
+ * batadv_orig_ifinfo_release() - release orig_ifinfo from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the orig_ifinfo
  */
@@ -868,7 +868,7 @@ static void batadv_orig_ifinfo_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_ifinfo_put - decrement the refcounter and possibly release
+ * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release
  *  the orig_ifinfo
  * @orig_ifinfo: the orig_ifinfo object to release
  */
@@ -878,7 +878,7 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
 }
 
 /**
- * batadv_orig_node_free_rcu - free the orig_node
+ * batadv_orig_node_free_rcu() - free the orig_node
  * @rcu: rcu pointer of the orig_node
  */
 static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
@@ -899,7 +899,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
 }
 
 /**
- * batadv_orig_node_release - release orig_node from lists and queue for
+ * batadv_orig_node_release() - release orig_node from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the orig_node
  */
@@ -950,7 +950,7 @@ static void batadv_orig_node_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_node_put - decrement the orig node refcounter and possibly
+ * batadv_orig_node_put() - decrement the orig node refcounter and possibly
  *  release it
  * @orig_node: the orig node to free
  */
@@ -992,7 +992,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_orig_node_new - creates a new orig_node
+ * batadv_orig_node_new() - creates a new orig_node
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the originator
  *
@@ -1071,7 +1071,7 @@ free_orig_node:
 }
 
 /**
- * batadv_purge_neigh_ifinfo - purge obsolete ifinfo entries from neighbor
+ * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor
  * @bat_priv: the bat priv with all the soft interface information
  * @neigh: orig node which is to be checked
  */
@@ -1112,7 +1112,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_purge_orig_ifinfo - purge obsolete ifinfo entries from originator
+ * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  *
@@ -1164,7 +1164,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_purge_orig_neighbors - purges neighbors from originator
+ * batadv_purge_orig_neighbors() - purges neighbors from originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  *
@@ -1222,7 +1222,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_find_best_neighbor - finds the best neighbor after purging
+ * batadv_find_best_neighbor() - finds the best neighbor after purging
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  * @if_outgoing: the interface for which the metric should be compared
@@ -1257,7 +1257,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_purge_orig_node - purges obsolete information from an orig_node
+ * batadv_purge_orig_node() - purges obsolete information from an orig_node
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  *
@@ -1409,7 +1409,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
 }
 
 /**
- * batadv_orig_hardif_seq_print_text - writes originator infos for a specific
+ * batadv_orig_hardif_seq_print_text() - writes originator infos for a specific
  *  outgoing interface
  * @seq: debugfs table seq_file struct
  * @offset: not used
@@ -1456,7 +1456,7 @@ out:
 #endif
 
 /**
- * batadv_orig_dump - Dump to netlink the originator infos for a specific
+ * batadv_orig_dump() - Dump to netlink the originator infos for a specific
  *  outgoing interface
  * @msg: message to dump into
  * @cb: parameters for the dump
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 86b0ea1e5c1c..01820be4ae5a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -55,7 +55,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 				       struct batadv_hard_iface *recv_if);
 
 /**
- * _batadv_update_route - set the router for this originator
+ * _batadv_update_route() - set the router for this originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be configured
  * @recv_if: the receive interface for which this route is set
@@ -119,7 +119,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_update_route - set the router for this originator
+ * batadv_update_route() - set the router for this originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be configured
  * @recv_if: the receive interface for which this route is set
@@ -146,7 +146,7 @@ out:
 }
 
 /**
- * batadv_window_protected - checks whether the host restarted and is in the
+ * batadv_window_protected() - checks whether the host restarted and is in the
  *  protection time.
  * @bat_priv: the bat priv with all the soft interface information
  * @seq_num_diff: difference between the current/received sequence number and
@@ -213,7 +213,7 @@ bool batadv_check_management_packet(struct sk_buff *skb,
 }
 
 /**
- * batadv_recv_my_icmp_packet - receive an icmp packet locally
+ * batadv_recv_my_icmp_packet() - receive an icmp packet locally
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: icmp packet to process
  *
@@ -441,7 +441,7 @@ free_skb:
 }
 
 /**
- * batadv_check_unicast_packet - Check for malformed unicast packets
+ * batadv_check_unicast_packet() - Check for malformed unicast packets
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
  * @hdr_size: size of header to pull
@@ -479,7 +479,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_last_bonding_get - Get last_bonding_candidate of orig_node
+ * batadv_last_bonding_get() - Get last_bonding_candidate of orig_node
  * @orig_node: originator node whose last bonding candidate should be retrieved
  *
  * Return: last bonding candidate of router or NULL if not found
@@ -502,7 +502,7 @@ batadv_last_bonding_get(struct batadv_orig_node *orig_node)
 }
 
 /**
- * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
+ * batadv_last_bonding_replace() - Replace last_bonding_candidate of orig_node
  * @orig_node: originator node whose bonding candidates should be replaced
  * @new_candidate: new bonding candidate or NULL
  */
@@ -525,7 +525,7 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_find_router - find a suitable router for this originator
+ * batadv_find_router() - find a suitable router for this originator
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the destination node
  * @recv_if: pointer to interface this packet was received on
@@ -742,7 +742,7 @@ free_skb:
 }
 
 /**
- * batadv_reroute_unicast_packet - update the unicast header for re-routing
+ * batadv_reroute_unicast_packet() - update the unicast header for re-routing
  * @bat_priv: the bat priv with all the soft interface information
  * @unicast_packet: the unicast header to be updated
  * @dst_addr: the payload destination
@@ -905,7 +905,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_recv_unhandled_unicast_packet - receive and process packets which
+ * batadv_recv_unhandled_unicast_packet() - receive and process packets which
  *	are in the unicast number space but not yet known to the implementation
  * @skb: unicast tvlv packet to process
  * @recv_if: pointer to interface this packet was received on
@@ -1037,7 +1037,7 @@ free_skb:
 }
 
 /**
- * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets
+ * batadv_recv_unicast_tvlv() - receive and process unicast tvlv packets
  * @skb: unicast tvlv packet to process
  * @recv_if: pointer to interface this packet was received on
  *
@@ -1091,7 +1091,7 @@ free_skb:
 }
 
 /**
- * batadv_recv_frag_packet - process received fragment
+ * batadv_recv_frag_packet() - process received fragment
  * @skb: the received fragment
  * @recv_if: interface that the skb is received on
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index c53b11d41d8b..0700b3dfb595 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -55,7 +55,7 @@
 static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
 
 /**
- * batadv_send_skb_packet - send an already prepared packet
+ * batadv_send_skb_packet() - send an already prepared packet
  * @skb: the packet to send
  * @hard_iface: the interface to use to send the broadcast packet
  * @dst_addr: the payload destination
@@ -154,7 +154,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
 }
 
 /**
- * batadv_send_skb_to_orig - Lookup next-hop and transmit skb.
+ * batadv_send_skb_to_orig() - Lookup next-hop and transmit skb.
  * @skb: Packet to be transmitted.
  * @orig_node: Final destination of the packet.
  * @recv_if: Interface used when receiving the packet (can be NULL).
@@ -217,7 +217,7 @@ free_skb:
 }
 
 /**
- * batadv_send_skb_push_fill_unicast - extend the buffer and initialize the
+ * batadv_send_skb_push_fill_unicast() - extend the buffer and initialize the
  *  common fields for unicast packets
  * @skb: the skb carrying the unicast header to initialize
  * @hdr_size: amount of bytes to push at the beginning of the skb
@@ -250,7 +250,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
 }
 
 /**
- * batadv_send_skb_prepare_unicast - encapsulate an skb with a unicast header
+ * batadv_send_skb_prepare_unicast() - encapsulate an skb with a unicast header
  * @skb: the skb containing the payload to encapsulate
  * @orig_node: the destination node
  *
@@ -265,7 +265,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
 }
 
 /**
- * batadv_send_skb_prepare_unicast_4addr - encapsulate an skb with a
+ * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a
  *  unicast 4addr header
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the skb containing the payload to encapsulate
@@ -309,7 +309,7 @@ out:
 }
 
 /**
- * batadv_send_skb_unicast - encapsulate and send an skb via unicast
+ * batadv_send_skb_unicast() - encapsulate and send an skb via unicast
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @packet_type: the batman unicast packet type to use
@@ -379,7 +379,7 @@ out:
 }
 
 /**
- * batadv_send_skb_via_tt_generic - send an skb via TT lookup
+ * batadv_send_skb_via_tt_generic() - send an skb via TT lookup
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @packet_type: the batman unicast packet type to use
@@ -426,7 +426,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_send_skb_via_gw - send an skb via gateway lookup
+ * batadv_send_skb_via_gw() - send an skb via gateway lookup
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @vid: the vid to be used to search the translation table
@@ -453,7 +453,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
 }
 
 /**
- * batadv_forw_packet_free - free a forwarding packet
+ * batadv_forw_packet_free() - free a forwarding packet
  * @forw_packet: The packet to free
  * @dropped: whether the packet is freed because is is dropped
  *
@@ -478,7 +478,7 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
 }
 
 /**
- * batadv_forw_packet_alloc - allocate a forwarding packet
+ * batadv_forw_packet_alloc() - allocate a forwarding packet
  * @if_incoming: The (optional) if_incoming to be grabbed
  * @if_outgoing: The (optional) if_outgoing to be grabbed
  * @queue_left: The (optional) queue counter to decrease
@@ -544,7 +544,7 @@ err:
 }
 
 /**
- * batadv_forw_packet_was_stolen - check whether someone stole this packet
+ * batadv_forw_packet_was_stolen() - check whether someone stole this packet
  * @forw_packet: the forwarding packet to check
  *
  * This function checks whether the given forwarding packet was claimed by
@@ -559,7 +559,7 @@ batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet)
 }
 
 /**
- * batadv_forw_packet_steal - claim a forw_packet for free()
+ * batadv_forw_packet_steal() - claim a forw_packet for free()
  * @forw_packet: the forwarding packet to steal
  * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock)
  *
@@ -590,7 +590,7 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
 }
 
 /**
- * batadv_forw_packet_list_steal - claim a list of forward packets for free()
+ * batadv_forw_packet_list_steal() - claim a list of forward packets for free()
  * @forw_list: the to be stolen forward packets
  * @cleanup_list: a backup pointer, to be able to dispose the packet later
  * @hard_iface: the interface to steal forward packets from
@@ -626,7 +626,7 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list,
 }
 
 /**
- * batadv_forw_packet_list_free - free a list of forward packets
+ * batadv_forw_packet_list_free() - free a list of forward packets
  * @head: a list of to be freed forw_packets
  *
  * This function cancels the scheduling of any packet in the provided list,
@@ -650,7 +650,7 @@ static void batadv_forw_packet_list_free(struct hlist_head *head)
 }
 
 /**
- * batadv_forw_packet_queue - try to queue a forwarding packet
+ * batadv_forw_packet_queue() - try to queue a forwarding packet
  * @forw_packet: the forwarding packet to queue
  * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock)
  * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list)
@@ -694,7 +694,7 @@ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
 }
 
 /**
- * batadv_forw_packet_bcast_queue - try to queue a broadcast packet
+ * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet
  * @bat_priv: the bat priv with all the soft interface information
  * @forw_packet: the forwarding packet to queue
  * @send_time: timestamp (jiffies) when the packet is to be sent
@@ -713,7 +713,7 @@ batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_forw_packet_ogmv1_queue - try to queue an OGMv1 packet
+ * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet
  * @bat_priv: the bat priv with all the soft interface information
  * @forw_packet: the forwarding packet to queue
  * @send_time: timestamp (jiffies) when the packet is to be sent
@@ -731,7 +731,7 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends
+ * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: broadcast packet to add
  * @delay: number of jiffies to wait before sending
@@ -791,7 +791,7 @@ err:
 }
 
 /**
- * batadv_forw_packet_bcasts_left - check if a retransmission is necessary
+ * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary
  * @forw_packet: the forwarding packet to check
  * @hard_iface: the interface to check on
  *
@@ -819,7 +819,8 @@ batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet,
 }
 
 /**
- * batadv_forw_packet_bcasts_inc - increment retransmission counter of a packet
+ * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a
+ *  packet
  * @forw_packet: the packet to increase the counter for
  */
 static void
@@ -829,7 +830,7 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet)
 }
 
 /**
- * batadv_forw_packet_is_rebroadcast - check packet for previous transmissions
+ * batadv_forw_packet_is_rebroadcast() - check packet for previous transmissions
  * @forw_packet: the packet to check
  *
  * Return: True if this packet was transmitted before, false otherwise.
@@ -954,7 +955,7 @@ out:
 }
 
 /**
- * batadv_purge_outstanding_packets - stop/purge scheduled bcast/OGMv1 packets
+ * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets
  * @bat_priv: the bat priv with all the soft interface information
  * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
  *
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index eb36820e41bc..8c7399dd06ca 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -77,7 +77,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
 			   unsigned short vid);
 
 /**
- * batadv_send_skb_via_tt - send an skb via TT lookup
+ * batadv_send_skb_via_tt() - send an skb via TT lookup
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the payload to send
  * @dst_hint: can be used to override the destination contained in the skb
@@ -98,7 +98,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_send_skb_via_tt_4addr - send an skb via TT lookup
+ * batadv_send_skb_via_tt_4addr() - send an skb via TT lookup
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the payload to send
  * @packet_subtype: the unicast 4addr packet subtype to use
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index ba8fd06eee7e..9b66e0edc741 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -97,7 +97,7 @@ static int batadv_interface_release(struct net_device *dev)
 }
 
 /**
- * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ * batadv_sum_counter() - Sum the cpu-local counters for index 'idx'
  * @bat_priv: the bat priv with all the soft interface information
  * @idx: index of counter to sum up
  *
@@ -170,7 +170,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
 }
 
 /**
- * batadv_interface_set_rx_mode - set the rx mode of a device
+ * batadv_interface_set_rx_mode() - set the rx mode of a device
  * @dev: registered network device to modify
  *
  * We do not actually need to set any rx filters for the virtual batman
@@ -390,7 +390,7 @@ end:
 }
 
 /**
- * batadv_interface_rx - receive ethernet frame on local batman-adv interface
+ * batadv_interface_rx() - receive ethernet frame on local batman-adv interface
  * @soft_iface: local interface which will receive the ethernet frame
  * @skb: ethernet frame for @soft_iface
  * @hdr_size: size of already parsed batman-adv header
@@ -502,8 +502,8 @@ out:
 }
 
 /**
- * batadv_softif_vlan_release - release vlan from lists and queue for free after
- *  rcu grace period
+ * batadv_softif_vlan_release() - release vlan from lists and queue for free
+ *  after rcu grace period
  * @ref: kref pointer of the vlan object
  */
 static void batadv_softif_vlan_release(struct kref *ref)
@@ -520,7 +520,7 @@ static void batadv_softif_vlan_release(struct kref *ref)
 }
 
 /**
- * batadv_softif_vlan_put - decrease the vlan object refcounter and
+ * batadv_softif_vlan_put() - decrease the vlan object refcounter and
  *  possibly release it
  * @vlan: the vlan object to release
  */
@@ -533,7 +533,7 @@ void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan)
 }
 
 /**
- * batadv_softif_vlan_get - get the vlan object for a specific vid
+ * batadv_softif_vlan_get() - get the vlan object for a specific vid
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the identifier of the vlan object to retrieve
  *
@@ -562,7 +562,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_softif_create_vlan - allocate the needed resources for a new vlan
+ * batadv_softif_create_vlan() - allocate the needed resources for a new vlan
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier
  *
@@ -614,7 +614,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
 }
 
 /**
- * batadv_softif_destroy_vlan - remove and destroy a softif_vlan object
+ * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object
  * @bat_priv: the bat priv with all the soft interface information
  * @vlan: the object to remove
  */
@@ -632,7 +632,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_interface_add_vid - ndo_add_vid API implementation
+ * batadv_interface_add_vid() - ndo_add_vid API implementation
  * @dev: the netdev of the mesh interface
  * @proto: protocol of the the vlan id
  * @vid: identifier of the new vlan
@@ -690,7 +690,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
 }
 
 /**
- * batadv_interface_kill_vid - ndo_kill_vid API implementation
+ * batadv_interface_kill_vid() - ndo_kill_vid API implementation
  * @dev: the netdev of the mesh interface
  * @proto: protocol of the the vlan id
  * @vid: identifier of the deleted vlan
@@ -733,7 +733,7 @@ static struct lock_class_key batadv_netdev_xmit_lock_key;
 static struct lock_class_key batadv_netdev_addr_lock_key;
 
 /**
- * batadv_set_lockdep_class_one - Set lockdep class for a single tx queue
+ * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue
  * @dev: device which owns the tx queue
  * @txq: tx queue to modify
  * @_unused: always NULL
@@ -746,7 +746,7 @@ static void batadv_set_lockdep_class_one(struct net_device *dev,
 }
 
 /**
- * batadv_set_lockdep_class - Set txq and addr_list lockdep class
+ * batadv_set_lockdep_class() - Set txq and addr_list lockdep class
  * @dev: network device to modify
  */
 static void batadv_set_lockdep_class(struct net_device *dev)
@@ -756,7 +756,7 @@ static void batadv_set_lockdep_class(struct net_device *dev)
 }
 
 /**
- * batadv_softif_init_late - late stage initialization of soft interface
+ * batadv_softif_init_late() - late stage initialization of soft interface
  * @dev: registered network device to modify
  *
  * Return: error code on failures
@@ -861,7 +861,7 @@ free_bat_counters:
 }
 
 /**
- * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface
+ * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface
  * @dev: batadv_soft_interface used as master interface
  * @slave_dev: net_device which should become the slave interface
  * @extack: extended ACK report struct
@@ -889,7 +889,7 @@ out:
 }
 
 /**
- * batadv_softif_slave_del - Delete a slave iface from a batadv_soft_interface
+ * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface
  * @dev: batadv_soft_interface used as master interface
  * @slave_dev: net_device which should be removed from the master interface
  *
@@ -1024,7 +1024,7 @@ static const struct ethtool_ops batadv_ethtool_ops = {
 };
 
 /**
- * batadv_softif_free - Deconstructor of batadv_soft_interface
+ * batadv_softif_free() - Deconstructor of batadv_soft_interface
  * @dev: Device to cleanup and remove
  */
 static void batadv_softif_free(struct net_device *dev)
@@ -1040,7 +1040,7 @@ static void batadv_softif_free(struct net_device *dev)
 }
 
 /**
- * batadv_softif_init_early - early stage initialization of soft interface
+ * batadv_softif_init_early() - early stage initialization of soft interface
  * @dev: registered network device to modify
  */
 static void batadv_softif_init_early(struct net_device *dev)
@@ -1090,7 +1090,7 @@ struct net_device *batadv_softif_create(struct net *net, const char *name)
 }
 
 /**
- * batadv_softif_destroy_sysfs - deletion of batadv_soft_interface via sysfs
+ * batadv_softif_destroy_sysfs() - deletion of batadv_soft_interface via sysfs
  * @soft_iface: the to-be-removed batman-adv interface
  */
 void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
@@ -1112,7 +1112,8 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
 }
 
 /**
- * batadv_softif_destroy_netlink - deletion of batadv_soft_interface via netlink
+ * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via
+ *  netlink
  * @soft_iface: the to-be-removed batman-adv interface
  * @head: list pointer
  */
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 227a072dc1d3..8e2b7c7d2358 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -65,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
 }
 
 /**
- * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv
+ * batadv_vlan_kobj_to_batpriv() - convert a vlan kobj to the associated batpriv
  * @obj: kobject to covert
  *
  * Return: the associated batadv_priv struct.
@@ -85,7 +85,7 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
 }
 
 /**
- * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct
+ * batadv_kobj_to_vlan() - convert a kobj to the associated softif_vlan struct
  * @bat_priv: the bat priv with all the soft interface information
  * @obj: kobject to covert
  *
@@ -600,7 +600,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
 }
 
 /**
- * batadv_show_isolation_mark - print the current isolation mark/mask
+ * batadv_show_isolation_mark() - print the current isolation mark/mask
  * @kobj: kobject representing the private mesh sysfs directory
  * @attr: the batman-adv attribute the user is interacting with
  * @buff: the buffer that will contain the data to send back to the user
@@ -618,8 +618,8 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
 }
 
 /**
- * batadv_store_isolation_mark - parse and store the isolation mark/mask entered
- *  by the user
+ * batadv_store_isolation_mark() - parse and store the isolation mark/mask
+ *  entered by the user
  * @kobj: kobject representing the private mesh sysfs directory
  * @attr: the batman-adv attribute the user is interacting with
  * @buff: the buffer containing the user data
@@ -790,7 +790,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev)
 }
 
 /**
- * batadv_sysfs_add_vlan - add all the needed sysfs objects for the new vlan
+ * batadv_sysfs_add_vlan() - add all the needed sysfs objects for the new vlan
  * @dev: netdev of the mesh interface
  * @vlan: private data of the newly added VLAN interface
  *
@@ -851,7 +851,7 @@ out:
 }
 
 /**
- * batadv_sysfs_del_vlan - remove all the sysfs objects for a given VLAN
+ * batadv_sysfs_del_vlan() - remove all the sysfs objects for a given VLAN
  * @bat_priv: the bat priv with all the soft interface information
  * @vlan: the private data of the VLAN to destroy
  */
@@ -896,7 +896,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
 }
 
 /**
- * batadv_store_mesh_iface_finish - store new hardif mesh_iface state
+ * batadv_store_mesh_iface_finish() - store new hardif mesh_iface state
  * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
  * @ifname: name of soft-interface to modify
  *
@@ -949,7 +949,7 @@ out:
 }
 
 /**
- * batadv_store_mesh_iface_work - store new hardif mesh_iface state
+ * batadv_store_mesh_iface_work() - store new hardif mesh_iface state
  * @work: work queue item
  *
  * Changes the parts of the hard+soft interface which can not be modified under
@@ -1045,7 +1045,7 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj,
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
 
 /**
- * batadv_store_throughput_override - parse and store throughput override
+ * batadv_store_throughput_override() - parse and store throughput override
  *  entered by the user
  * @kobj: kobject representing the private mesh sysfs directory
  * @attr: the batman-adv attribute the user is interacting with
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index fe9eb2970ec9..b443b9d28918 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -98,7 +98,7 @@
 static u8 batadv_tp_prerandom[4096] __read_mostly;
 
 /**
- * batadv_tp_session_cookie - generate session cookie based on session ids
+ * batadv_tp_session_cookie() - generate session cookie based on session ids
  * @session: TP session identifier
  * @icmp_uid: icmp pseudo uid of the tp session
  *
@@ -116,7 +116,7 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid)
 }
 
 /**
- * batadv_tp_cwnd - compute the new cwnd size
+ * batadv_tp_cwnd() - compute the new cwnd size
  * @base: base cwnd size value
  * @increment: the value to add to base to get the new size
  * @min: minumim cwnd value (usually MSS)
@@ -141,7 +141,7 @@ static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min)
 }
 
 /**
- * batadv_tp_updated_cwnd - update the Congestion Windows
+ * batadv_tp_update_cwnd() - update the Congestion Windows
  * @tp_vars: the private data of the current TP meter session
  * @mss: maximum segment size of transmission
  *
@@ -177,7 +177,7 @@ static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss)
 }
 
 /**
- * batadv_tp_update_rto - calculate new retransmission timeout
+ * batadv_tp_update_rto() - calculate new retransmission timeout
  * @tp_vars: the private data of the current TP meter session
  * @new_rtt: new roundtrip time in msec
  */
@@ -213,7 +213,7 @@ static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars,
 }
 
 /**
- * batadv_tp_batctl_notify - send client status result to client
+ * batadv_tp_batctl_notify() - send client status result to client
  * @reason: reason for tp meter session stop
  * @dst: destination of tp_meter session
  * @bat_priv: the bat priv with all the soft interface information
@@ -245,7 +245,7 @@ static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason,
 }
 
 /**
- * batadv_tp_batctl_error_notify - send client error result to client
+ * batadv_tp_batctl_error_notify() - send client error result to client
  * @reason: reason for tp meter session stop
  * @dst: destination of tp_meter session
  * @bat_priv: the bat priv with all the soft interface information
@@ -260,7 +260,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
 }
 
 /**
- * batadv_tp_list_find - find a tp_vars object in the global list
+ * batadv_tp_list_find() - find a tp_vars object in the global list
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: the other endpoint MAC address to look for
  *
@@ -295,7 +295,8 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tp_list_find_session - find tp_vars session object in the global list
+ * batadv_tp_list_find_session() - find tp_vars session object in the global
+ *  list
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: the other endpoint MAC address to look for
  * @session: session identifier
@@ -336,7 +337,7 @@ batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst,
 }
 
 /**
- * batadv_tp_vars_release - release batadv_tp_vars from lists and queue for
+ * batadv_tp_vars_release() - release batadv_tp_vars from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the batadv_tp_vars
  */
@@ -361,7 +362,7 @@ static void batadv_tp_vars_release(struct kref *ref)
 }
 
 /**
- * batadv_tp_vars_put - decrement the batadv_tp_vars refcounter and possibly
+ * batadv_tp_vars_put() - decrement the batadv_tp_vars refcounter and possibly
  *  release it
  * @tp_vars: the private data of the current TP meter session to be free'd
  */
@@ -371,7 +372,7 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars)
 }
 
 /**
- * batadv_tp_sender_cleanup - cleanup sender data and drop and timer
+ * batadv_tp_sender_cleanup() - cleanup sender data and drop the timer
  * @bat_priv: the bat priv with all the soft interface information
  * @tp_vars: the private data of the current TP meter session to cleanup
  */
@@ -401,7 +402,7 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tp_sender_end - print info about ended session and inform client
+ * batadv_tp_sender_end() - print info about ended session and inform client
  * @bat_priv: the bat priv with all the soft interface information
  * @tp_vars: the private data of the current TP meter session
  */
@@ -434,7 +435,7 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tp_sender_shutdown - let sender thread/timer stop gracefully
+ * batadv_tp_sender_shutdown() - let sender thread/timer stop gracefully
  * @tp_vars: the private data of the current TP meter session
  * @reason: reason for tp meter session stop
  */
@@ -448,7 +449,7 @@ static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars,
 }
 
 /**
- * batadv_tp_sender_finish - stop sender session after test_length was reached
+ * batadv_tp_sender_finish() - stop sender session after test_length was reached
  * @work: delayed work reference of the related tp_vars
  */
 static void batadv_tp_sender_finish(struct work_struct *work)
@@ -464,7 +465,7 @@ static void batadv_tp_sender_finish(struct work_struct *work)
 }
 
 /**
- * batadv_tp_reset_sender_timer - reschedule the sender timer
+ * batadv_tp_reset_sender_timer() - reschedule the sender timer
  * @tp_vars: the private TP meter data for this session
  *
  * Reschedule the timer using tp_vars->rto as delay
@@ -482,7 +483,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
 }
 
 /**
- * batadv_tp_sender_timeout - timer that fires in case of packet loss
+ * batadv_tp_sender_timeout() - timer that fires in case of packet loss
  * @arg: address of the related tp_vars
  *
  * If fired it means that there was packet loss.
@@ -532,7 +533,7 @@ static void batadv_tp_sender_timeout(struct timer_list *t)
 }
 
 /**
- * batadv_tp_fill_prerandom - Fill buffer with prefetched random bytes
+ * batadv_tp_fill_prerandom() - Fill buffer with prefetched random bytes
  * @tp_vars: the private TP meter data for this session
  * @buf: Buffer to fill with bytes
  * @nbytes: amount of pseudorandom bytes
@@ -564,7 +565,7 @@ static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars,
 }
 
 /**
- * batadv_tp_send_msg - send a single message
+ * batadv_tp_send_msg() - send a single message
  * @tp_vars: the private TP meter data for this session
  * @src: source mac address
  * @orig_node: the originator of the destination
@@ -624,7 +625,7 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
 }
 
 /**
- * batadv_tp_recv_ack - ACK receiving function
+ * batadv_tp_recv_ack() - ACK receiving function
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the buffer containing the received packet
  *
@@ -766,7 +767,7 @@ out:
 }
 
 /**
- * batadv_tp_avail - check if congestion window is not full
+ * batadv_tp_avail() - check if congestion window is not full
  * @tp_vars: the private data of the current TP meter session
  * @payload_len: size of the payload of a single message
  *
@@ -784,7 +785,7 @@ static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars,
 }
 
 /**
- * batadv_tp_wait_available - wait until congestion window becomes free or
+ * batadv_tp_wait_available() - wait until congestion window becomes free or
  *  timeout is reached
  * @tp_vars: the private data of the current TP meter session
  * @plen: size of the payload of a single message
@@ -806,7 +807,7 @@ static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen)
 }
 
 /**
- * batadv_tp_send - main sending thread of a tp meter session
+ * batadv_tp_send() - main sending thread of a tp meter session
  * @arg: address of the related tp_vars
  *
  * Return: nothing, this function never returns
@@ -905,7 +906,8 @@ out:
 }
 
 /**
- * batadv_tp_start_kthread - start new thread which manages the tp meter sender
+ * batadv_tp_start_kthread() - start new thread which manages the tp meter
+ *  sender
  * @tp_vars: the private data of the current TP meter session
  */
 static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
@@ -936,7 +938,7 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
 }
 
 /**
- * batadv_tp_start - start a new tp meter session
+ * batadv_tp_start() - start a new tp meter session
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: the receiver MAC address
  * @test_length: test length in milliseconds
@@ -1061,7 +1063,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
 }
 
 /**
- * batadv_tp_stop - stop currently running tp meter session
+ * batadv_tp_stop() - stop currently running tp meter session
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: the receiver MAC address
  * @return_value: reason for tp meter session stop
@@ -1093,7 +1095,7 @@ out:
 }
 
 /**
- * batadv_tp_reset_receiver_timer - reset the receiver shutdown timer
+ * batadv_tp_reset_receiver_timer() - reset the receiver shutdown timer
  * @tp_vars: the private data of the current TP meter session
  *
  * start the receiver shutdown timer or reset it if already started
@@ -1105,7 +1107,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
 }
 
 /**
- * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is
+ * batadv_tp_receiver_shutdown() - stop a tp meter receiver when timeout is
  *  reached without received ack
  * @arg: address of the related tp_vars
  */
@@ -1150,7 +1152,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
 }
 
 /**
- * batadv_tp_send_ack - send an ACK packet
+ * batadv_tp_send_ack() - send an ACK packet
  * @bat_priv: the bat priv with all the soft interface information
  * @dst: the mac address of the destination originator
  * @seq: the sequence number to ACK
@@ -1222,7 +1224,7 @@ out:
 }
 
 /**
- * batadv_tp_handle_out_of_order - store an out of order packet
+ * batadv_tp_handle_out_of_order() - store an out of order packet
  * @tp_vars: the private data of the current TP meter session
  * @skb: the buffer containing the received packet
  *
@@ -1298,7 +1300,7 @@ out:
 }
 
 /**
- * batadv_tp_ack_unordered - update number received bytes in current stream
+ * batadv_tp_ack_unordered() - update number received bytes in current stream
  *  without gaps
  * @tp_vars: the private data of the current TP meter session
  */
@@ -1331,7 +1333,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars)
 }
 
 /**
- * batadv_tp_init_recv - return matching or create new receiver tp_vars
+ * batadv_tp_init_recv() - return matching or create new receiver tp_vars
  * @bat_priv: the bat priv with all the soft interface information
  * @icmp: received icmp tp msg
  *
@@ -1384,7 +1386,7 @@ out_unlock:
 }
 
 /**
- * batadv_tp_recv_msg - process a single data message
+ * batadv_tp_recv_msg() - process a single data message
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the buffer containing the received packet
  *
@@ -1469,7 +1471,7 @@ out:
 }
 
 /**
- * batadv_tp_meter_recv - main TP Meter receiving function
+ * batadv_tp_meter_recv() - main TP Meter receiving function
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the buffer containing the received packet
  */
@@ -1495,7 +1497,7 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
 }
 
 /**
- * batadv_tp_meter_init - initialize global tp_meter structures
+ * batadv_tp_meter_init() - initialize global tp_meter structures
  */
 void __init batadv_tp_meter_init(void)
 {
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index b4b20ad1ed9a..8b583d3e86e6 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -88,7 +88,7 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
 				 bool roaming);
 
 /**
- * batadv_compare_tt - check if two TT entries are the same
+ * batadv_compare_tt() - check if two TT entries are the same
  * @node: the list element pointer of the first TT entry
  * @data2: pointer to the tt_common_entry of the second TT entry
  *
@@ -107,7 +107,7 @@ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2)
 }
 
 /**
- * batadv_choose_tt - return the index of the tt entry in the hash table
+ * batadv_choose_tt() - return the index of the tt entry in the hash table
  * @data: pointer to the tt_common_entry object to map
  * @size: the size of the hash table
  *
@@ -127,7 +127,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size)
 }
 
 /**
- * batadv_tt_hash_find - look for a client in the given hash table
+ * batadv_tt_hash_find() - look for a client in the given hash table
  * @hash: the hash table to search
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
@@ -172,7 +172,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
 }
 
 /**
- * batadv_tt_local_hash_find - search the local table for a given client
+ * batadv_tt_local_hash_find() - search the local table for a given client
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
@@ -197,7 +197,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
 }
 
 /**
- * batadv_tt_global_hash_find - search the global table for a given client
+ * batadv_tt_global_hash_find() - search the global table for a given client
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
@@ -222,7 +222,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
 }
 
 /**
- * batadv_tt_local_entry_free_rcu - free the tt_local_entry
+ * batadv_tt_local_entry_free_rcu() - free the tt_local_entry
  * @rcu: rcu pointer of the tt_local_entry
  */
 static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu)
@@ -236,7 +236,7 @@ static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu)
 }
 
 /**
- * batadv_tt_local_entry_release - release tt_local_entry from lists and queue
+ * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue
  *  for free after rcu grace period
  * @ref: kref pointer of the nc_node
  */
@@ -253,7 +253,7 @@ static void batadv_tt_local_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and
+ * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and
  *  possibly release it
  * @tt_local_entry: tt_local_entry to be free'd
  */
@@ -265,7 +265,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
 }
 
 /**
- * batadv_tt_global_entry_free_rcu - free the tt_global_entry
+ * batadv_tt_global_entry_free_rcu() - free the tt_global_entry
  * @rcu: rcu pointer of the tt_global_entry
  */
 static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu)
@@ -279,8 +279,8 @@ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu)
 }
 
 /**
- * batadv_tt_global_entry_release - release tt_global_entry from lists and queue
- *  for free after rcu grace period
+ * batadv_tt_global_entry_release() - release tt_global_entry from lists and
+ *  queue for free after rcu grace period
  * @ref: kref pointer of the nc_node
  */
 static void batadv_tt_global_entry_release(struct kref *ref)
@@ -296,7 +296,7 @@ static void batadv_tt_global_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and
+ * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and
  *  possibly release it
  * @tt_global_entry: tt_global_entry to be free'd
  */
@@ -308,7 +308,7 @@ batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
 }
 
 /**
- * batadv_tt_global_hash_count - count the number of orig entries
+ * batadv_tt_global_hash_count() - count the number of orig entries
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to count entries for
  * @vid: VLAN identifier
@@ -333,8 +333,8 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_size_mod - change the size by v of the local table identified
- *  by vid
+ * batadv_tt_local_size_mod() - change the size by v of the local table
+ *  identified by vid
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier of the sub-table to change
  * @v: the amount to sum to the local table size
@@ -354,8 +354,8 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_size_inc - increase by one the local table size for the given
- *  vid
+ * batadv_tt_local_size_inc() - increase by one the local table size for the
+ *  given vid
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier
  */
@@ -366,8 +366,8 @@ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_size_dec - decrease by one the local table size for the given
- *  vid
+ * batadv_tt_local_size_dec() - decrease by one the local table size for the
+ *  given vid
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier
  */
@@ -378,7 +378,7 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_global_size_mod - change the size by v of the global table
+ * batadv_tt_global_size_mod() - change the size by v of the global table
  *  for orig_node identified by vid
  * @orig_node: the originator for which the table has to be modified
  * @vid: the VLAN identifier
@@ -406,7 +406,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_tt_global_size_inc - increase by one the global table size for the
+ * batadv_tt_global_size_inc() - increase by one the global table size for the
  *  given vid
  * @orig_node: the originator which global table size has to be decreased
  * @vid: the vlan identifier
@@ -418,7 +418,7 @@ static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_tt_global_size_dec - decrease by one the global table size for the
+ * batadv_tt_global_size_dec() - decrease by one the global table size for the
  *  given vid
  * @orig_node: the originator which global table size has to be decreased
  * @vid: the vlan identifier
@@ -430,7 +430,7 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_tt_orig_list_entry_free_rcu - free the orig_entry
+ * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry
  * @rcu: rcu pointer of the orig_entry
  */
 static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
@@ -443,7 +443,7 @@ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
 }
 
 /**
- * batadv_tt_orig_list_entry_release - release tt orig entry from lists and
+ * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and
  *  queue for free after rcu grace period
  * @ref: kref pointer of the tt orig entry
  */
@@ -459,7 +459,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and
+ * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and
  *  possibly release it
  * @orig_entry: tt orig entry to be free'd
  */
@@ -470,7 +470,7 @@ batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry)
 }
 
 /**
- * batadv_tt_local_event - store a local TT event (ADD/DEL)
+ * batadv_tt_local_event() - store a local TT event (ADD/DEL)
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_local_entry: the TT entry involved in the event
  * @event_flags: flags to store in the event structure
@@ -545,7 +545,7 @@ unlock:
 }
 
 /**
- * batadv_tt_len - compute length in bytes of given number of tt changes
+ * batadv_tt_len() - compute length in bytes of given number of tt changes
  * @changes_num: number of tt changes
  *
  * Return: computed length in bytes.
@@ -556,7 +556,7 @@ static int batadv_tt_len(int changes_num)
 }
 
 /**
- * batadv_tt_entries - compute the number of entries fitting in tt_len bytes
+ * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes
  * @tt_len: available space
  *
  * Return: the number of entries.
@@ -567,8 +567,8 @@ static u16 batadv_tt_entries(u16 tt_len)
 }
 
 /**
- * batadv_tt_local_table_transmit_size - calculates the local translation table
- *  size when transmitted over the air
+ * batadv_tt_local_table_transmit_size() - calculates the local translation
+ *  table size when transmitted over the air
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: local translation table size in bytes.
@@ -627,7 +627,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_add - add a new client to the local table or update an
+ * batadv_tt_local_add() - add a new client to the local table or update an
  *  existing client
  * @soft_iface: netdev struct of the mesh interface
  * @addr: the mac address of the client to add
@@ -832,7 +832,7 @@ out:
 }
 
 /**
- * batadv_tt_prepare_tvlv_global_data - prepare the TVLV TT header to send
+ * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send
  *  within a TT Response directed to another node
  * @orig_node: originator for which the TT data has to be prepared
  * @tt_data: uninitialised pointer to the address of the TVLV buffer
@@ -905,8 +905,8 @@ out:
 }
 
 /**
- * batadv_tt_prepare_tvlv_local_data - allocate and prepare the TT TVLV for this
- *  node
+ * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for
+ *  this node
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_data: uninitialised pointer to the address of the TVLV buffer
  * @tt_change: uninitialised pointer to the address of the area where the TT
@@ -979,8 +979,8 @@ out:
 }
 
 /**
- * batadv_tt_tvlv_container_update - update the translation table tvlv container
- *  after local tt changes have been committed
+ * batadv_tt_tvlv_container_update() - update the translation table tvlv
+ *  container after local tt changes have been committed
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
@@ -1125,7 +1125,7 @@ out:
 #endif
 
 /**
- * batadv_tt_local_dump_entry - Dump one TT local entry into a message
+ * batadv_tt_local_dump_entry() - Dump one TT local entry into a message
  * @msg :Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -1181,7 +1181,7 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message
+ * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -1218,7 +1218,7 @@ batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_tt_local_dump - Dump TT local entries into a message
+ * batadv_tt_local_dump() - Dump TT local entries into a message
  * @msg: Netlink message to dump into
  * @cb: Parameters from query
  *
@@ -1302,7 +1302,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_remove - logically remove an entry from the local table
+ * batadv_tt_local_remove() - logically remove an entry from the local table
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the MAC address of the client to remove
  * @vid: VLAN identifier
@@ -1364,7 +1364,7 @@ out:
 }
 
 /**
- * batadv_tt_local_purge_list - purge inactive tt local entries
+ * batadv_tt_local_purge_list() - purge inactive tt local entries
  * @bat_priv: the bat priv with all the soft interface information
  * @head: pointer to the list containing the local tt entries
  * @timeout: parameter deciding whether a given tt local entry is considered
@@ -1399,7 +1399,7 @@ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_purge - purge inactive tt local entries
+ * batadv_tt_local_purge() - purge inactive tt local entries
  * @bat_priv: the bat priv with all the soft interface information
  * @timeout: parameter deciding whether a given tt local entry is considered
  *  inactive or not
@@ -1492,7 +1492,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_global_orig_entry_find - find a TT orig_list_entry
+ * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry
  * @entry: the TT global entry where the orig_list_entry has to be
  *  extracted from
  * @orig_node: the originator for which the orig_list_entry has to be found
@@ -1526,8 +1526,8 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
 }
 
 /**
- * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled
- *  by a given originator
+ * batadv_tt_global_entry_has_orig() - check if a TT global entry is also
+ *  handled by a given originator
  * @entry: the TT global entry to check
  * @orig_node: the originator to search in the list
  *
@@ -1552,7 +1552,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
 }
 
 /**
- * batadv_tt_global_sync_flags - update TT sync flags
+ * batadv_tt_global_sync_flags() - update TT sync flags
  * @tt_global: the TT global entry to update sync flags in
  *
  * Updates the sync flag bits in the tt_global flag attribute with a logical
@@ -1576,7 +1576,7 @@ batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global)
 }
 
 /**
- * batadv_tt_global_orig_entry_add - add or update a TT orig entry
+ * batadv_tt_global_orig_entry_add() - add or update a TT orig entry
  * @tt_global: the TT global entry to add an orig entry in
  * @orig_node: the originator to add an orig entry for
  * @ttvn: translation table version number of this changeset
@@ -1626,7 +1626,7 @@ out:
 }
 
 /**
- * batadv_tt_global_add - add a new TT global entry or update an existing one
+ * batadv_tt_global_add() - add a new TT global entry or update an existing one
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the originator announcing the client
  * @tt_addr: the mac address of the non-mesh client
@@ -1798,7 +1798,7 @@ out:
 }
 
 /**
- * batadv_transtable_best_orig - Get best originator list entry from tt entry
+ * batadv_transtable_best_orig() - Get best originator list entry from tt entry
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_global_entry: global translation table entry to be analyzed
  *
@@ -1844,8 +1844,8 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
 /**
- * batadv_tt_global_print_entry - print all orig nodes who announce the address
- *  for this global entry
+ * batadv_tt_global_print_entry() - print all orig nodes who announce the
+ *  address for this global entry
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_global_entry: global translation table entry to be printed
  * @seq: debugfs table seq_file struct
@@ -1969,7 +1969,7 @@ out:
 #endif
 
 /**
- * batadv_tt_global_dump_subentry - Dump all TT local entries into a message
+ * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2030,7 +2030,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_tt_global_dump_entry - Dump one TT global entry into a message
+ * batadv_tt_global_dump_entry() - Dump one TT global entry into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2075,7 +2075,7 @@ batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_tt_global_dump_bucket - Dump one TT local bucket into a message
+ * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message
  * @msg: Netlink message to dump into
  * @portid: Port making netlink request
  * @seq: Sequence number of netlink message
@@ -2114,7 +2114,7 @@ batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
 }
 
 /**
- * batadv_tt_global_dump -  Dump TT global entries into a message
+ * batadv_tt_global_dump() -  Dump TT global entries into a message
  * @msg: Netlink message to dump into
  * @cb: Parameters from query
  *
@@ -2182,7 +2182,7 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
 }
 
 /**
- * _batadv_tt_global_del_orig_entry - remove and free an orig_entry
+ * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry
  * @tt_global_entry: the global entry to remove the orig_entry from
  * @orig_entry: the orig entry to remove and free
  *
@@ -2224,7 +2224,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
 }
 
 /**
- * batadv_tt_global_del_orig_node - remove orig_node from a global tt entry
+ * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_global_entry: the global entry to remove the orig_node from
  * @orig_node: the originator announcing the client
@@ -2303,7 +2303,7 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_global_del - remove a client from the global table
+ * batadv_tt_global_del() - remove a client from the global table
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: an originator serving this client
  * @addr: the mac address of the client
@@ -2369,8 +2369,8 @@ out:
 }
 
 /**
- * batadv_tt_global_del_orig - remove all the TT global entries belonging to the
- *  given originator matching the provided vid
+ * batadv_tt_global_del_orig() - remove all the TT global entries belonging to
+ *  the given originator matching the provided vid
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the originator owning the entries to remove
  * @match_vid: the VLAN identifier to match. If negative all the entries will be
@@ -2541,7 +2541,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
 }
 
 /**
- * batadv_transtable_search - get the mesh destination for a given client
+ * batadv_transtable_search() - get the mesh destination for a given client
  * @bat_priv: the bat priv with all the soft interface information
  * @src: mac address of the source client
  * @addr: mac address of the destination client
@@ -2601,7 +2601,7 @@ out:
 }
 
 /**
- * batadv_tt_global_crc - calculates the checksum of the local table belonging
+ * batadv_tt_global_crc() - calculates the checksum of the local table belonging
  *  to the given orig_node
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: originator for which the CRC should be computed
@@ -2696,7 +2696,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_crc - calculates the checksum of the local table
+ * batadv_tt_local_crc() - calculates the checksum of the local table
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: VLAN identifier for which the CRC32 has to be computed
  *
@@ -2753,7 +2753,7 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_req_node_release - free tt_req node entry
+ * batadv_tt_req_node_release() - free tt_req node entry
  * @ref: kref pointer of the tt req_node entry
  */
 static void batadv_tt_req_node_release(struct kref *ref)
@@ -2766,7 +2766,7 @@ static void batadv_tt_req_node_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_req_node_put - decrement the tt_req_node refcounter and
+ * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and
  *  possibly release it
  * @tt_req_node: tt_req_node to be free'd
  */
@@ -2828,7 +2828,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_req_node_new - search and possibly create a tt_req_node object
+ * batadv_tt_req_node_new() - search and possibly create a tt_req_node object
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node this request is being issued for
  *
@@ -2865,7 +2865,7 @@ unlock:
 }
 
 /**
- * batadv_tt_local_valid - verify that given tt entry is a valid one
+ * batadv_tt_local_valid() - verify that given tt entry is a valid one
  * @entry_ptr: to be checked local tt entry
  * @data_ptr: not used but definition required to satisfy the callback prototype
  *
@@ -2899,7 +2899,7 @@ static bool batadv_tt_global_valid(const void *entry_ptr,
 }
 
 /**
- * batadv_tt_tvlv_generate - fill the tvlv buff with the tt entries from the
+ * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the
  *  specified tt hash
  * @bat_priv: the bat priv with all the soft interface information
  * @hash: hash table containing the tt entries
@@ -2950,7 +2950,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_global_check_crc - check if all the CRCs are correct
+ * batadv_tt_global_check_crc() - check if all the CRCs are correct
  * @orig_node: originator for which the CRCs have to be checked
  * @tt_vlan: pointer to the first tvlv VLAN entry
  * @num_vlan: number of tvlv VLAN entries
@@ -3007,7 +3007,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
 }
 
 /**
- * batadv_tt_local_update_crc - update all the local CRCs
+ * batadv_tt_local_update_crc() - update all the local CRCs
  * @bat_priv: the bat priv with all the soft interface information
  */
 static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
@@ -3023,7 +3023,7 @@ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_global_update_crc - update all the global CRCs for this orig_node
+ * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the orig_node for which the CRCs have to be updated
  */
@@ -3050,7 +3050,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_send_tt_request - send a TT Request message to a given node
+ * batadv_send_tt_request() - send a TT Request message to a given node
  * @bat_priv: the bat priv with all the soft interface information
  * @dst_orig_node: the destination of the message
  * @ttvn: the version number that the source of the message is looking for
@@ -3139,7 +3139,7 @@ out:
 }
 
 /**
- * batadv_send_other_tt_response - send reply to tt request concerning another
+ * batadv_send_other_tt_response() - send reply to tt request concerning another
  *  node's translation table
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_data: tt data containing the tt request information
@@ -3272,8 +3272,8 @@ out:
 }
 
 /**
- * batadv_send_my_tt_response - send reply to tt request concerning this node's
- *  translation table
+ * batadv_send_my_tt_response() - send reply to tt request concerning this
+ *  node's translation table
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_data: tt data containing the tt request information
  * @req_src: mac address of tt request sender
@@ -3390,7 +3390,7 @@ out:
 }
 
 /**
- * batadv_send_tt_response - send reply to tt request
+ * batadv_send_tt_response() - send reply to tt request
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_data: tt data containing the tt request information
  * @req_src: mac address of tt request sender
@@ -3486,7 +3486,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_is_my_client - check if a client is served by the local node
+ * batadv_is_my_client() - check if a client is served by the local node
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to check
  * @vid: VLAN identifier
@@ -3516,7 +3516,7 @@ out:
 }
 
 /**
- * batadv_handle_tt_response - process incoming tt reply
+ * batadv_handle_tt_response() - process incoming tt reply
  * @bat_priv: the bat priv with all the soft interface information
  * @tt_data: tt data containing the tt request information
  * @resp_src: mac address of tt reply sender
@@ -3609,7 +3609,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_check_roam_count - check if a client has roamed too frequently
+ * batadv_tt_check_roam_count() - check if a client has roamed too frequently
  * @bat_priv: the bat priv with all the soft interface information
  * @client: mac address of the roaming client
  *
@@ -3664,7 +3664,7 @@ unlock:
 }
 
 /**
- * batadv_send_roam_adv - send a roaming advertisement message
+ * batadv_send_roam_adv() - send a roaming advertisement message
  * @bat_priv: the bat priv with all the soft interface information
  * @client: mac address of the roaming client
  * @vid: VLAN identifier
@@ -3746,7 +3746,7 @@ void batadv_tt_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_local_set_flags - set or unset the specified flags on the local
+ * batadv_tt_local_set_flags() - set or unset the specified flags on the local
  *  table and possibly count them in the TT size
  * @bat_priv: the bat priv with all the soft interface information
  * @flags: the flag to switch
@@ -3832,7 +3832,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_local_commit_changes_nolock - commit all pending local tt changes
+ * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes
  *  which have been queued in the time since the last commit
  * @bat_priv: the bat priv with all the soft interface information
  *
@@ -3865,7 +3865,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_local_commit_changes - commit all pending local tt changes which
+ * batadv_tt_local_commit_changes() - commit all pending local tt changes which
  *  have been queued in the time since the last commit
  * @bat_priv: the bat priv with all the soft interface information
  */
@@ -3911,7 +3911,7 @@ vlan_put:
 }
 
 /**
- * batadv_tt_update_orig - update global translation table with new tt
+ * batadv_tt_update_orig() - update global translation table with new tt
  *  information received via ogms
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: the orig_node of the ogm
@@ -3996,7 +3996,7 @@ request_table:
 }
 
 /**
- * batadv_tt_global_client_is_roaming - check if a client is marked as roaming
+ * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to check
  * @vid: VLAN identifier
@@ -4022,7 +4022,7 @@ out:
 }
 
 /**
- * batadv_tt_local_client_is_roaming - tells whether the client is roaming
+ * batadv_tt_local_client_is_roaming() - tells whether the client is roaming
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the local client to query
  * @vid: VLAN identifier
@@ -4071,7 +4071,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_local_resize_to_mtu - resize the local translation table fit the
+ * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the
  *  maximum packet size that can be transported through the mesh
  * @soft_iface: netdev struct of the mesh interface
  *
@@ -4112,7 +4112,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
 }
 
 /**
- * batadv_tt_tvlv_ogm_handler_v1 - process incoming tt tvlv container
+ * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the orig_node of the ogm
  * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -4151,7 +4151,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_tvlv_unicast_handler_v1 - process incoming (unicast) tt tvlv
+ * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv
  *  container
  * @bat_priv: the bat priv with all the soft interface information
  * @src: mac address of tt tvlv sender
@@ -4233,7 +4233,8 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_roam_tvlv_unicast_handler_v1 - process incoming tt roam tvlv container
+ * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv
+ *  container
  * @bat_priv: the bat priv with all the soft interface information
  * @src: mac address of tt tvlv sender
  * @dst: mac address of tt tvlv recipient
@@ -4283,7 +4284,7 @@ out:
 }
 
 /**
- * batadv_tt_init - initialise the translation table internals
+ * batadv_tt_init() - initialise the translation table internals
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success or negative error number in case of failure.
@@ -4319,7 +4320,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_global_is_isolated - check if a client is marked as isolated
+ * batadv_tt_global_is_isolated() - check if a client is marked as isolated
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client
  * @vid: the identifier of the VLAN where this client is connected
@@ -4345,7 +4346,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_cache_init - Initialize tt memory object cache
+ * batadv_tt_cache_init() - Initialize tt memory object cache
  *
  * Return: 0 on success or negative error number in case of failure.
  */
@@ -4414,7 +4415,7 @@ err_tt_tl_destroy:
 }
 
 /**
- * batadv_tt_cache_destroy - Destroy tt memory object cache
+ * batadv_tt_cache_destroy() - Destroy tt memory object cache
  */
 void batadv_tt_cache_destroy(void)
 {
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index d956c2a0e9cb..e189f026974c 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -43,7 +43,7 @@
 #include "tvlv.h"
 
 /**
- * batadv_tvlv_handler_release - release tvlv handler from lists and queue for
+ * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for
  *  free after rcu grace period
  * @ref: kref pointer of the tvlv
  */
@@ -56,7 +56,7 @@ static void batadv_tvlv_handler_release(struct kref *ref)
 }
 
 /**
- * batadv_tvlv_handler_put - decrement the tvlv container refcounter and
+ * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and
  *  possibly release it
  * @tvlv_handler: the tvlv handler to free
  */
@@ -66,7 +66,7 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
 }
 
 /**
- * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list
+ * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list
  *  based on the provided type and version (both need to match)
  * @bat_priv: the bat priv with all the soft interface information
  * @type: tvlv handler type to look for
@@ -100,7 +100,7 @@ batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
 }
 
 /**
- * batadv_tvlv_container_release - release tvlv from lists and free
+ * batadv_tvlv_container_release() - release tvlv from lists and free
  * @ref: kref pointer of the tvlv
  */
 static void batadv_tvlv_container_release(struct kref *ref)
@@ -112,7 +112,7 @@ static void batadv_tvlv_container_release(struct kref *ref)
 }
 
 /**
- * batadv_tvlv_container_put - decrement the tvlv container refcounter and
+ * batadv_tvlv_container_put() - decrement the tvlv container refcounter and
  *  possibly release it
  * @tvlv: the tvlv container to free
  */
@@ -122,7 +122,7 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
 }
 
 /**
- * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container
+ * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container
  *  list based on the provided type and version (both need to match)
  * @bat_priv: the bat priv with all the soft interface information
  * @type: tvlv container type to look for
@@ -156,7 +156,7 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
 }
 
 /**
- * batadv_tvlv_container_list_size - calculate the size of the tvlv container
+ * batadv_tvlv_container_list_size() - calculate the size of the tvlv container
  *  list entries
  * @bat_priv: the bat priv with all the soft interface information
  *
@@ -181,8 +181,8 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tvlv_container_remove - remove tvlv container from the tvlv container
- *  list
+ * batadv_tvlv_container_remove() - remove tvlv container from the tvlv
+ *  container list
  * @bat_priv: the bat priv with all the soft interface information
  * @tvlv: the to be removed tvlv container
  *
@@ -205,7 +205,7 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_container_unregister - unregister tvlv container based on the
+ * batadv_tvlv_container_unregister() - unregister tvlv container based on the
  *  provided type and version (both need to match)
  * @bat_priv: the bat priv with all the soft interface information
  * @type: tvlv container type to unregister
@@ -223,7 +223,7 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_container_register - register tvlv type, version and content
+ * batadv_tvlv_container_register() - register tvlv type, version and content
  *  to be propagated with each (primary interface) OGM
  * @bat_priv: the bat priv with all the soft interface information
  * @type: tvlv container type
@@ -268,7 +268,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate
+ * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate
  *  requested packet size
  * @packet_buff: packet buffer
  * @packet_buff_len: packet buffer size
@@ -301,7 +301,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
 }
 
 /**
- * batadv_tvlv_container_ogm_append - append tvlv container content to given
+ * batadv_tvlv_container_ogm_append() - append tvlv container content to given
  *  OGM packet buffer
  * @bat_priv: the bat priv with all the soft interface information
  * @packet_buff: ogm packet buffer
@@ -354,7 +354,7 @@ end:
 }
 
 /**
- * batadv_tvlv_call_handler - parse the given tvlv buffer to call the
+ * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the
  *  appropriate handlers
  * @bat_priv: the bat priv with all the soft interface information
  * @tvlv_handler: tvlv callback function handling the tvlv content
@@ -408,7 +408,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_containers_process - parse the given tvlv buffer to call the
+ * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the
  *  appropriate handlers
  * @bat_priv: the bat priv with all the soft interface information
  * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
@@ -475,7 +475,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate
+ * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate
  *  handlers
  * @bat_priv: the bat priv with all the soft interface information
  * @batadv_ogm_packet: ogm packet containing the tvlv containers
@@ -502,7 +502,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_handler_register - register tvlv handler based on the provided
+ * batadv_tvlv_handler_register() - register tvlv handler based on the provided
  *  type and version (both need to match) for ogm tvlv payload and/or unicast
  *  payload
  * @bat_priv: the bat priv with all the soft interface information
@@ -557,7 +557,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_handler_unregister - unregister tvlv handler based on the
+ * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the
  *  provided type and version (both need to match)
  * @bat_priv: the bat priv with all the soft interface information
  * @type: tvlv handler type to be unregistered
@@ -580,7 +580,7 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the
+ * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the
  *  specified host
  * @bat_priv: the bat priv with all the soft interface information
  * @src: source mac address of the unicast packet
-- 
cgit v1.2.3


From 8b84cc4fb556b24fcd2c9529ad4b13556258f668 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:48 +0100
Subject: batman-adv: Use inline kernel-doc for enum/struct

The inline kernel-doc comments make it easier to keep changes to a
struct/enum synchronized with its documentation. They also make it
easier to read the documentation inside the code for larger structures
like struct batadv_priv.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_iv_ogm.c     |   17 +-
 net/batman-adv/gateway_common.h |    5 +-
 net/batman-adv/hard-interface.h |   27 +-
 net/batman-adv/log.h            |   26 +-
 net/batman-adv/multicast.h      |   16 +-
 net/batman-adv/types.h          | 1989 +++++++++++++++++++++++++++------------
 6 files changed, 1444 insertions(+), 636 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 1fc67aa8d7df..0973e8c5a063 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -73,16 +73,23 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work);
 
 /**
  * enum batadv_dup_status - duplicate status
- * @BATADV_NO_DUP: the packet is no duplicate
- * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the
- *  neighbor)
- * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor
- * @BATADV_PROTECTED: originator is currently protected (after reboot)
  */
 enum batadv_dup_status {
+	/** @BATADV_NO_DUP: the packet is no duplicate */
 	BATADV_NO_DUP = 0,
+
+	/**
+	 * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for
+	 *  the neighbor)
+	 */
 	BATADV_ORIG_DUP,
+
+	/** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */
 	BATADV_NEIGH_DUP,
+
+	/**
+	 * @BATADV_PROTECTED: originator is currently protected (after reboot)
+	 */
 	BATADV_PROTECTED,
 };
 
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 7c298b05c1dc..afebd9c7edf4 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -33,11 +33,12 @@ enum batadv_gw_modes {
 
 /**
  * enum batadv_bandwidth_units - bandwidth unit types
- * @BATADV_BW_UNIT_KBIT: unit type kbit
- * @BATADV_BW_UNIT_MBIT: unit type mbit
  */
 enum batadv_bandwidth_units {
+	/** @BATADV_BW_UNIT_KBIT: unit type kbit */
 	BATADV_BW_UNIT_KBIT,
+
+	/** @BATADV_BW_UNIT_MBIT: unit type mbit */
 	BATADV_BW_UNIT_MBIT,
 };
 
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index a7f9036f0e3a..fb7a5d6b5ce3 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -42,25 +42,40 @@ enum batadv_hard_if_state {
 
 /**
  * enum batadv_hard_if_bcast - broadcast avoidance options
- * @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface
- * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no recipient
- * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it from
- * @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator
  */
 enum batadv_hard_if_bcast {
+	/** @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface */
 	BATADV_HARDIF_BCAST_OK = 0,
+
+	/**
+	 * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no
+	 *  recipient
+	 */
 	BATADV_HARDIF_BCAST_NORECIPIENT,
+
+	/**
+	 * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it
+	 *  from
+	 */
 	BATADV_HARDIF_BCAST_DUPFWD,
+
+	/** @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */
 	BATADV_HARDIF_BCAST_DUPORIG,
 };
 
 /**
  * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
- * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
- * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
  */
 enum batadv_hard_if_cleanup {
+	/**
+	 * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
+	 */
 	BATADV_IF_CLEANUP_KEEP,
+
+	/**
+	 * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was
+	 *  removed
+	 */
 	BATADV_IF_CLEANUP_AUTO,
 };
 
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 6744a64143c0..dd22e17b84b4 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -45,25 +45,33 @@ static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
 
 /**
  * enum batadv_dbg_level - available log levels
- * @BATADV_DBG_BATMAN: OGM and TQ computations related messages
- * @BATADV_DBG_ROUTES: route added / changed / deleted
- * @BATADV_DBG_TT: translation table messages
- * @BATADV_DBG_BLA: bridge loop avoidance messages
- * @BATADV_DBG_DAT: ARP snooping and DAT related messages
- * @BATADV_DBG_NC: network coding related messages
- * @BATADV_DBG_MCAST: multicast related messages
- * @BATADV_DBG_TP_METER: throughput meter messages
- * @BATADV_DBG_ALL: the union of all the above log levels
  */
 enum batadv_dbg_level {
+	/** @BATADV_DBG_BATMAN: OGM and TQ computations related messages */
 	BATADV_DBG_BATMAN	= BIT(0),
+
+	/** @BATADV_DBG_ROUTES: route added / changed / deleted */
 	BATADV_DBG_ROUTES	= BIT(1),
+
+	/** @BATADV_DBG_TT: translation table messages */
 	BATADV_DBG_TT		= BIT(2),
+
+	/** @BATADV_DBG_BLA: bridge loop avoidance messages */
 	BATADV_DBG_BLA		= BIT(3),
+
+	/** @BATADV_DBG_DAT: ARP snooping and DAT related messages */
 	BATADV_DBG_DAT		= BIT(4),
+
+	/** @BATADV_DBG_NC: network coding related messages */
 	BATADV_DBG_NC		= BIT(5),
+
+	/** @BATADV_DBG_MCAST: multicast related messages */
 	BATADV_DBG_MCAST	= BIT(6),
+
+	/** @BATADV_DBG_TP_METER: throughput meter messages */
 	BATADV_DBG_TP_METER	= BIT(7),
+
+	/** @BATADV_DBG_ALL: the union of all the above log levels */
 	BATADV_DBG_ALL		= 255,
 };
 
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 51f273b5b77d..3ac06337ab71 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -26,15 +26,21 @@ struct sk_buff;
 
 /**
  * enum batadv_forw_mode - the way a packet should be forwarded as
- * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic
- *  flooding)
- * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the
- *  BATMAN unicast routing protocol)
- * @BATADV_FORW_NONE: don't forward, drop it
  */
 enum batadv_forw_mode {
+	/**
+	 * @BATADV_FORW_ALL: forward the packet to all nodes (currently via
+	 *  classic flooding)
+	 */
 	BATADV_FORW_ALL,
+
+	/**
+	 * @BATADV_FORW_SINGLE: forward the packet to a single node (currently
+	 *  via the BATMAN unicast routing protocol)
+	 */
 	BATADV_FORW_SINGLE,
+
+	/** @BATADV_FORW_NONE: don't forward, drop it */
 	BATADV_FORW_NONE,
 };
 
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 1df798b32077..c39f879d7dde 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -55,13 +55,15 @@ struct seq_file;
 
 /**
  * enum batadv_dhcp_recipient - dhcp destination
- * @BATADV_DHCP_NO: packet is not a dhcp message
- * @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server
- * @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client
  */
 enum batadv_dhcp_recipient {
+	/** @BATADV_DHCP_NO: packet is not a dhcp message */
 	BATADV_DHCP_NO = 0,
+
+	/** @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server */
 	BATADV_DHCP_TO_SERVER,
+
+	/** @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client */
 	BATADV_DHCP_TO_CLIENT,
 };
 
@@ -79,196 +81,274 @@ enum batadv_dhcp_recipient {
 
 /**
  * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data
- * @ogm_buff: buffer holding the OGM packet
- * @ogm_buff_len: length of the OGM packet buffer
- * @ogm_seqno: OGM sequence number - used to identify each OGM
  */
 struct batadv_hard_iface_bat_iv {
+	/** @ogm_buff: buffer holding the OGM packet */
 	unsigned char *ogm_buff;
+
+	/** @ogm_buff_len: length of the OGM packet buffer */
 	int ogm_buff_len;
+
+	/** @ogm_seqno: OGM sequence number - used to identify each OGM */
 	atomic_t ogm_seqno;
 };
 
 /**
  * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V
- * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex
- * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no
- *  throughput data is available for this interface and that default values are
- *  assumed.
  */
 enum batadv_v_hard_iface_flags {
+	/**
+	 * @BATADV_FULL_DUPLEX: tells if the connection over this link is
+	 *  full-duplex
+	 */
 	BATADV_FULL_DUPLEX	= BIT(0),
+
+	/**
+	 * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that
+	 *  no throughput data is available for this interface and that default
+	 *  values are assumed.
+	 */
 	BATADV_WARNING_DEFAULT	= BIT(1),
 };
 
 /**
  * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data
- * @elp_interval: time interval between two ELP transmissions
- * @elp_seqno: current ELP sequence number
- * @elp_skb: base skb containing the ELP message to send
- * @elp_wq: workqueue used to schedule ELP transmissions
- * @throughput_override: throughput override to disable link auto-detection
- * @flags: interface specific flags
  */
 struct batadv_hard_iface_bat_v {
+	/** @elp_interval: time interval between two ELP transmissions */
 	atomic_t elp_interval;
+
+	/** @elp_seqno: current ELP sequence number */
 	atomic_t elp_seqno;
+
+	/** @elp_skb: base skb containing the ELP message to send */
 	struct sk_buff *elp_skb;
+
+	/** @elp_wq: workqueue used to schedule ELP transmissions */
 	struct delayed_work elp_wq;
+
+	/**
+	 * @throughput_override: throughput override to disable link
+	 *  auto-detection
+	 */
 	atomic_t throughput_override;
+
+	/** @flags: interface specific flags */
 	u8 flags;
 };
 
 /**
  * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration
  *  of a batadv_hard_iface
- * @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device
- * @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device
- * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
- * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi device
  */
 enum batadv_hard_iface_wifi_flags {
+	/** @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device */
 	BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0),
+
+	/** @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device */
 	BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1),
+
+	/**
+	 * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
+	 */
 	BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2),
+
+	/**
+	 * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi
+	 * device
+	 */
 	BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3),
 };
 
 /**
  * struct batadv_hard_iface - network device known to batman-adv
- * @list: list node for batadv_hardif_list
- * @if_num: identificator of the interface
- * @if_status: status of the interface for batman-adv
- * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
- * @wifi_flags: flags whether this is (directly or indirectly) a wifi interface
- * @net_dev: pointer to the net_device
- * @hardif_obj: kobject of the per interface sysfs "mesh" directory
- * @refcount: number of contexts the object is used
- * @batman_adv_ptype: packet type describing packets that should be processed by
- *  batman-adv for this interface
- * @soft_iface: the batman-adv interface which uses this network interface
- * @rcu: struct used for freeing in an RCU-safe manner
- * @bat_iv: per hard-interface B.A.T.M.A.N. IV data
- * @bat_v: per hard-interface B.A.T.M.A.N. V data
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- * @neigh_list: list of unique single hop neighbors via this interface
- * @neigh_list_lock: lock protecting neigh_list
  */
 struct batadv_hard_iface {
+	/** @list: list node for batadv_hardif_list */
 	struct list_head list;
+
+	/** @if_num: identifier of the interface */
 	s16 if_num;
+
+	/** @if_status: status of the interface for batman-adv */
 	char if_status;
+
+	/**
+	 * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+	 */
 	u8 num_bcasts;
+
+	/**
+	 * @wifi_flags: flags whether this is (directly or indirectly) a wifi
+	 *  interface
+	 */
 	u32 wifi_flags;
+
+	/** @net_dev: pointer to the net_device */
 	struct net_device *net_dev;
+
+	/** @hardif_obj: kobject of the per interface sysfs "mesh" directory */
 	struct kobject *hardif_obj;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/**
+	 * @batman_adv_ptype: packet type describing packets that should be
+	 * processed by batman-adv for this interface
+	 */
 	struct packet_type batman_adv_ptype;
+
+	/**
+	 * @soft_iface: the batman-adv interface which uses this network
+	 *  interface
+	 */
 	struct net_device *soft_iface;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
+
+	/** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */
 	struct batadv_hard_iface_bat_iv bat_iv;
+
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	/** @bat_v: per hard-interface B.A.T.M.A.N. V data */
 	struct batadv_hard_iface_bat_v bat_v;
 #endif
+
+	/**
+	 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+	 */
 	struct dentry *debug_dir;
+
+	/**
+	 * @neigh_list: list of unique single hop neighbors via this interface
+	 */
 	struct hlist_head neigh_list;
-	/* neigh_list_lock protects: neigh_list */
+
+	/** @neigh_list_lock: lock protecting neigh_list */
 	spinlock_t neigh_list_lock;
 };
 
 /**
  * struct batadv_orig_ifinfo - originator info per outgoing interface
- * @list: list node for orig_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard-interface
- * @router: router that should be used to reach this originator
- * @last_real_seqno: last and best known sequence number
- * @last_ttl: ttl of last received packet
- * @last_seqno_forwarded: seqno of the OGM which was forwarded last
- * @batman_seqno_reset: time when the batman seqno window was reset
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_orig_ifinfo {
+	/** @list: list node for orig_node::ifinfo_list */
 	struct hlist_node list;
+
+	/** @if_outgoing: pointer to outgoing hard-interface */
 	struct batadv_hard_iface *if_outgoing;
-	struct batadv_neigh_node __rcu *router; /* rcu protected pointer */
+
+	/** @router: router that should be used to reach this originator */
+	struct batadv_neigh_node __rcu *router;
+
+	/** @last_real_seqno: last and best known sequence number */
 	u32 last_real_seqno;
+
+	/** @last_ttl: ttl of last received packet */
 	u8 last_ttl;
+
+	/** @last_seqno_forwarded: seqno of the OGM which was forwarded last */
 	u32 last_seqno_forwarded;
+
+	/** @batman_seqno_reset: time when the batman seqno window was reset */
 	unsigned long batman_seqno_reset;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_frag_table_entry - head in the fragment buffer table
- * @fragment_list: head of list with fragments
- * @lock: lock to protect the list of fragments
- * @timestamp: time (jiffie) of last received fragment
- * @seqno: sequence number of the fragments in the list
- * @size: accumulated size of packets in list
- * @total_size: expected size of the assembled packet
  */
 struct batadv_frag_table_entry {
+	/** @fragment_list: head of list with fragments */
 	struct hlist_head fragment_list;
-	spinlock_t lock; /* protects fragment_list */
+
+	/** @lock: lock to protect the list of fragments */
+	spinlock_t lock;
+
+	/** @timestamp: time (jiffie) of last received fragment */
 	unsigned long timestamp;
+
+	/** @seqno: sequence number of the fragments in the list */
 	u16 seqno;
+
+	/** @size: accumulated size of packets in list */
 	u16 size;
+
+	/** @total_size: expected size of the assembled packet */
 	u16 total_size;
 };
 
 /**
  * struct batadv_frag_list_entry - entry in a list of fragments
- * @list: list node information
- * @skb: fragment
- * @no: fragment number in the set
  */
 struct batadv_frag_list_entry {
+	/** @list: list node information */
 	struct hlist_node list;
+
+	/** @skb: fragment */
 	struct sk_buff *skb;
+
+	/** @no: fragment number in the set */
 	u8 no;
 };
 
 /**
  * struct batadv_vlan_tt - VLAN specific TT attributes
- * @crc: CRC32 checksum of the entries belonging to this vlan
- * @num_entries: number of TT entries for this VLAN
  */
 struct batadv_vlan_tt {
+	/** @crc: CRC32 checksum of the entries belonging to this vlan */
 	u32 crc;
+
+	/** @num_entries: number of TT entries for this VLAN */
 	atomic_t num_entries;
 };
 
 /**
  * struct batadv_orig_node_vlan - VLAN specific data per orig_node
- * @vid: the VLAN identifier
- * @tt: VLAN specific TT attributes
- * @list: list node for orig_node::vlan_list
- * @refcount: number of context where this object is currently in use
- * @rcu: struct used for freeing in a RCU-safe manner
  */
 struct batadv_orig_node_vlan {
+	/** @vid: the VLAN identifier */
 	unsigned short vid;
+
+	/** @tt: VLAN specific TT attributes */
 	struct batadv_vlan_tt tt;
+
+	/** @list: list node for orig_node::vlan_list */
 	struct hlist_node list;
+
+	/**
+	 * @refcount: number of contexts where this object is currently in use
+	 */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
- * @bcast_own: set of bitfields (one per hard-interface) where each one counts
- * the number of our OGMs this orig_node rebroadcasted "back" to us  (relative
- * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
- * @bcast_own_sum: sum of bcast_own
- * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
- *  neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
  */
 struct batadv_orig_bat_iv {
+	/**
+	 * @bcast_own: set of bitfields (one per hard-interface) where each one
+	 * counts the number of our OGMs this orig_node rebroadcasted "back" to
+	 * us  (relative to last_real_seqno). Every bitfield is
+	 * BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
+	 */
 	unsigned long *bcast_own;
+
+	/** @bcast_own_sum: sum of bcast_own */
 	u8 *bcast_own_sum;
-	/* ogm_cnt_lock protects: bcast_own, bcast_own_sum,
+
+	/**
+	 * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
 	 * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
 	 */
 	spinlock_t ogm_cnt_lock;
@@ -276,130 +356,205 @@ struct batadv_orig_bat_iv {
 
 /**
  * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh
- * @orig: originator ethernet address
- * @ifinfo_list: list for routers per outgoing interface
- * @last_bonding_candidate: pointer to last ifinfo of last used router
- * @dat_addr: address of the orig node in the distributed hash
- * @last_seen: time when last packet from this node was received
- * @bcast_seqno_reset: time when the broadcast seqno window was reset
- * @mcast_handler_lock: synchronizes mcast-capability and -flag changes
- * @mcast_flags: multicast flags announced by the orig node
- * @mcast_want_all_unsnoopables_node: a list node for the
- *  mcast.want_all_unsnoopables list
- * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list
- * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list
- * @capabilities: announced capabilities of this originator
- * @capa_initialized: bitfield to remember whether a capability was initialized
- * @last_ttvn: last seen translation table version number
- * @tt_buff: last tt changeset this node received from the orig node
- * @tt_buff_len: length of the last tt changeset this node received from the
- *  orig node
- * @tt_buff_lock: lock that protects tt_buff and tt_buff_len
- * @tt_lock: prevents from updating the table while reading it. Table update is
- *  made up by two operations (data structure update and metdata -CRC/TTVN-
- *  recalculation) and they have to be executed atomically in order to avoid
- *  another thread to read the table/metadata between those.
- * @bcast_bits: bitfield containing the info which payload broadcast originated
- *  from this orig node this host already has seen (relative to
- *  last_bcast_seqno)
- * @last_bcast_seqno: last broadcast sequence number received by this host
- * @neigh_list: list of potential next hop neighbor towards this orig node
- * @neigh_list_lock: lock protecting neigh_list and router
- * @hash_entry: hlist node for batadv_priv::orig_hash
- * @bat_priv: pointer to soft_iface this orig node belongs to
- * @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
- * @in_coding_list: list of nodes this orig can hear
- * @out_coding_list: list of nodes that can hear this orig
- * @in_coding_list_lock: protects in_coding_list
- * @out_coding_list_lock: protects out_coding_list
- * @fragments: array with heads for fragment chains
- * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by the
- *  originator represented by this object
- * @vlan_list_lock: lock protecting vlan_list
- * @bat_iv: B.A.T.M.A.N. IV private structure
  */
 struct batadv_orig_node {
+	/** @orig: originator ethernet address */
 	u8 orig[ETH_ALEN];
+
+	/** @ifinfo_list: list for routers per outgoing interface */
 	struct hlist_head ifinfo_list;
+
+	/**
+	 * @last_bonding_candidate: pointer to last ifinfo of last used router
+	 */
 	struct batadv_orig_ifinfo *last_bonding_candidate;
+
 #ifdef CONFIG_BATMAN_ADV_DAT
+	/** @dat_addr: address of the orig node in the distributed hash */
 	batadv_dat_addr_t dat_addr;
 #endif
+
+	/** @last_seen: time when last packet from this node was received */
 	unsigned long last_seen;
+
+	/**
+	 * @bcast_seqno_reset: time when the broadcast seqno window was reset
+	 */
 	unsigned long bcast_seqno_reset;
+
 #ifdef CONFIG_BATMAN_ADV_MCAST
-	/* synchronizes mcast tvlv specific orig changes */
+	/**
+	 * @mcast_handler_lock: synchronizes mcast-capability and -flag changes
+	 */
 	spinlock_t mcast_handler_lock;
+
+	/** @mcast_flags: multicast flags announced by the orig node */
 	u8 mcast_flags;
+
+	/**
+	 * @mcast_want_all_unsnoopables_node: a list node for the
+	 *  mcast.want_all_unsnoopables list
+	 */
 	struct hlist_node mcast_want_all_unsnoopables_node;
+
+	/**
+	 * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4
+	 *  list
+	 */
 	struct hlist_node mcast_want_all_ipv4_node;
+	/**
+	 * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6
+	 *  list
+	 */
 	struct hlist_node mcast_want_all_ipv6_node;
 #endif
+
+	/** @capabilities: announced capabilities of this originator */
 	unsigned long capabilities;
+
+	/**
+	 * @capa_initialized: bitfield to remember whether a capability was
+	 *  initialized
+	 */
 	unsigned long capa_initialized;
+
+	/** @last_ttvn: last seen translation table version number */
 	atomic_t last_ttvn;
+
+	/** @tt_buff: last tt changeset this node received from the orig node */
 	unsigned char *tt_buff;
+
+	/**
+	 * @tt_buff_len: length of the last tt changeset this node received
+	 *  from the orig node
+	 */
 	s16 tt_buff_len;
-	spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */
-	/* prevents from changing the table while reading it */
+
+	/** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */
+	spinlock_t tt_buff_lock;
+
+	/**
+	 * @tt_lock: prevents from updating the table while reading it. Table
+	 *  update is made up by two operations (data structure update and
+	 *  metadata -CRC/TTVN- recalculation) and they have to be executed
+	 *  atomically in order to avoid another thread to read the
+	 *  table/metadata between those.
+	 */
 	spinlock_t tt_lock;
+
+	/**
+	 * @bcast_bits: bitfield containing the info which payload broadcast
+	 *  originated from this orig node this host already has seen (relative
+	 *  to last_bcast_seqno)
+	 */
 	DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+	/**
+	 * @last_bcast_seqno: last broadcast sequence number received by this
+	 *  host
+	 */
 	u32 last_bcast_seqno;
+
+	/**
+	 * @neigh_list: list of potential next hop neighbor towards this orig
+	 *  node
+	 */
 	struct hlist_head neigh_list;
-	/* neigh_list_lock protects: neigh_list, ifinfo_list,
-	 * last_bonding_candidate and router
+
+	/**
+	 * @neigh_list_lock: lock protecting neigh_list, ifinfo_list,
+	 *  last_bonding_candidate and router
 	 */
 	spinlock_t neigh_list_lock;
+
+	/** @hash_entry: hlist node for batadv_priv::orig_hash */
 	struct hlist_node hash_entry;
+
+	/** @bat_priv: pointer to soft_iface this orig node belongs to */
 	struct batadv_priv *bat_priv;
-	/* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */
+
+	/** @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno */
 	spinlock_t bcast_seqno_lock;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
+
 #ifdef CONFIG_BATMAN_ADV_NC
+	/** @in_coding_list: list of nodes this orig can hear */
 	struct list_head in_coding_list;
+
+	/** @out_coding_list: list of nodes that can hear this orig */
 	struct list_head out_coding_list;
-	spinlock_t in_coding_list_lock; /* Protects in_coding_list */
-	spinlock_t out_coding_list_lock; /* Protects out_coding_list */
+
+	/** @in_coding_list_lock: protects in_coding_list */
+	spinlock_t in_coding_list_lock;
+
+	/** @out_coding_list_lock: protects out_coding_list */
+	spinlock_t out_coding_list_lock;
 #endif
+
+	/** @fragments: array with heads for fragment chains */
 	struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
+
+	/**
+	 * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by
+	 *  the originator represented by this object
+	 */
 	struct hlist_head vlan_list;
-	spinlock_t vlan_list_lock; /* protects vlan_list */
+
+	/** @vlan_list_lock: lock protecting vlan_list */
+	spinlock_t vlan_list_lock;
+
+	/** @bat_iv: B.A.T.M.A.N. IV private structure */
 	struct batadv_orig_bat_iv bat_iv;
 };
 
 /**
  * enum batadv_orig_capabilities - orig node capabilities
- * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table enabled
- * @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled
- * @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability
- * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability
- *  (= orig node announces a tvlv of type BATADV_TVLV_MCAST)
  */
 enum batadv_orig_capabilities {
+	/**
+	 * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table
+	 *  enabled
+	 */
 	BATADV_ORIG_CAPA_HAS_DAT,
+
+	/** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */
 	BATADV_ORIG_CAPA_HAS_NC,
+
+	/** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */
 	BATADV_ORIG_CAPA_HAS_TT,
+
+	/**
+	 * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability
+	 *  (= orig node announces a tvlv of type BATADV_TVLV_MCAST)
+	 */
 	BATADV_ORIG_CAPA_HAS_MCAST,
 };
 
 /**
  * struct batadv_gw_node - structure for orig nodes announcing gw capabilities
- * @list: list node for batadv_priv_gw::list
- * @orig_node: pointer to corresponding orig node
- * @bandwidth_down: advertised uplink download bandwidth
- * @bandwidth_up: advertised uplink upload bandwidth
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_gw_node {
+	/** @list: list node for batadv_priv_gw::list */
 	struct hlist_node list;
+
+	/** @orig_node: pointer to corresponding orig node */
 	struct batadv_orig_node *orig_node;
+
+	/** @bandwidth_down: advertised uplink download bandwidth */
 	u32 bandwidth_down;
+
+	/** @bandwidth_up: advertised uplink upload bandwidth */
 	u32 bandwidth_up;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
@@ -408,118 +563,161 @@ DECLARE_EWMA(throughput, 10, 8)
 /**
  * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
  *  information
- * @throughput: ewma link throughput towards this neighbor
- * @elp_interval: time interval between two ELP transmissions
- * @elp_latest_seqno: latest and best known ELP sequence number
- * @last_unicast_tx: when the last unicast packet has been sent to this neighbor
- * @metric_work: work queue callback item for metric update
  */
 struct batadv_hardif_neigh_node_bat_v {
+	/** @throughput: ewma link throughput towards this neighbor */
 	struct ewma_throughput throughput;
+
+	/** @elp_interval: time interval between two ELP transmissions */
 	u32 elp_interval;
+
+	/** @elp_latest_seqno: latest and best known ELP sequence number */
 	u32 elp_latest_seqno;
+
+	/**
+	 * @last_unicast_tx: when the last unicast packet has been sent to this
+	 *  neighbor
+	 */
 	unsigned long last_unicast_tx;
+
+	/** @metric_work: work queue callback item for metric update */
 	struct work_struct metric_work;
 };
 
 /**
  * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
- * @list: list node for batadv_hard_iface::neigh_list
- * @addr: the MAC address of the neighboring interface
- * @orig: the address of the originator this neighbor node belongs to
- * @if_incoming: pointer to incoming hard-interface
- * @last_seen: when last packet via this neighbor was received
- * @bat_v: B.A.T.M.A.N. V private data
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in a RCU-safe manner
  */
 struct batadv_hardif_neigh_node {
+	/** @list: list node for batadv_hard_iface::neigh_list */
 	struct hlist_node list;
+
+	/** @addr: the MAC address of the neighboring interface */
 	u8 addr[ETH_ALEN];
+
+	/**
+	 * @orig: the address of the originator this neighbor node belongs to
+	 */
 	u8 orig[ETH_ALEN];
+
+	/** @if_incoming: pointer to incoming hard-interface */
 	struct batadv_hard_iface *if_incoming;
+
+	/** @last_seen: when last packet via this neighbor was received */
 	unsigned long last_seen;
+
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	/** @bat_v: B.A.T.M.A.N. V private data */
 	struct batadv_hardif_neigh_node_bat_v bat_v;
 #endif
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_neigh_node - structure for single hops neighbors
- * @list: list node for batadv_orig_node::neigh_list
- * @orig_node: pointer to corresponding orig_node
- * @addr: the MAC address of the neighboring interface
- * @ifinfo_list: list for routing metrics per outgoing interface
- * @ifinfo_lock: lock protecting private ifinfo members and list
- * @if_incoming: pointer to incoming hard-interface
- * @last_seen: when last packet via this neighbor was received
- * @hardif_neigh: hardif_neigh of this neighbor
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_neigh_node {
+	/** @list: list node for batadv_orig_node::neigh_list */
 	struct hlist_node list;
+
+	/** @orig_node: pointer to corresponding orig_node */
 	struct batadv_orig_node *orig_node;
+
+	/** @addr: the MAC address of the neighboring interface */
 	u8 addr[ETH_ALEN];
+
+	/** @ifinfo_list: list for routing metrics per outgoing interface */
 	struct hlist_head ifinfo_list;
-	spinlock_t ifinfo_lock;	/* protects ifinfo_list and its members */
+
+	/** @ifinfo_lock: lock protecting ifinfo_list and its members */
+	spinlock_t ifinfo_lock;
+
+	/** @if_incoming: pointer to incoming hard-interface */
 	struct batadv_hard_iface *if_incoming;
+
+	/** @last_seen: when last packet via this neighbor was received */
 	unsigned long last_seen;
+
+	/** @hardif_neigh: hardif_neigh of this neighbor */
 	struct batadv_hardif_neigh_node *hardif_neigh;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing
  *  interface for B.A.T.M.A.N. IV
- * @tq_recv: ring buffer of received TQ values from this neigh node
- * @tq_index: ring buffer index
- * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
- * @real_bits: bitfield containing the number of OGMs received from this neigh
- *  node (relative to orig_node->last_real_seqno)
- * @real_packet_count: counted result of real_bits
  */
 struct batadv_neigh_ifinfo_bat_iv {
+	/** @tq_recv: ring buffer of received TQ values from this neigh node */
 	u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE];
+
+	/** @tq_index: ring buffer index */
 	u8 tq_index;
+
+	/**
+	 * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
+	 */
 	u8 tq_avg;
+
+	/**
+	 * @real_bits: bitfield containing the number of OGMs received from this
+	 *  neigh node (relative to orig_node->last_real_seqno)
+	 */
 	DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+	/** @real_packet_count: counted result of real_bits */
 	u8 real_packet_count;
 };
 
 /**
  * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing
  *  interface for B.A.T.M.A.N. V
- * @throughput: last throughput metric received from originator via this neigh
- * @last_seqno: last sequence number known for this neighbor
  */
 struct batadv_neigh_ifinfo_bat_v {
+	/**
+	 * @throughput: last throughput metric received from originator via this
+	 *  neigh
+	 */
 	u32 throughput;
+
+	/** @last_seqno: last sequence number known for this neighbor */
 	u32 last_seqno;
 };
 
 /**
  * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
- * @list: list node for batadv_neigh_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard-interface
- * @bat_iv: B.A.T.M.A.N. IV private structure
- * @bat_v: B.A.T.M.A.N. V private data
- * @last_ttl: last received ttl from this neigh node
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in a RCU-safe manner
  */
 struct batadv_neigh_ifinfo {
+	/** @list: list node for batadv_neigh_node::ifinfo_list */
 	struct hlist_node list;
+
+	/** @if_outgoing: pointer to outgoing hard-interface */
 	struct batadv_hard_iface *if_outgoing;
+
+	/** @bat_iv: B.A.T.M.A.N. IV private structure */
 	struct batadv_neigh_ifinfo_bat_iv bat_iv;
+
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	/** @bat_v: B.A.T.M.A.N. V private data */
 	struct batadv_neigh_ifinfo_bat_v bat_v;
 #endif
+
+	/** @last_ttl: last received ttl from this neigh node */
 	u8 last_ttl;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
@@ -527,148 +725,278 @@ struct batadv_neigh_ifinfo {
 
 /**
  * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression
- * @orig: mac address of orig node orginating the broadcast
- * @crc: crc32 checksum of broadcast payload
- * @entrytime: time when the broadcast packet was received
  */
 struct batadv_bcast_duplist_entry {
+	/** @orig: mac address of orig node originating the broadcast */
 	u8 orig[ETH_ALEN];
+
+	/** @crc: crc32 checksum of broadcast payload */
 	__be32 crc;
+
+	/** @entrytime: time when the broadcast packet was received */
 	unsigned long entrytime;
 };
 #endif
 
 /**
  * enum batadv_counters - indices for traffic counters
- * @BATADV_CNT_TX: transmitted payload traffic packet counter
- * @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter
- * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet counter
- * @BATADV_CNT_RX: received payload traffic packet counter
- * @BATADV_CNT_RX_BYTES: received payload traffic bytes counter
- * @BATADV_CNT_FORWARD: forwarded payload traffic packet counter
- * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter
- * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet counter
- * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes counter
- * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter
- * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes counter
- * @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter
- * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter
- * @BATADV_CNT_FRAG_RX: received fragment traffic packet counter
- * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter
- * @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter
- * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter
- * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter
- * @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter
- * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet counter
- * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter
- * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet counter
- * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter
- * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
- * @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter
- * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
- * @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter
- * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic packet
- *  counter
- * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
- * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes counter
- * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet counter
- * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes counter
- * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc decoding
- * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
- * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes counter
- * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic packet
- *  counter
- * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in promisc
- *  mode.
- * @BATADV_CNT_NUM: number of traffic counters
  */
 enum batadv_counters {
+	/** @BATADV_CNT_TX: transmitted payload traffic packet counter */
 	BATADV_CNT_TX,
+
+	/** @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter */
 	BATADV_CNT_TX_BYTES,
+
+	/**
+	 * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet
+	 *  counter
+	 */
 	BATADV_CNT_TX_DROPPED,
+
+	/** @BATADV_CNT_RX: received payload traffic packet counter */
 	BATADV_CNT_RX,
+
+	/** @BATADV_CNT_RX_BYTES: received payload traffic bytes counter */
 	BATADV_CNT_RX_BYTES,
+
+	/** @BATADV_CNT_FORWARD: forwarded payload traffic packet counter */
 	BATADV_CNT_FORWARD,
+
+	/**
+	 * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter
+	 */
 	BATADV_CNT_FORWARD_BYTES,
+
+	/**
+	 * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet
+	 *  counter
+	 */
 	BATADV_CNT_MGMT_TX,
+
+	/**
+	 * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes
+	 *  counter
+	 */
 	BATADV_CNT_MGMT_TX_BYTES,
+
+	/**
+	 * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter
+	 */
 	BATADV_CNT_MGMT_RX,
+
+	/**
+	 * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes
+	 *  counter
+	 */
 	BATADV_CNT_MGMT_RX_BYTES,
+
+	/** @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter */
 	BATADV_CNT_FRAG_TX,
+
+	/**
+	 * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter
+	 */
 	BATADV_CNT_FRAG_TX_BYTES,
+
+	/** @BATADV_CNT_FRAG_RX: received fragment traffic packet counter */
 	BATADV_CNT_FRAG_RX,
+
+	/**
+	 * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter
+	 */
 	BATADV_CNT_FRAG_RX_BYTES,
+
+	/** @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter */
 	BATADV_CNT_FRAG_FWD,
+
+	/**
+	 * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter
+	 */
 	BATADV_CNT_FRAG_FWD_BYTES,
+
+	/**
+	 * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter
+	 */
 	BATADV_CNT_TT_REQUEST_TX,
+
+	/** @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter */
 	BATADV_CNT_TT_REQUEST_RX,
+
+	/**
+	 * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet
+	 *  counter
+	 */
 	BATADV_CNT_TT_RESPONSE_TX,
+
+	/**
+	 * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter
+	 */
 	BATADV_CNT_TT_RESPONSE_RX,
+
+	/**
+	 * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet
+	 *  counter
+	 */
 	BATADV_CNT_TT_ROAM_ADV_TX,
+
+	/**
+	 * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter
+	 */
 	BATADV_CNT_TT_ROAM_ADV_RX,
+
 #ifdef CONFIG_BATMAN_ADV_DAT
+	/**
+	 * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
+	 */
 	BATADV_CNT_DAT_GET_TX,
+
+	/** @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter */
 	BATADV_CNT_DAT_GET_RX,
+
+	/**
+	 * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
+	 */
 	BATADV_CNT_DAT_PUT_TX,
+
+	/** @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter */
 	BATADV_CNT_DAT_PUT_RX,
+
+	/**
+	 * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic
+	 *  packet counter
+	 */
 	BATADV_CNT_DAT_CACHED_REPLY_TX,
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_NC
+	/**
+	 * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
+	 */
 	BATADV_CNT_NC_CODE,
+
+	/**
+	 * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes
+	 *  counter
+	 */
 	BATADV_CNT_NC_CODE_BYTES,
+
+	/**
+	 * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet
+	 *  counter
+	 */
 	BATADV_CNT_NC_RECODE,
+
+	/**
+	 * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes
+	 *  counter
+	 */
 	BATADV_CNT_NC_RECODE_BYTES,
+
+	/**
+	 * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc
+	 *  decoding
+	 */
 	BATADV_CNT_NC_BUFFER,
+
+	/**
+	 * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
+	 */
 	BATADV_CNT_NC_DECODE,
+
+	/**
+	 * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes
+	 *  counter
+	 */
 	BATADV_CNT_NC_DECODE_BYTES,
+
+	/**
+	 * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic
+	 *  packet counter
+	 */
 	BATADV_CNT_NC_DECODE_FAILED,
+
+	/**
+	 * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in
+	 *  promisc mode.
+	 */
 	BATADV_CNT_NC_SNIFFED,
 #endif
+
+	/** @BATADV_CNT_NUM: number of traffic counters */
 	BATADV_CNT_NUM,
 };
 
 /**
  * struct batadv_priv_tt - per mesh interface translation table data
- * @vn: translation table version number
- * @ogm_append_cnt: counter of number of OGMs containing the local tt diff
- * @local_changes: changes registered in an originator interval
- * @changes_list: tracks tt local changes within an originator interval
- * @local_hash: local translation table hash table
- * @global_hash: global translation table hash table
- * @req_list: list of pending & unanswered tt_requests
- * @roam_list: list of the last roaming events of each client limiting the
- *  number of roaming events to avoid route flapping
- * @changes_list_lock: lock protecting changes_list
- * @req_list_lock: lock protecting req_list
- * @roam_list_lock: lock protecting roam_list
- * @last_changeset: last tt changeset this host has generated
- * @last_changeset_len: length of last tt changeset this host has generated
- * @last_changeset_lock: lock protecting last_changeset & last_changeset_len
- * @commit_lock: prevents from executing a local TT commit while reading the
- *  local table. The local TT commit is made up by two operations (data
- *  structure update and metdata -CRC/TTVN- recalculation) and they have to be
- *  executed atomically in order to avoid another thread to read the
- *  table/metadata between those.
- * @work: work queue callback item for translation table purging
  */
 struct batadv_priv_tt {
+	/** @vn: translation table version number */
 	atomic_t vn;
+
+	/**
+	 * @ogm_append_cnt: counter of number of OGMs containing the local tt
+	 *  diff
+	 */
 	atomic_t ogm_append_cnt;
+
+	/** @local_changes: changes registered in an originator interval */
 	atomic_t local_changes;
+
+	/**
+	 * @changes_list: tracks tt local changes within an originator interval
+	 */
 	struct list_head changes_list;
+
+	/** @local_hash: local translation table hash table */
 	struct batadv_hashtable *local_hash;
+
+	/** @global_hash: global translation table hash table */
 	struct batadv_hashtable *global_hash;
+
+	/** @req_list: list of pending & unanswered tt_requests */
 	struct hlist_head req_list;
+
+	/**
+	 * @roam_list: list of the last roaming events of each client limiting
+	 *  the number of roaming events to avoid route flapping
+	 */
 	struct list_head roam_list;
-	spinlock_t changes_list_lock; /* protects changes */
-	spinlock_t req_list_lock; /* protects req_list */
-	spinlock_t roam_list_lock; /* protects roam_list */
+
+	/** @changes_list_lock: lock protecting changes_list */
+	spinlock_t changes_list_lock;
+
+	/** @req_list_lock: lock protecting req_list */
+	spinlock_t req_list_lock;
+
+	/** @roam_list_lock: lock protecting roam_list */
+	spinlock_t roam_list_lock;
+
+	/** @last_changeset: last tt changeset this host has generated */
 	unsigned char *last_changeset;
+
+	/**
+	 * @last_changeset_len: length of last tt changeset this host has
+	 *  generated
+	 */
 	s16 last_changeset_len;
-	/* protects last_changeset & last_changeset_len */
+
+	/**
+	 * @last_changeset_lock: lock protecting last_changeset &
+	 *  last_changeset_len
+	 */
 	spinlock_t last_changeset_lock;
-	/* prevents from executing a commit while reading the table */
+
+	/**
+	 * @commit_lock: prevents from executing a local TT commit while reading
+	 *  the local table. The local TT commit is made up by two operations
+	 *  (data structure update and metadata -CRC/TTVN- recalculation) and
+	 *  they have to be executed atomically in order to avoid another thread
+	 *  to read the table/metadata between those.
+	 */
 	spinlock_t commit_lock;
+
+	/** @work: work queue callback item for translation table purging */
 	struct delayed_work work;
 };
 
@@ -676,31 +1004,57 @@ struct batadv_priv_tt {
 
 /**
  * struct batadv_priv_bla - per mesh interface bridge loope avoidance data
- * @num_requests: number of bla requests in flight
- * @claim_hash: hash table containing mesh nodes this host has claimed
- * @backbone_hash: hash table containing all detected backbone gateways
- * @loopdetect_addr: MAC address used for own loopdetection frames
- * @loopdetect_lasttime: time when the loopdetection frames were sent
- * @loopdetect_next: how many periods to wait for the next loopdetect process
- * @bcast_duplist: recently received broadcast packets array (for broadcast
- *  duplicate suppression)
- * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist
- * @bcast_duplist_lock: lock protecting bcast_duplist & bcast_duplist_curr
- * @claim_dest: local claim data (e.g. claim group)
- * @work: work queue callback item for cleanups & bla announcements
  */
 struct batadv_priv_bla {
+	/** @num_requests: number of bla requests in flight */
 	atomic_t num_requests;
+
+	/**
+	 * @claim_hash: hash table containing mesh nodes this host has claimed
+	 */
 	struct batadv_hashtable *claim_hash;
+
+	/**
+	 * @backbone_hash: hash table containing all detected backbone gateways
+	 */
 	struct batadv_hashtable *backbone_hash;
+
+	/** @loopdetect_addr: MAC address used for own loopdetection frames */
 	u8 loopdetect_addr[ETH_ALEN];
+
+	/**
+	 * @loopdetect_lasttime: time when the loopdetection frames were sent
+	 */
 	unsigned long loopdetect_lasttime;
+
+	/**
+	 * @loopdetect_next: how many periods to wait for the next loopdetect
+	 *  process
+	 */
 	atomic_t loopdetect_next;
+
+	/**
+	 * @bcast_duplist: recently received broadcast packets array (for
+	 *  broadcast duplicate suppression)
+	 */
 	struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE];
+
+	/**
+	 * @bcast_duplist_curr: index of last broadcast packet added to
+	 *  bcast_duplist
+	 */
 	int bcast_duplist_curr;
-	/* protects bcast_duplist & bcast_duplist_curr */
+
+	/**
+	 * @bcast_duplist_lock: lock protecting bcast_duplist &
+	 *  bcast_duplist_curr
+	 */
 	spinlock_t bcast_duplist_lock;
+
+	/** @claim_dest: local claim data (e.g. claim group) */
 	struct batadv_bla_claim_dst claim_dest;
+
+	/** @work: work queue callback item for cleanups & bla announcements */
 	struct delayed_work work;
 };
 #endif
@@ -709,68 +1063,94 @@ struct batadv_priv_bla {
 
 /**
  * struct batadv_priv_debug_log - debug logging data
- * @log_buff: buffer holding the logs (ring bufer)
- * @log_start: index of next character to read
- * @log_end: index of next character to write
- * @lock: lock protecting log_buff, log_start & log_end
- * @queue_wait: log reader's wait queue
  */
 struct batadv_priv_debug_log {
+	/** @log_buff: buffer holding the logs (ring buffer) */
 	char log_buff[BATADV_LOG_BUF_LEN];
+
+	/** @log_start: index of next character to read */
 	unsigned long log_start;
+
+	/** @log_end: index of next character to write */
 	unsigned long log_end;
-	spinlock_t lock; /* protects log_buff, log_start and log_end */
+
+	/** @lock: lock protecting log_buff, log_start & log_end */
+	spinlock_t lock;
+
+	/** @queue_wait: log reader's wait queue */
 	wait_queue_head_t queue_wait;
 };
 #endif
 
 /**
  * struct batadv_priv_gw - per mesh interface gateway data
- * @gateway_list: list of available gateway nodes
- * @list_lock: lock protecting gateway_list & curr_gw
- * @curr_gw: pointer to currently selected gateway node
- * @mode: gateway operation: off, client or server (see batadv_gw_modes)
- * @sel_class: gateway selection class (applies if gw_mode client)
- * @bandwidth_down: advertised uplink download bandwidth (if gw_mode server)
- * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server)
- * @reselect: bool indicating a gateway re-selection is in progress
  */
 struct batadv_priv_gw {
+	/** @gateway_list: list of available gateway nodes */
 	struct hlist_head gateway_list;
-	spinlock_t list_lock; /* protects gateway_list & curr_gw */
-	struct batadv_gw_node __rcu *curr_gw;  /* rcu protected pointer */
+
+	/** @list_lock: lock protecting gateway_list & curr_gw */
+	spinlock_t list_lock;
+
+	/** @curr_gw: pointer to currently selected gateway node */
+	struct batadv_gw_node __rcu *curr_gw;
+
+	/**
+	 * @mode: gateway operation: off, client or server (see batadv_gw_modes)
+	 */
 	atomic_t mode;
+
+	/** @sel_class: gateway selection class (applies if gw_mode client) */
 	atomic_t sel_class;
+
+	/**
+	 * @bandwidth_down: advertised uplink download bandwidth (if gw_mode
+	 *  server)
+	 */
 	atomic_t bandwidth_down;
+
+	/**
+	 * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server)
+	 */
 	atomic_t bandwidth_up;
+
+	/** @reselect: bool indicating a gateway re-selection is in progress */
 	atomic_t reselect;
 };
 
 /**
  * struct batadv_priv_tvlv - per mesh interface tvlv data
- * @container_list: list of registered tvlv containers to be sent with each OGM
- * @handler_list: list of the various tvlv content handlers
- * @container_list_lock: protects tvlv container list access
- * @handler_list_lock: protects handler list access
  */
 struct batadv_priv_tvlv {
+	/**
+	 * @container_list: list of registered tvlv containers to be sent with
+	 *  each OGM
+	 */
 	struct hlist_head container_list;
+
+	/** @handler_list: list of the various tvlv content handlers */
 	struct hlist_head handler_list;
-	spinlock_t container_list_lock; /* protects container_list */
-	spinlock_t handler_list_lock; /* protects handler_list */
+
+	/** @container_list_lock: protects tvlv container list access */
+	spinlock_t container_list_lock;
+
+	/** @handler_list_lock: protects handler list access */
+	spinlock_t handler_list_lock;
 };
 
 #ifdef CONFIG_BATMAN_ADV_DAT
 
 /**
  * struct batadv_priv_dat - per mesh interface DAT private data
- * @addr: node DAT address
- * @hash: hashtable representing the local ARP cache
- * @work: work queue callback item for cache purging
  */
 struct batadv_priv_dat {
+	/** @addr: node DAT address */
 	batadv_dat_addr_t addr;
+
+	/** @hash: hashtable representing the local ARP cache */
 	struct batadv_hashtable *hash;
+
+	/** @work: work queue callback item for cache purging */
 	struct delayed_work work;
 };
 #endif
@@ -778,375 +1158,582 @@ struct batadv_priv_dat {
 #ifdef CONFIG_BATMAN_ADV_MCAST
 /**
  * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged
- * @exists: whether a querier exists in the mesh
- * @shadowing: if a querier exists, whether it is potentially shadowing
- *  multicast listeners (i.e. querier is behind our own bridge segment)
  */
 struct batadv_mcast_querier_state {
+	/** @exists: whether a querier exists in the mesh */
 	bool exists;
+
+	/**
+	 * @shadowing: if a querier exists, whether it is potentially shadowing
+	 *  multicast listeners (i.e. querier is behind our own bridge segment)
+	 */
 	bool shadowing;
 };
 
 /**
  * struct batadv_priv_mcast - per mesh interface mcast data
- * @mla_list: list of multicast addresses we are currently announcing via TT
- * @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable
- *  multicast traffic
- * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic
- * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic
- * @querier_ipv4: the current state of an IGMP querier in the mesh
- * @querier_ipv6: the current state of an MLD querier in the mesh
- * @flags: the flags we have last sent in our mcast tvlv
- * @enabled: whether the multicast tvlv is currently enabled
- * @bridged: whether the soft interface has a bridge on top
- * @num_disabled: number of nodes that have no mcast tvlv
- * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic
- * @num_want_all_ipv4: counter for items in want_all_ipv4_list
- * @num_want_all_ipv6: counter for items in want_all_ipv6_list
- * @want_lists_lock: lock for protecting modifications to mcast want lists
- *  (traversals are rcu-locked)
- * @work: work queue callback item for multicast TT and TVLV updates
  */
 struct batadv_priv_mcast {
+	/**
+	 * @mla_list: list of multicast addresses we are currently announcing
+	 *  via TT
+	 */
 	struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */
+
+	/**
+	 * @want_all_unsnoopables_list: a list of orig_nodes wanting all
+	 *  unsnoopable multicast traffic
+	 */
 	struct hlist_head want_all_unsnoopables_list;
+
+	/**
+	 * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast
+	 *  traffic
+	 */
 	struct hlist_head want_all_ipv4_list;
+
+	/**
+	 * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast
+	 *  traffic
+	 */
 	struct hlist_head want_all_ipv6_list;
+
+	/** @querier_ipv4: the current state of an IGMP querier in the mesh */
 	struct batadv_mcast_querier_state querier_ipv4;
+
+	/** @querier_ipv6: the current state of an MLD querier in the mesh */
 	struct batadv_mcast_querier_state querier_ipv6;
+
+	/** @flags: the flags we have last sent in our mcast tvlv */
 	u8 flags;
+
+	/** @enabled: whether the multicast tvlv is currently enabled */
 	bool enabled;
+
+	/** @bridged: whether the soft interface has a bridge on top */
 	bool bridged;
+
+	/** @num_disabled: number of nodes that have no mcast tvlv */
 	atomic_t num_disabled;
+
+	/**
+	 * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
+	 *  traffic
+	 */
 	atomic_t num_want_all_unsnoopables;
+
+	/** @num_want_all_ipv4: counter for items in want_all_ipv4_list */
 	atomic_t num_want_all_ipv4;
+
+	/** @num_want_all_ipv6: counter for items in want_all_ipv6_list */
 	atomic_t num_want_all_ipv6;
-	/* protects want_all_{unsnoopables,ipv4,ipv6}_list */
+
+	/**
+	 * @want_lists_lock: lock for protecting modifications to mcasts
+	 *  want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked)
+	 */
 	spinlock_t want_lists_lock;
+
+	/** @work: work queue callback item for multicast TT and TVLV updates */
 	struct delayed_work work;
 };
 #endif
 
 /**
  * struct batadv_priv_nc - per mesh interface network coding private data
- * @work: work queue callback item for cleanup
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
- * @max_fwd_delay: maximum packet forward delay to allow coding of packets
- * @max_buffer_time: buffer time for sniffed packets used to decoding
- * @timestamp_fwd_flush: timestamp of last forward packet queue flush
- * @timestamp_sniffed_purge: timestamp of last sniffed packet queue purge
- * @coding_hash: Hash table used to buffer skbs while waiting for another
- *  incoming skb to code it with. Skbs are added to the buffer just before being
- *  forwarded in routing.c
- * @decoding_hash: Hash table used to buffer skbs that might be needed to decode
- *  a received coded skb. The buffer is used for 1) skbs arriving on the
- *  soft-interface; 2) skbs overheard on the hard-interface; and 3) skbs
- *  forwarded by batman-adv.
  */
 struct batadv_priv_nc {
+	/** @work: work queue callback item for cleanup */
 	struct delayed_work work;
+
+	/**
+	 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+	 */
 	struct dentry *debug_dir;
+
+	/**
+	 * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
+	 */
 	u8 min_tq;
+
+	/**
+	 * @max_fwd_delay: maximum packet forward delay to allow coding of
+	 *  packets
+	 */
 	u32 max_fwd_delay;
+
+	/**
+	 * @max_buffer_time: buffer time for sniffed packets used for decoding
+	 */
 	u32 max_buffer_time;
+
+	/**
+	 * @timestamp_fwd_flush: timestamp of last forward packet queue flush
+	 */
 	unsigned long timestamp_fwd_flush;
+
+	/**
+	 * @timestamp_sniffed_purge: timestamp of last sniffed packet queue
+	 *  purge
+	 */
 	unsigned long timestamp_sniffed_purge;
+
+	/**
+	 * @coding_hash: Hash table used to buffer skbs while waiting for
+	 *  another incoming skb to code it with. Skbs are added to the buffer
+	 *  just before being forwarded in routing.c
+	 */
 	struct batadv_hashtable *coding_hash;
+
+	/**
+	 * @decoding_hash: Hash table used to buffer skbs that might be needed
+	 *  to decode a received coded skb. The buffer is used for 1) skbs
+	 *  arriving on the soft-interface; 2) skbs overheard on the
+	 *  hard-interface; and 3) skbs forwarded by batman-adv.
+	 */
 	struct batadv_hashtable *decoding_hash;
 };
 
 /**
  * struct batadv_tp_unacked - unacked packet meta-information
- * @seqno: seqno of the unacked packet
- * @len: length of the packet
- * @list: list node for batadv_tp_vars::unacked_list
  *
  * This struct is supposed to represent a buffer unacked packet. However, since
  * the purpose of the TP meter is to count the traffic only, there is no need to
  * store the entire sk_buff, the starting offset and the length are enough
  */
 struct batadv_tp_unacked {
+	/** @seqno: seqno of the unacked packet */
 	u32 seqno;
+
+	/** @len: length of the packet */
 	u16 len;
+
+	/** @list: list node for batadv_tp_vars::unacked_list */
 	struct list_head list;
 };
 
 /**
  * enum batadv_tp_meter_role - Modus in tp meter session
- * @BATADV_TP_RECEIVER: Initialized as receiver
- * @BATADV_TP_SENDER: Initialized as sender
  */
 enum batadv_tp_meter_role {
+	/** @BATADV_TP_RECEIVER: Initialized as receiver */
 	BATADV_TP_RECEIVER,
+
+	/** @BATADV_TP_SENDER: Initialized as sender */
 	BATADV_TP_SENDER
 };
 
 /**
  * struct batadv_tp_vars - tp meter private variables per session
- * @list: list node for bat_priv::tp_list
- * @timer: timer for ack (receiver) and retry (sender)
- * @bat_priv: pointer to the mesh object
- * @start_time: start time in jiffies
- * @other_end: mac address of remote
- * @role: receiver/sender modi
- * @sending: sending binary semaphore: 1 if sending, 0 is not
- * @reason: reason for a stopped session
- * @finish_work: work item for the finishing procedure
- * @test_length: test length in milliseconds
- * @session: TP session identifier
- * @icmp_uid: local ICMP "socket" index
- * @dec_cwnd: decimal part of the cwnd used during linear growth
- * @cwnd: current size of the congestion window
- * @cwnd_lock: lock do protect @cwnd & @dec_cwnd
- * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the
- *  connection switches to the Congestion Avoidance state
- * @last_acked: last acked byte
- * @last_sent: last sent byte, not yet acked
- * @tot_sent: amount of data sent/ACKed so far
- * @dup_acks: duplicate ACKs counter
- * @fast_recovery: true if in Fast Recovery mode
- * @recover: last sent seqno when entering Fast Recovery
- * @rto: sender timeout
- * @srtt: smoothed RTT scaled by 2^3
- * @rttvar: RTT variation scaled by 2^2
- * @more_bytes: waiting queue anchor when waiting for more ack/retry timeout
- * @prerandom_offset: offset inside the prerandom buffer
- * @prerandom_lock: spinlock protecting access to prerandom_offset
- * @last_recv: last in-order received packet
- * @unacked_list: list of unacked packets (meta-info only)
- * @unacked_lock: protect unacked_list
- * @last_recv_time: time time (jiffies) a msg was received
- * @refcount: number of context where the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_tp_vars {
+	/** @list: list node for bat_priv::tp_list */
 	struct hlist_node list;
+
+	/** @timer: timer for ack (receiver) and retry (sender) */
 	struct timer_list timer;
+
+	/** @bat_priv: pointer to the mesh object */
 	struct batadv_priv *bat_priv;
+
+	/** @start_time: start time in jiffies */
 	unsigned long start_time;
+
+	/** @other_end: mac address of remote */
 	u8 other_end[ETH_ALEN];
+
+	/** @role: receiver/sender modi */
 	enum batadv_tp_meter_role role;
+
+	/** @sending: sending binary semaphore: 1 if sending, 0 if not */
 	atomic_t sending;
+
+	/** @reason: reason for a stopped session */
 	enum batadv_tp_meter_reason reason;
+
+	/** @finish_work: work item for the finishing procedure */
 	struct delayed_work finish_work;
+
+	/** @test_length: test length in milliseconds */
 	u32 test_length;
+
+	/** @session: TP session identifier */
 	u8 session[2];
+
+	/** @icmp_uid: local ICMP "socket" index */
 	u8 icmp_uid;
 
 	/* sender variables */
+
+	/** @dec_cwnd: decimal part of the cwnd used during linear growth */
 	u16 dec_cwnd;
+
+	/** @cwnd: current size of the congestion window */
 	u32 cwnd;
-	spinlock_t cwnd_lock; /* Protects cwnd & dec_cwnd */
+
+	/** @cwnd_lock: lock to protect @cwnd & @dec_cwnd */
+	spinlock_t cwnd_lock;
+
+	/**
+	 * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the
+	 *  connection switches to the Congestion Avoidance state
+	 */
 	u32 ss_threshold;
+
+	/** @last_acked: last acked byte */
 	atomic_t last_acked;
+
+	/** @last_sent: last sent byte, not yet acked */
 	u32 last_sent;
+
+	/** @tot_sent: amount of data sent/ACKed so far */
 	atomic64_t tot_sent;
+
+	/** @dup_acks: duplicate ACKs counter */
 	atomic_t dup_acks;
+
+	/** @fast_recovery: true if in Fast Recovery mode */
 	bool fast_recovery;
+
+	/** @recover: last sent seqno when entering Fast Recovery */
 	u32 recover;
+
+	/** @rto: sender timeout */
 	u32 rto;
+
+	/** @srtt: smoothed RTT scaled by 2^3 */
 	u32 srtt;
+
+	/** @rttvar: RTT variation scaled by 2^2 */
 	u32 rttvar;
+
+	/**
+	 * @more_bytes: waiting queue anchor when waiting for more ack/retry
+	 *  timeout
+	 */
 	wait_queue_head_t more_bytes;
+
+	/** @prerandom_offset: offset inside the prerandom buffer */
 	u32 prerandom_offset;
-	spinlock_t prerandom_lock; /* Protects prerandom_offset */
+
+	/** @prerandom_lock: spinlock protecting access to prerandom_offset */
+	spinlock_t prerandom_lock;
 
 	/* receiver variables */
+
+	/** @last_recv: last in-order received packet */
 	u32 last_recv;
+
+	/** @unacked_list: list of unacked packets (meta-info only) */
 	struct list_head unacked_list;
-	spinlock_t unacked_lock; /* Protects unacked_list */
+
+	/** @unacked_lock: protect unacked_list */
+	spinlock_t unacked_lock;
+
+	/** @last_recv_time: time (jiffies) a msg was received */
 	unsigned long last_recv_time;
+
+	/** @refcount: number of contexts where the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_softif_vlan - per VLAN attributes set
- * @bat_priv: pointer to the mesh object
- * @vid: VLAN identifier
- * @kobj: kobject for sysfs vlan subdirectory
- * @ap_isolation: AP isolation state
- * @tt: TT private attributes (VLAN specific)
- * @list: list node for bat_priv::softif_vlan_list
- * @refcount: number of context where this object is currently in use
- * @rcu: struct used for freeing in a RCU-safe manner
  */
 struct batadv_softif_vlan {
+	/** @bat_priv: pointer to the mesh object */
 	struct batadv_priv *bat_priv;
+
+	/** @vid: VLAN identifier */
 	unsigned short vid;
+
+	/** @kobj: kobject for sysfs vlan subdirectory */
 	struct kobject *kobj;
+
+	/** @ap_isolation: AP isolation state */
 	atomic_t ap_isolation;		/* boolean */
+
+	/** @tt: TT private attributes (VLAN specific) */
 	struct batadv_vlan_tt tt;
+
+	/** @list: list node for bat_priv::softif_vlan_list */
 	struct hlist_node list;
+
+	/**
+	 * @refcount: number of contexts where this object is currently in use
+	 */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in a RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data
- * @ogm_buff: buffer holding the OGM packet
- * @ogm_buff_len: length of the OGM packet buffer
- * @ogm_seqno: OGM sequence number - used to identify each OGM
- * @ogm_wq: workqueue used to schedule OGM transmissions
  */
 struct batadv_priv_bat_v {
+	/** @ogm_buff: buffer holding the OGM packet */
 	unsigned char *ogm_buff;
+
+	/** @ogm_buff_len: length of the OGM packet buffer */
 	int ogm_buff_len;
+
+	/** @ogm_seqno: OGM sequence number - used to identify each OGM */
 	atomic_t ogm_seqno;
+
+	/** @ogm_wq: workqueue used to schedule OGM transmissions */
 	struct delayed_work ogm_wq;
 };
 
 /**
  * struct batadv_priv - per mesh interface data
- * @mesh_state: current status of the mesh (inactive/active/deactivating)
- * @soft_iface: net device which holds this struct as private data
- * @bat_counters: mesh internal traffic statistic counters (see batadv_counters)
- * @aggregated_ogms: bool indicating whether OGM aggregation is enabled
- * @bonding: bool indicating whether traffic bonding is enabled
- * @fragmentation: bool indicating whether traffic fragmentation is enabled
- * @packet_size_max: max packet size that can be transmitted via
- *  multiple fragmented skbs or a single frame if fragmentation is disabled
- * @frag_seqno: incremental counter to identify chains of egress fragments
- * @bridge_loop_avoidance: bool indicating whether bridge loop avoidance is
- *  enabled
- * @distributed_arp_table: bool indicating whether distributed ARP table is
- *  enabled
- * @multicast_mode: Enable or disable multicast optimizations on this node's
- *  sender/originating side
- * @orig_interval: OGM broadcast interval in milliseconds
- * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop
- * @log_level: configured log level (see batadv_dbg_level)
- * @isolation_mark: the skb->mark value used to match packets for AP isolation
- * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used
- *  for the isolation mark
- * @bcast_seqno: last sent broadcast packet sequence number
- * @bcast_queue_left: number of remaining buffered broadcast packet slots
- * @batman_queue_left: number of remaining OGM packet slots
- * @num_ifaces: number of interfaces assigned to this mesh interface
- * @mesh_obj: kobject for sysfs mesh subdirectory
- * @debug_dir: dentry for debugfs batman-adv subdirectory
- * @forw_bat_list: list of aggregated OGMs that will be forwarded
- * @forw_bcast_list: list of broadcast packets that will be rebroadcasted
- * @tp_list: list of tp sessions
- * @tp_num: number of currently active tp sessions
- * @orig_hash: hash table containing mesh participants (orig nodes)
- * @forw_bat_list_lock: lock protecting forw_bat_list
- * @forw_bcast_list_lock: lock protecting forw_bcast_list
- * @tp_list_lock: spinlock protecting @tp_list
- * @orig_work: work queue callback item for orig node purging
- * @primary_if: one of the hard-interfaces assigned to this mesh interface
- *  becomes the primary interface
- * @algo_ops: routing algorithm used by this mesh interface
- * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top
- *  of the mesh interface represented by this object
- * @softif_vlan_list_lock: lock protecting softif_vlan_list
- * @bla: bridge loope avoidance data
- * @debug_log: holding debug logging relevant data
- * @gw: gateway data
- * @tt: translation table data
- * @tvlv: type-version-length-value data
- * @dat: distributed arp table data
- * @mcast: multicast data
- * @network_coding: bool indicating whether network coding is enabled
- * @nc: network coding data
- * @bat_v: B.A.T.M.A.N. V per soft-interface private data
  */
 struct batadv_priv {
+	/**
+	 * @mesh_state: current status of the mesh
+	 *  (inactive/active/deactivating)
+	 */
 	atomic_t mesh_state;
+
+	/** @soft_iface: net device which holds this struct as private data */
 	struct net_device *soft_iface;
+
+	/**
+	 * @bat_counters: mesh internal traffic statistic counters (see
+	 *  batadv_counters)
+	 */
 	u64 __percpu *bat_counters; /* Per cpu counters */
+
+	/**
+	 * @aggregated_ogms: bool indicating whether OGM aggregation is enabled
+	 */
 	atomic_t aggregated_ogms;
+
+	/** @bonding: bool indicating whether traffic bonding is enabled */
 	atomic_t bonding;
+
+	/**
+	 * @fragmentation: bool indicating whether traffic fragmentation is
+	 *  enabled
+	 */
 	atomic_t fragmentation;
+
+	/**
+	 * @packet_size_max: max packet size that can be transmitted via
+	 *  multiple fragmented skbs or a single frame if fragmentation is
+	 *  disabled
+	 */
 	atomic_t packet_size_max;
+
+	/**
+	 * @frag_seqno: incremental counter to identify chains of egress
+	 *  fragments
+	 */
 	atomic_t frag_seqno;
+
 #ifdef CONFIG_BATMAN_ADV_BLA
+	/**
+	 * @bridge_loop_avoidance: bool indicating whether bridge loop
+	 *  avoidance is enabled
+	 */
 	atomic_t bridge_loop_avoidance;
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_DAT
+	/**
+	 * @distributed_arp_table: bool indicating whether distributed ARP table
+	 *  is enabled
+	 */
 	atomic_t distributed_arp_table;
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_MCAST
+	/**
+	 * @multicast_mode: Enable or disable multicast optimizations on this
+	 *  node's sender/originating side
+	 */
 	atomic_t multicast_mode;
 #endif
+
+	/** @orig_interval: OGM broadcast interval in milliseconds */
 	atomic_t orig_interval;
+
+	/**
+	 * @hop_penalty: penalty which will be applied to an OGM's tq-field on
+	 *  every hop
+	 */
 	atomic_t hop_penalty;
+
 #ifdef CONFIG_BATMAN_ADV_DEBUG
+	/** @log_level: configured log level (see batadv_dbg_level) */
 	atomic_t log_level;
 #endif
+
+	/**
+	 * @isolation_mark: the skb->mark value used to match packets for AP
+	 *  isolation
+	 */
 	u32 isolation_mark;
+
+	/**
+	 * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be
+	 *  used for the isolation mark
+	 */
 	u32 isolation_mark_mask;
+
+	/** @bcast_seqno: last sent broadcast packet sequence number */
 	atomic_t bcast_seqno;
+
+	/**
+	 * @bcast_queue_left: number of remaining buffered broadcast packet
+	 *  slots
+	 */
 	atomic_t bcast_queue_left;
+
+	/** @batman_queue_left: number of remaining OGM packet slots */
 	atomic_t batman_queue_left;
+
+	/** @num_ifaces: number of interfaces assigned to this mesh interface */
 	char num_ifaces;
+
+	/** @mesh_obj: kobject for sysfs mesh subdirectory */
 	struct kobject *mesh_obj;
+
+	/** @debug_dir: dentry for debugfs batman-adv subdirectory */
 	struct dentry *debug_dir;
+
+	/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
 	struct hlist_head forw_bat_list;
+
+	/**
+	 * @forw_bcast_list: list of broadcast packets that will be
+	 *  rebroadcasted
+	 */
 	struct hlist_head forw_bcast_list;
+
+	/** @tp_list: list of tp sessions */
 	struct hlist_head tp_list;
+
+	/** @orig_hash: hash table containing mesh participants (orig nodes) */
 	struct batadv_hashtable *orig_hash;
-	spinlock_t forw_bat_list_lock; /* protects forw_bat_list */
-	spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */
-	spinlock_t tp_list_lock; /* protects tp_list */
+
+	/** @forw_bat_list_lock: lock protecting forw_bat_list */
+	spinlock_t forw_bat_list_lock;
+
+	/** @forw_bcast_list_lock: lock protecting forw_bcast_list */
+	spinlock_t forw_bcast_list_lock;
+
+	/** @tp_list_lock: spinlock protecting @tp_list */
+	spinlock_t tp_list_lock;
+
+	/** @tp_num: number of currently active tp sessions */
 	atomic_t tp_num;
+
+	/** @orig_work: work queue callback item for orig node purging */
 	struct delayed_work orig_work;
+
+	/**
+	 * @primary_if: one of the hard-interfaces assigned to this mesh
+	 *  interface becomes the primary interface
+	 */
 	struct batadv_hard_iface __rcu *primary_if;  /* rcu protected pointer */
+
+	/** @algo_ops: routing algorithm used by this mesh interface */
 	struct batadv_algo_ops *algo_ops;
+
+	/**
+	 * @softif_vlan_list: a list of softif_vlan structs, one per VLAN
+	 *  created on top of the mesh interface represented by this object
+	 */
 	struct hlist_head softif_vlan_list;
-	spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */
+
+	/** @softif_vlan_list_lock: lock protecting softif_vlan_list */
+	spinlock_t softif_vlan_list_lock;
+
 #ifdef CONFIG_BATMAN_ADV_BLA
+	/** @bla: bridge loop avoidance data */
 	struct batadv_priv_bla bla;
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_DEBUG
+	/** @debug_log: holding debug logging relevant data */
 	struct batadv_priv_debug_log *debug_log;
 #endif
+
+	/** @gw: gateway data */
 	struct batadv_priv_gw gw;
+
+	/** @tt: translation table data */
 	struct batadv_priv_tt tt;
+
+	/** @tvlv: type-version-length-value data */
 	struct batadv_priv_tvlv tvlv;
+
 #ifdef CONFIG_BATMAN_ADV_DAT
+	/** @dat: distributed arp table data */
 	struct batadv_priv_dat dat;
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_MCAST
+	/** @mcast: multicast data */
 	struct batadv_priv_mcast mcast;
 #endif
+
 #ifdef CONFIG_BATMAN_ADV_NC
+	/**
+	 * @network_coding: bool indicating whether network coding is enabled
+	 */
 	atomic_t network_coding;
+
+	/** @nc: network coding data */
 	struct batadv_priv_nc nc;
 #endif /* CONFIG_BATMAN_ADV_NC */
+
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	/** @bat_v: B.A.T.M.A.N. V per soft-interface private data */
 	struct batadv_priv_bat_v bat_v;
 #endif
 };
 
 /**
  * struct batadv_socket_client - layer2 icmp socket client data
- * @queue_list: packet queue for packets destined for this socket client
- * @queue_len: number of packets in the packet queue (queue_list)
- * @index: socket client's index in the batadv_socket_client_hash
- * @lock: lock protecting queue_list, queue_len & index
- * @queue_wait: socket client's wait queue
- * @bat_priv: pointer to soft_iface this client belongs to
  */
 struct batadv_socket_client {
+	/**
+	 * @queue_list: packet queue for packets destined for this socket client
+	 */
 	struct list_head queue_list;
+
+	/** @queue_len: number of packets in the packet queue (queue_list) */
 	unsigned int queue_len;
+
+	/** @index: socket client's index in the batadv_socket_client_hash */
 	unsigned char index;
-	spinlock_t lock; /* protects queue_list, queue_len & index */
+
+	/** @lock: lock protecting queue_list, queue_len & index */
+	spinlock_t lock;
+
+	/** @queue_wait: socket client's wait queue */
 	wait_queue_head_t queue_wait;
+
+	/** @bat_priv: pointer to soft_iface this client belongs to */
 	struct batadv_priv *bat_priv;
 };
 
 /**
  * struct batadv_socket_packet - layer2 icmp packet for socket client
- * @list: list node for batadv_socket_client::queue_list
- * @icmp_len: size of the layer2 icmp packet
- * @icmp_packet: layer2 icmp packet
  */
 struct batadv_socket_packet {
+	/** @list: list node for batadv_socket_client::queue_list */
 	struct list_head list;
+
+	/** @icmp_len: size of the layer2 icmp packet */
 	size_t icmp_len;
+
+	/** @icmp_packet: layer2 icmp packet */
 	u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
 };
 
@@ -1154,312 +1741,429 @@ struct batadv_socket_packet {
 
 /**
  * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN
- * @orig: originator address of backbone node (mac address of primary iface)
- * @vid: vlan id this gateway was detected on
- * @hash_entry: hlist node for batadv_priv_bla::backbone_hash
- * @bat_priv: pointer to soft_iface this backbone gateway belongs to
- * @lasttime: last time we heard of this backbone gw
- * @wait_periods: grace time for bridge forward delays and bla group forming at
- *  bootup phase - no bcast traffic is formwared until it has elapsed
- * @request_sent: if this bool is set to true we are out of sync with this
- *  backbone gateway - no bcast traffic is formwared until the situation was
- *  resolved
- * @crc: crc16 checksum over all claims
- * @crc_lock: lock protecting crc
- * @report_work: work struct for reporting detected loops
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_bla_backbone_gw {
+	/**
+	 * @orig: originator address of backbone node (mac address of primary
+	 *  iface)
+	 */
 	u8 orig[ETH_ALEN];
+
+	/** @vid: vlan id this gateway was detected on */
 	unsigned short vid;
+
+	/** @hash_entry: hlist node for batadv_priv_bla::backbone_hash */
 	struct hlist_node hash_entry;
+
+	/** @bat_priv: pointer to soft_iface this backbone gateway belongs to */
 	struct batadv_priv *bat_priv;
+
+	/** @lasttime: last time we heard of this backbone gw */
 	unsigned long lasttime;
+
+	/**
+	 * @wait_periods: grace time for bridge forward delays and bla group
+	 *  forming at bootup phase - no bcast traffic is forwarded until it has
+	 *  elapsed
+	 */
 	atomic_t wait_periods;
+
+	/**
+	 * @request_sent: if this bool is set to true we are out of sync with
+	 *  this backbone gateway - no bcast traffic is forwarded until the
+	 *  situation was resolved
+	 */
 	atomic_t request_sent;
+
+	/** @crc: crc16 checksum over all claims */
 	u16 crc;
-	spinlock_t crc_lock; /* protects crc */
+
+	/** @crc_lock: lock protecting crc */
+	spinlock_t crc_lock;
+
+	/** @report_work: work struct for reporting detected loops */
 	struct work_struct report_work;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_bla_claim - claimed non-mesh client structure
- * @addr: mac address of claimed non-mesh client
- * @vid: vlan id this client was detected on
- * @backbone_gw: pointer to backbone gw claiming this client
- * @backbone_lock: lock protecting backbone_gw pointer
- * @lasttime: last time we heard of claim (locals only)
- * @hash_entry: hlist node for batadv_priv_bla::claim_hash
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_bla_claim {
+	/** @addr: mac address of claimed non-mesh client */
 	u8 addr[ETH_ALEN];
+
+	/** @vid: vlan id this client was detected on */
 	unsigned short vid;
+
+	/** @backbone_gw: pointer to backbone gw claiming this client */
 	struct batadv_bla_backbone_gw *backbone_gw;
-	spinlock_t backbone_lock; /* protects backbone_gw */
+
+	/** @backbone_lock: lock protecting backbone_gw pointer */
+	spinlock_t backbone_lock;
+
+	/** @lasttime: last time we heard of claim (locals only) */
 	unsigned long lasttime;
+
+	/** @hash_entry: hlist node for batadv_priv_bla::claim_hash */
 	struct hlist_node hash_entry;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
 };
 #endif
 
 /**
  * struct batadv_tt_common_entry - tt local & tt global common data
- * @addr: mac address of non-mesh client
- * @vid: VLAN identifier
- * @hash_entry: hlist node for batadv_priv_tt::local_hash or for
- *  batadv_priv_tt::global_hash
- * @flags: various state handling flags (see batadv_tt_client_flags)
- * @added_at: timestamp used for purging stale tt common entries
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_tt_common_entry {
+	/** @addr: mac address of non-mesh client */
 	u8 addr[ETH_ALEN];
+
+	/** @vid: VLAN identifier */
 	unsigned short vid;
+
+	/**
+	 * @hash_entry: hlist node for batadv_priv_tt::local_hash or for
+	 *  batadv_priv_tt::global_hash
+	 */
 	struct hlist_node hash_entry;
+
+	/** @flags: various state handling flags (see batadv_tt_client_flags) */
 	u16 flags;
+
+	/** @added_at: timestamp used for purging stale tt common entries */
 	unsigned long added_at;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_tt_local_entry - translation table local entry data
- * @common: general translation table data
- * @last_seen: timestamp used for purging stale tt local entries
- * @vlan: soft-interface vlan of the entry
  */
 struct batadv_tt_local_entry {
+	/** @common: general translation table data */
 	struct batadv_tt_common_entry common;
+
+	/** @last_seen: timestamp used for purging stale tt local entries */
 	unsigned long last_seen;
+
+	/** @vlan: soft-interface vlan of the entry */
 	struct batadv_softif_vlan *vlan;
 };
 
 /**
  * struct batadv_tt_global_entry - translation table global entry data
- * @common: general translation table data
- * @orig_list: list of orig nodes announcing this non-mesh client
- * @orig_list_count: number of items in the orig_list
- * @list_lock: lock protecting orig_list
- * @roam_at: time at which TT_GLOBAL_ROAM was set
  */
 struct batadv_tt_global_entry {
+	/** @common: general translation table data */
 	struct batadv_tt_common_entry common;
+
+	/** @orig_list: list of orig nodes announcing this non-mesh client */
 	struct hlist_head orig_list;
+
+	/** @orig_list_count: number of items in the orig_list */
 	atomic_t orig_list_count;
-	spinlock_t list_lock;	/* protects orig_list */
+
+	/** @list_lock: lock protecting orig_list */
+	spinlock_t list_lock;
+
+	/** @roam_at: time at which TT_GLOBAL_ROAM was set */
 	unsigned long roam_at;
 };
 
 /**
  * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client
- * @orig_node: pointer to orig node announcing this non-mesh client
- * @ttvn: translation table version number which added the non-mesh client
- * @flags: per orig entry TT sync flags
- * @list: list node for batadv_tt_global_entry::orig_list
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_tt_orig_list_entry {
+	/** @orig_node: pointer to orig node announcing this non-mesh client */
 	struct batadv_orig_node *orig_node;
+
+	/**
+	 * @ttvn: translation table version number which added the non-mesh
+	 *  client
+	 */
 	u8 ttvn;
+
+	/** @flags: per orig entry TT sync flags */
 	u8 flags;
+
+	/** @list: list node for batadv_tt_global_entry::orig_list */
 	struct hlist_node list;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_tt_change_node - structure for tt changes occurred
- * @list: list node for batadv_priv_tt::changes_list
- * @change: holds the actual translation table diff data
  */
 struct batadv_tt_change_node {
+	/** @list: list node for batadv_priv_tt::changes_list */
 	struct list_head list;
+
+	/** @change: holds the actual translation table diff data */
 	struct batadv_tvlv_tt_change change;
 };
 
 /**
  * struct batadv_tt_req_node - data to keep track of the tt requests in flight
- * @addr: mac address address of the originator this request was sent to
- * @issued_at: timestamp used for purging stale tt requests
- * @refcount: number of contexts the object is used by
- * @list: list node for batadv_priv_tt::req_list
  */
 struct batadv_tt_req_node {
+	/**
+	 * @addr: mac address of the originator this request was sent to
+	 */
 	u8 addr[ETH_ALEN];
+
+	/** @issued_at: timestamp used for purging stale tt requests */
 	unsigned long issued_at;
+
+	/** @refcount: number of contexts the object is used by */
 	struct kref refcount;
+
+	/** @list: list node for batadv_priv_tt::req_list */
 	struct hlist_node list;
 };
 
 /**
  * struct batadv_tt_roam_node - roaming client data
- * @addr: mac address of the client in the roaming phase
- * @counter: number of allowed roaming events per client within a single
- *  OGM interval (changes are committed with each OGM)
- * @first_time: timestamp used for purging stale roaming node entries
- * @list: list node for batadv_priv_tt::roam_list
  */
 struct batadv_tt_roam_node {
+	/** @addr: mac address of the client in the roaming phase */
 	u8 addr[ETH_ALEN];
+
+	/**
+	 * @counter: number of allowed roaming events per client within a single
+	 * OGM interval (changes are committed with each OGM)
+	 */
 	atomic_t counter;
+
+	/**
+	 * @first_time: timestamp used for purging stale roaming node entries
+	 */
 	unsigned long first_time;
+
+	/** @list: list node for batadv_priv_tt::roam_list */
 	struct list_head list;
 };
 
 /**
  * struct batadv_nc_node - network coding node
- * @list: next and prev pointer for the list handling
- * @addr: the node's mac address
- * @refcount: number of contexts the object is used by
- * @rcu: struct used for freeing in an RCU-safe manner
- * @orig_node: pointer to corresponding orig node struct
- * @last_seen: timestamp of last ogm received from this node
  */
 struct batadv_nc_node {
+	/** @list: next and prev pointer for the list handling */
 	struct list_head list;
+
+	/** @addr: the node's mac address */
 	u8 addr[ETH_ALEN];
+
+	/** @refcount: number of contexts the object is used by */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
+
+	/** @orig_node: pointer to corresponding orig node struct */
 	struct batadv_orig_node *orig_node;
+
+	/** @last_seen: timestamp of last ogm received from this node */
 	unsigned long last_seen;
 };
 
 /**
  * struct batadv_nc_path - network coding path
- * @hash_entry: next and prev pointer for the list handling
- * @rcu: struct used for freeing in an RCU-safe manner
- * @refcount: number of contexts the object is used by
- * @packet_list: list of buffered packets for this path
- * @packet_list_lock: access lock for packet list
- * @next_hop: next hop (destination) of path
- * @prev_hop: previous hop (source) of path
- * @last_valid: timestamp for last validation of path
  */
 struct batadv_nc_path {
+	/** @hash_entry: next and prev pointer for the list handling */
 	struct hlist_node hash_entry;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
+
+	/** @refcount: number of contexts the object is used by */
 	struct kref refcount;
+
+	/** @packet_list: list of buffered packets for this path */
 	struct list_head packet_list;
-	spinlock_t packet_list_lock; /* Protects packet_list */
+
+	/** @packet_list_lock: access lock for packet list */
+	spinlock_t packet_list_lock;
+
+	/** @next_hop: next hop (destination) of path */
 	u8 next_hop[ETH_ALEN];
+
+	/** @prev_hop: previous hop (source) of path */
 	u8 prev_hop[ETH_ALEN];
+
+	/** @last_valid: timestamp for last validation of path */
 	unsigned long last_valid;
 };
 
 /**
  * struct batadv_nc_packet - network coding packet used when coding and
  *  decoding packets
- * @list: next and prev pointer for the list handling
- * @packet_id: crc32 checksum of skb data
- * @timestamp: field containing the info when the packet was added to path
- * @neigh_node: pointer to original next hop neighbor of skb
- * @skb: skb which can be encoded or used for decoding
- * @nc_path: pointer to path this nc packet is attached to
  */
 struct batadv_nc_packet {
+	/** @list: next and prev pointer for the list handling */
 	struct list_head list;
+
+	/** @packet_id: crc32 checksum of skb data */
 	__be32 packet_id;
+
+	/**
+	 * @timestamp: field containing the info when the packet was added to
+	 *  path
+	 */
 	unsigned long timestamp;
+
+	/** @neigh_node: pointer to original next hop neighbor of skb */
 	struct batadv_neigh_node *neigh_node;
+
+	/** @skb: skb which can be encoded or used for decoding */
 	struct sk_buff *skb;
+
+	/** @nc_path: pointer to path this nc packet is attached to */
 	struct batadv_nc_path *nc_path;
 };
 
 /**
  * struct batadv_skb_cb - control buffer structure used to store private data
  *  relevant to batman-adv in the skb->cb buffer in skbs.
- * @decoded: Marks a skb as decoded, which is checked when searching for coding
- *  opportunities in network-coding.c
- * @num_bcasts: Counter for broadcast packet retransmissions
  */
 struct batadv_skb_cb {
+	/**
+	 * @decoded: Marks a skb as decoded, which is checked when searching for
+	 *  coding opportunities in network-coding.c
+	 */
 	bool decoded;
+
+	/** @num_bcasts: Counter for broadcast packet retransmissions */
 	unsigned int num_bcasts;
 };
 
 /**
  * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
- * @list: list node for batadv_priv::forw_{bat,bcast}_list
- * @cleanup_list: list node for purging functions
- * @send_time: execution time for delayed_work (packet sending)
- * @own: bool for locally generated packets (local OGMs are re-scheduled after
- *  sending)
- * @skb: bcast packet's skb buffer
- * @packet_len: size of aggregated OGM packet inside the skb buffer
- * @direct_link_flags: direct link flags for aggregated OGM packets
- * @num_packets: counter for aggregated OGMv1 packets
- * @delayed_work: work queue callback item for packet sending
- * @if_incoming: pointer to incoming hard-iface or primary iface if
- *  locally generated packet
- * @if_outgoing: packet where the packet should be sent to, or NULL if
- *  unspecified
- * @queue_left: The queue (counter) this packet was applied to
  */
 struct batadv_forw_packet {
+	/** @list: list node for batadv_priv::forw_{bat,bcast}_list */
 	struct hlist_node list;
+
+	/** @cleanup_list: list node for purging functions */
 	struct hlist_node cleanup_list;
+
+	/** @send_time: execution time for delayed_work (packet sending) */
 	unsigned long send_time;
+
+	/**
+	 * @own: bool for locally generated packets (local OGMs are re-scheduled
+	 * after sending)
+	 */
 	u8 own;
+
+	/** @skb: bcast packet's skb buffer */
 	struct sk_buff *skb;
+
+	/** @packet_len: size of aggregated OGM packet inside the skb buffer */
 	u16 packet_len;
+
+	/** @direct_link_flags: direct link flags for aggregated OGM packets */
 	u32 direct_link_flags;
+
+	/** @num_packets: counter for aggregated OGMv1 packets */
 	u8 num_packets;
+
+	/** @delayed_work: work queue callback item for packet sending */
 	struct delayed_work delayed_work;
+
+	/**
+	 * @if_incoming: pointer to incoming hard-iface or primary iface if
+	 *  locally generated packet
+	 */
 	struct batadv_hard_iface *if_incoming;
+
+	/**
+	 * @if_outgoing: hard-iface the packet should be sent to, or NULL if
+	 *  unspecified
+	 */
 	struct batadv_hard_iface *if_outgoing;
+
+	/** @queue_left: The queue (counter) this packet was applied to */
 	atomic_t *queue_left;
 };
 
 /**
  * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific)
- * @activate: start routing mechanisms when hard-interface is brought up
- *  (optional)
- * @enable: init routing info when hard-interface is enabled
- * @disable: de-init routing info when hard-interface is disabled
- * @update_mac: (re-)init mac addresses of the protocol information
- *  belonging to this hard-interface
- * @primary_set: called when primary interface is selected / changed
  */
 struct batadv_algo_iface_ops {
+	/**
+	 * @activate: start routing mechanisms when hard-interface is brought up
+	 *  (optional)
+	 */
 	void (*activate)(struct batadv_hard_iface *hard_iface);
+
+	/** @enable: init routing info when hard-interface is enabled */
 	int (*enable)(struct batadv_hard_iface *hard_iface);
+
+	/** @disable: de-init routing info when hard-interface is disabled */
 	void (*disable)(struct batadv_hard_iface *hard_iface);
+
+	/**
+	 * @update_mac: (re-)init mac addresses of the protocol information
+	 *  belonging to this hard-interface
+	 */
 	void (*update_mac)(struct batadv_hard_iface *hard_iface);
+
+	/** @primary_set: called when primary interface is selected / changed */
 	void (*primary_set)(struct batadv_hard_iface *hard_iface);
 };
 
 /**
  * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific)
- * @hardif_init: called on creation of single hop entry
- *  (optional)
- * @cmp: compare the metrics of two neighbors for their respective outgoing
- *  interfaces
- * @is_similar_or_better: check if neigh1 is equally similar or better than
- *  neigh2 for their respective outgoing interface from the metric prospective
- * @print: print the single hop neighbor list (optional)
- * @dump: dump neighbors to a netlink socket (optional)
  */
 struct batadv_algo_neigh_ops {
+	/** @hardif_init: called on creation of single hop entry (optional) */
 	void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
+
+	/**
+	 * @cmp: compare the metrics of two neighbors for their respective
+	 *  outgoing interfaces
+	 */
 	int (*cmp)(struct batadv_neigh_node *neigh1,
 		   struct batadv_hard_iface *if_outgoing1,
 		   struct batadv_neigh_node *neigh2,
 		   struct batadv_hard_iface *if_outgoing2);
+
+	/**
+	 * @is_similar_or_better: check if neigh1 is equally similar or better
+	 *  than neigh2 for their respective outgoing interface from the metric
+	 *  perspective
+	 */
 	bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1,
 				     struct batadv_hard_iface *if_outgoing1,
 				     struct batadv_neigh_node *neigh2,
 				     struct batadv_hard_iface *if_outgoing2);
+
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+	/** @print: print the single hop neighbor list (optional) */
 	void (*print)(struct batadv_priv *priv, struct seq_file *seq);
 #endif
+
+	/** @dump: dump neighbors to a netlink socket (optional) */
 	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
 		     struct batadv_priv *priv,
 		     struct batadv_hard_iface *hard_iface);
@@ -1467,24 +2171,36 @@ struct batadv_algo_neigh_ops {
 
 /**
  * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
- * @free: free the resources allocated by the routing algorithm for an orig_node
- *  object (optional)
- * @add_if: ask the routing algorithm to apply the needed changes to the
- *  orig_node due to a new hard-interface being added into the mesh (optional)
- * @del_if: ask the routing algorithm to apply the needed changes to the
- *  orig_node due to an hard-interface being removed from the mesh (optional)
- * @print: print the originator table (optional)
- * @dump: dump originators to a netlink socket (optional)
  */
 struct batadv_algo_orig_ops {
+	/**
+	 * @free: free the resources allocated by the routing algorithm for an
+	 *  orig_node object (optional)
+	 */
 	void (*free)(struct batadv_orig_node *orig_node);
+
+	/**
+	 * @add_if: ask the routing algorithm to apply the needed changes to the
+	 *  orig_node due to a new hard-interface being added into the mesh
+	 *  (optional)
+	 */
 	int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num);
+
+	/**
+	 * @del_if: ask the routing algorithm to apply the needed changes to the
+	 *  orig_node due to a hard-interface being removed from the mesh
+	 *  (optional)
+	 */
 	int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num,
 		      int del_if_num);
+
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+	/** @print: print the originator table (optional) */
 	void (*print)(struct batadv_priv *priv, struct seq_file *seq,
 		      struct batadv_hard_iface *hard_iface);
 #endif
+
+	/** @dump: dump originators to a netlink socket (optional) */
 	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
 		     struct batadv_priv *priv,
 		     struct batadv_hard_iface *hard_iface);
@@ -1492,158 +2208,213 @@ struct batadv_algo_orig_ops {
 
 /**
  * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
- * @init_sel_class: initialize GW selection class (optional)
- * @store_sel_class: parse and stores a new GW selection class (optional)
- * @show_sel_class: prints the current GW selection class (optional)
- * @get_best_gw_node: select the best GW from the list of available nodes
- *  (optional)
- * @is_eligible: check if a newly discovered GW is a potential candidate for
- *  the election as best GW (optional)
- * @print: print the gateway table (optional)
- * @dump: dump gateways to a netlink socket (optional)
  */
 struct batadv_algo_gw_ops {
+	/** @init_sel_class: initialize GW selection class (optional) */
 	void (*init_sel_class)(struct batadv_priv *bat_priv);
+
+	/**
+	 * @store_sel_class: parse and stores a new GW selection class
+	 *  (optional)
+	 */
 	ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
 				   size_t count);
+
+	/** @show_sel_class: prints the current GW selection class (optional) */
 	ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
+
+	/**
+	 * @get_best_gw_node: select the best GW from the list of available
+	 *  nodes (optional)
+	 */
 	struct batadv_gw_node *(*get_best_gw_node)
 		(struct batadv_priv *bat_priv);
+
+	/**
+	 * @is_eligible: check if a newly discovered GW is a potential candidate
+	 *  for the election as best GW (optional)
+	 */
 	bool (*is_eligible)(struct batadv_priv *bat_priv,
 			    struct batadv_orig_node *curr_gw_orig,
 			    struct batadv_orig_node *orig_node);
+
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+	/** @print: print the gateway table (optional) */
 	void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
 #endif
+
+	/** @dump: dump gateways to a netlink socket (optional) */
 	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
 		     struct batadv_priv *priv);
 };
 
 /**
  * struct batadv_algo_ops - mesh algorithm callbacks
- * @list: list node for the batadv_algo_list
- * @name: name of the algorithm
- * @iface: callbacks related to interface handling
- * @neigh: callbacks related to neighbors handling
- * @orig: callbacks related to originators handling
- * @gw: callbacks related to GW mode
  */
 struct batadv_algo_ops {
+	/** @list: list node for the batadv_algo_list */
 	struct hlist_node list;
+
+	/** @name: name of the algorithm */
 	char *name;
+
+	/** @iface: callbacks related to interface handling */
 	struct batadv_algo_iface_ops iface;
+
+	/** @neigh: callbacks related to neighbors handling */
 	struct batadv_algo_neigh_ops neigh;
+
+	/** @orig: callbacks related to originators handling */
 	struct batadv_algo_orig_ops orig;
+
+	/** @gw: callbacks related to GW mode */
 	struct batadv_algo_gw_ops gw;
 };
 
 /**
  * struct batadv_dat_entry - it is a single entry of batman-adv ARP backend. It
  * is used to stored ARP entries needed for the global DAT cache
- * @ip: the IPv4 corresponding to this DAT/ARP entry
- * @mac_addr: the MAC address associated to the stored IPv4
- * @vid: the vlan ID associated to this entry
- * @last_update: time in jiffies when this entry was refreshed last time
- * @hash_entry: hlist node for batadv_priv_dat::hash
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_dat_entry {
+	/** @ip: the IPv4 corresponding to this DAT/ARP entry */
 	__be32 ip;
+
+	/** @mac_addr: the MAC address associated to the stored IPv4 */
 	u8 mac_addr[ETH_ALEN];
+
+	/** @vid: the vlan ID associated to this entry */
 	unsigned short vid;
+
+	/**
+	 * @last_update: time in jiffies when this entry was refreshed last time
+	 */
 	unsigned long last_update;
+
+	/** @hash_entry: hlist node for batadv_priv_dat::hash */
 	struct hlist_node hash_entry;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * struct batadv_hw_addr - a list entry for a MAC address
- * @list: list node for the linking of entries
- * @addr: the MAC address of this list entry
  */
 struct batadv_hw_addr {
+	/** @list: list node for the linking of entries */
 	struct hlist_node list;
+
+	/** @addr: the MAC address of this list entry */
 	unsigned char addr[ETH_ALEN];
 };
 
 /**
  * struct batadv_dat_candidate - candidate destination for DAT operations
- * @type: the type of the selected candidate. It can one of the following:
- *	  - BATADV_DAT_CANDIDATE_NOT_FOUND
- *	  - BATADV_DAT_CANDIDATE_ORIG
- * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to the
- *	       corresponding originator node structure
  */
 struct batadv_dat_candidate {
+	/**
+	 * @type: the type of the selected candidate. It can be one of the
+	 *  following:
+	 *	  - BATADV_DAT_CANDIDATE_NOT_FOUND
+	 *	  - BATADV_DAT_CANDIDATE_ORIG
+	 */
 	int type;
+
+	/**
+	 * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to
+	 * the corresponding originator node structure
+	 */
 	struct batadv_orig_node *orig_node;
 };
 
 /**
  * struct batadv_tvlv_container - container for tvlv appended to OGMs
- * @list: hlist node for batadv_priv_tvlv::container_list
- * @tvlv_hdr: tvlv header information needed to construct the tvlv
- * @refcount: number of contexts the object is used
  */
 struct batadv_tvlv_container {
+	/** @list: hlist node for batadv_priv_tvlv::container_list */
 	struct hlist_node list;
+
+	/** @tvlv_hdr: tvlv header information needed to construct the tvlv */
 	struct batadv_tvlv_hdr tvlv_hdr;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
 };
 
 /**
  * struct batadv_tvlv_handler - handler for specific tvlv type and version
- * @list: hlist node for batadv_priv_tvlv::handler_list
- * @ogm_handler: handler callback which is given the tvlv payload to process on
- *  incoming OGM packets
- * @unicast_handler: handler callback which is given the tvlv payload to process
- *  on incoming unicast tvlv packets
- * @type: tvlv type this handler feels responsible for
- * @version: tvlv version this handler feels responsible for
- * @flags: tvlv handler flags
- * @refcount: number of contexts the object is used
- * @rcu: struct used for freeing in an RCU-safe manner
  */
 struct batadv_tvlv_handler {
+	/** @list: hlist node for batadv_priv_tvlv::handler_list */
 	struct hlist_node list;
+
+	/**
+	 * @ogm_handler: handler callback which is given the tvlv payload to
+	 *  process on incoming OGM packets
+	 */
 	void (*ogm_handler)(struct batadv_priv *bat_priv,
 			    struct batadv_orig_node *orig,
 			    u8 flags, void *tvlv_value, u16 tvlv_value_len);
+
+	/**
+	 * @unicast_handler: handler callback which is given the tvlv payload to
+	 *  process on incoming unicast tvlv packets
+	 */
 	int (*unicast_handler)(struct batadv_priv *bat_priv,
 			       u8 *src, u8 *dst,
 			       void *tvlv_value, u16 tvlv_value_len);
+
+	/** @type: tvlv type this handler feels responsible for */
 	u8 type;
+
+	/** @version: tvlv version this handler feels responsible for */
 	u8 version;
+
+	/** @flags: tvlv handler flags */
 	u8 flags;
+
+	/** @refcount: number of contexts the object is used */
 	struct kref refcount;
+
+	/** @rcu: struct used for freeing in an RCU-safe manner */
 	struct rcu_head rcu;
 };
 
 /**
  * enum batadv_tvlv_handler_flags - tvlv handler flags definitions
- * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function will call
- *  this handler even if its type was not found (with no data)
- * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the API marks
- *  a handler as being called, so it won't be called if the
- *  BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set
  */
 enum batadv_tvlv_handler_flags {
+	/**
+	 * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function
+	 *  will call this handler even if its type was not found (with no data)
+	 */
 	BATADV_TVLV_HANDLER_OGM_CIFNOTFND = BIT(1),
+
+	/**
+	 * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the
+	 *  API marks a handler as being called, so it won't be called if the
+	 *  BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set
+	 */
 	BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
 };
 
 /**
  * struct batadv_store_mesh_work - Work queue item to detach add/del interface
  *  from sysfs locks
- * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
- * @soft_iface_name: name of soft-interface to modify
- * @work: work queue item
  */
 struct batadv_store_mesh_work {
+	/**
+	 * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
+	 */
 	struct net_device *net_dev;
+
+	/** @soft_iface_name: name of soft-interface to modify */
 	char soft_iface_name[IFNAMSIZ];
+
+	/** @work: work queue item */
 	struct work_struct work;
 };
 
-- 
cgit v1.2.3


From a07369d7fb0df37c61f271280b7497a882a3d291 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:49 +0100
Subject: batman-adv: Fix kernel-doc references to struct members

The correct syntax to create references in kernel-doc to a struct member is
not "struct_name::member" but "&struct_name->member" or
"&struct_name.member". The correct syntax is required to get the correct
cross-referencing in the reStructuredText text output.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/types.h | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index c39f879d7dde..77b145eba193 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -234,7 +234,7 @@ struct batadv_hard_iface {
  * struct batadv_orig_ifinfo - originator info per outgoing interface
  */
 struct batadv_orig_ifinfo {
-	/** @list: list node for orig_node::ifinfo_list */
+	/** @list: list node for &batadv_orig_node.ifinfo_list */
 	struct hlist_node list;
 
 	/** @if_outgoing: pointer to outgoing hard-interface */
@@ -320,7 +320,7 @@ struct batadv_orig_node_vlan {
 	/** @tt: VLAN specific TT attributes */
 	struct batadv_vlan_tt tt;
 
-	/** @list: list node for orig_node::vlan_list */
+	/** @list: list node for &batadv_orig_node.vlan_list */
 	struct hlist_node list;
 
 	/**
@@ -467,7 +467,7 @@ struct batadv_orig_node {
 	 */
 	spinlock_t neigh_list_lock;
 
-	/** @hash_entry: hlist node for batadv_priv::orig_hash */
+	/** @hash_entry: hlist node for &batadv_priv.orig_hash */
 	struct hlist_node hash_entry;
 
 	/** @bat_priv: pointer to soft_iface this orig node belongs to */
@@ -539,7 +539,7 @@ enum batadv_orig_capabilities {
  * struct batadv_gw_node - structure for orig nodes announcing gw capabilities
  */
 struct batadv_gw_node {
-	/** @list: list node for batadv_priv_gw::list */
+	/** @list: list node for &batadv_priv_gw.list */
 	struct hlist_node list;
 
 	/** @orig_node: pointer to corresponding orig node */
@@ -588,7 +588,7 @@ struct batadv_hardif_neigh_node_bat_v {
  * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
  */
 struct batadv_hardif_neigh_node {
-	/** @list: list node for batadv_hard_iface::neigh_list */
+	/** @list: list node for &batadv_hard_iface.neigh_list */
 	struct hlist_node list;
 
 	/** @addr: the MAC address of the neighboring interface */
@@ -621,7 +621,7 @@ struct batadv_hardif_neigh_node {
  * struct batadv_neigh_node - structure for single hops neighbors
  */
 struct batadv_neigh_node {
-	/** @list: list node for batadv_orig_node::neigh_list */
+	/** @list: list node for &batadv_orig_node.neigh_list */
 	struct hlist_node list;
 
 	/** @orig_node: pointer to corresponding orig_node */
@@ -697,7 +697,7 @@ struct batadv_neigh_ifinfo_bat_v {
  * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
  */
 struct batadv_neigh_ifinfo {
-	/** @list: list node for batadv_neigh_node::ifinfo_list */
+	/** @list: list node for &batadv_neigh_node.ifinfo_list */
 	struct hlist_node list;
 
 	/** @if_outgoing: pointer to outgoing hard-interface */
@@ -1308,7 +1308,7 @@ struct batadv_tp_unacked {
 	/** @len: length of the packet */
 	u16 len;
 
-	/** @list: list node for batadv_tp_vars::unacked_list */
+	/** @list: list node for &batadv_tp_vars.unacked_list */
 	struct list_head list;
 };
 
@@ -1327,7 +1327,7 @@ enum batadv_tp_meter_role {
  * struct batadv_tp_vars - tp meter private variables per session
  */
 struct batadv_tp_vars {
-	/** @list: list node for bat_priv::tp_list */
+	/** @list: list node for &bat_priv.tp_list */
 	struct hlist_node list;
 
 	/** @timer: timer for ack (receiver) and retry (sender) */
@@ -1459,7 +1459,7 @@ struct batadv_softif_vlan {
 	/** @tt: TT private attributes (VLAN specific) */
 	struct batadv_vlan_tt tt;
 
-	/** @list: list node for bat_priv::softif_vlan_list */
+	/** @list: list node for &bat_priv.softif_vlan_list */
 	struct hlist_node list;
 
 	/**
@@ -1727,7 +1727,7 @@ struct batadv_socket_client {
  * struct batadv_socket_packet - layer2 icmp packet for socket client
  */
 struct batadv_socket_packet {
-	/** @list: list node for batadv_socket_client::queue_list */
+	/** @list: list node for &batadv_socket_client.queue_list */
 	struct list_head list;
 
 	/** @icmp_len: size of the layer2 icmp packet */
@@ -1752,7 +1752,7 @@ struct batadv_bla_backbone_gw {
 	/** @vid: vlan id this gateway was detected on */
 	unsigned short vid;
 
-	/** @hash_entry: hlist node for batadv_priv_bla::backbone_hash */
+	/** @hash_entry: hlist node for &batadv_priv_bla.backbone_hash */
 	struct hlist_node hash_entry;
 
 	/** @bat_priv: pointer to soft_iface this backbone gateway belongs to */
@@ -1810,7 +1810,7 @@ struct batadv_bla_claim {
 	/** @lasttime: last time we heard of claim (locals only) */
 	unsigned long lasttime;
 
-	/** @hash_entry: hlist node for batadv_priv_bla::claim_hash */
+	/** @hash_entry: hlist node for &batadv_priv_bla.claim_hash */
 	struct hlist_node hash_entry;
 
 	/** @refcount: number of contexts the object is used */
@@ -1832,8 +1832,8 @@ struct batadv_tt_common_entry {
 	unsigned short vid;
 
 	/**
-	 * @hash_entry: hlist node for batadv_priv_tt::local_hash or for
-	 *  batadv_priv_tt::global_hash
+	 * @hash_entry: hlist node for &batadv_priv_tt.local_hash or for
+	 *  &batadv_priv_tt.global_hash
 	 */
 	struct hlist_node hash_entry;
 
@@ -1900,7 +1900,7 @@ struct batadv_tt_orig_list_entry {
 	/** @flags: per orig entry TT sync flags */
 	u8 flags;
 
-	/** @list: list node for batadv_tt_global_entry::orig_list */
+	/** @list: list node for &batadv_tt_global_entry.orig_list */
 	struct hlist_node list;
 
 	/** @refcount: number of contexts the object is used */
@@ -1914,7 +1914,7 @@ struct batadv_tt_orig_list_entry {
  * struct batadv_tt_change_node - structure for tt changes occurred
  */
 struct batadv_tt_change_node {
-	/** @list: list node for batadv_priv_tt::changes_list */
+	/** @list: list node for &batadv_priv_tt.changes_list */
 	struct list_head list;
 
 	/** @change: holds the actual translation table diff data */
@@ -1936,7 +1936,7 @@ struct batadv_tt_req_node {
 	/** @refcount: number of contexts the object is used by */
 	struct kref refcount;
 
-	/** @list: list node for batadv_priv_tt::req_list */
+	/** @list: list node for &batadv_priv_tt.req_list */
 	struct hlist_node list;
 };
 
@@ -1958,7 +1958,7 @@ struct batadv_tt_roam_node {
 	 */
 	unsigned long first_time;
 
-	/** @list: list node for batadv_priv_tt::roam_list */
+	/** @list: list node for &batadv_priv_tt.roam_list */
 	struct list_head list;
 };
 
@@ -2060,7 +2060,10 @@ struct batadv_skb_cb {
  * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
  */
 struct batadv_forw_packet {
-	/** @list: list node for batadv_priv::forw_{bat,bcast}_list */
+	/**
+	 * @list: list node for &batadv_priv.forw.bcast_list and
+	 *  &batadv_priv.forw.bat_list
+	 */
 	struct hlist_node list;
 
 	/** @cleanup_list: list node for purging functions */
@@ -2290,7 +2293,7 @@ struct batadv_dat_entry {
 	 */
 	unsigned long last_update;
 
-	/** @hash_entry: hlist node for batadv_priv_dat::hash */
+	/** @hash_entry: hlist node for &batadv_priv_dat.hash */
 	struct hlist_node hash_entry;
 
 	/** @refcount: number of contexts the object is used */
@@ -2334,7 +2337,7 @@ struct batadv_dat_candidate {
  * struct batadv_tvlv_container - container for tvlv appended to OGMs
  */
 struct batadv_tvlv_container {
-	/** @list: hlist node for batadv_priv_tvlv::container_list */
+	/** @list: hlist node for &batadv_priv_tvlv.container_list */
 	struct hlist_node list;
 
 	/** @tvlv_hdr: tvlv header information needed to construct the tvlv */
@@ -2348,7 +2351,7 @@ struct batadv_tvlv_container {
  * struct batadv_tvlv_handler - handler for specific tvlv type and version
  */
 struct batadv_tvlv_handler {
-	/** @list: hlist node for batadv_priv_tvlv::handler_list */
+	/** @list: hlist node for &batadv_priv_tvlv.handler_list */
 	struct hlist_node list;
 
 	/**
-- 
cgit v1.2.3


From c93effcf721ee0a171457fd2ef63367516e45d46 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:50 +0100
Subject: batman-adv: Add kernel-doc to structs in headers

All structs in types.h are already documented. But some other headers
still have private structs which also should be documented.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/hash.h  | 14 +++++++++++---
 net/batman-adv/sysfs.h | 13 +++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index c92fde593959..65396b126f3b 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -46,10 +46,18 @@ typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *,
 typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32);
 typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
 
+/**
+ * struct batadv_hashtable - Wrapper of simple hlist based hashtable
+ */
 struct batadv_hashtable {
-	struct hlist_head *table;   /* the hashtable itself with the buckets */
-	spinlock_t *list_locks;     /* spinlock for each hash list entry */
-	u32 size;		    /* size of hashtable */
+	/** @table: the hashtable itself with the buckets */
+	struct hlist_head *table;
+
+	/** @list_locks: spinlock for each hash list entry */
+	spinlock_t *list_locks;
+
+	/** @size: size of hashtable */
+	u32 size;
 };
 
 /* allocates and clears the hash */
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index 0384cb6c406b..bbeee61221fa 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -36,10 +36,23 @@ struct net_device;
  */
 #define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan"
 
+/**
+ * struct batadv_attribute - sysfs export helper for batman-adv attributes
+ */
 struct batadv_attribute {
+	/** @attr: sysfs attribute file */
 	struct attribute attr;
+
+	/**
+	 * @show: function to export the current attribute's content to sysfs
+	 */
 	ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
 			char *buf);
+
+	/**
+	 * @store: function to load new value from character buffer and save it
+	 * in batman-adv attribute
+	 */
 	ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
 			 char *buf, size_t count);
 };
-- 
cgit v1.2.3


From 73844a8c78cc975ac43fec05f7c90417f5f99742 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:51 +0100
Subject: batman-adv: Add kernel-doc to enums in headers

All enums in types.h are already documented. But some other headers
still have private enums which also should be documented.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/hard-interface.h | 23 +++++++++++++++++++++++
 net/batman-adv/main.h           | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index fb7a5d6b5ce3..1e61aacac539 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -31,12 +31,35 @@
 struct net_device;
 struct net;
 
+/**
+ * enum batadv_hard_if_state - State of a hard interface
+ */
 enum batadv_hard_if_state {
+	/**
+	 * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a
+	 * batman-adv soft interface
+	 */
 	BATADV_IF_NOT_IN_USE,
+
+	/**
+	 * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft
+	 * interface
+	 */
 	BATADV_IF_TO_BE_REMOVED,
+
+	/** @BATADV_IF_INACTIVE: interface is deactivated */
 	BATADV_IF_INACTIVE,
+
+	/** @BATADV_IF_ACTIVE: interface is used */
 	BATADV_IF_ACTIVE,
+
+	/** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */
 	BATADV_IF_TO_BE_ACTIVATED,
+
+	/**
+	 * @BATADV_IF_I_WANT_YOU: interface is queued up (using sysfs) for being
+	 * added as slave interface of a batman-adv soft interface
+	 */
 	BATADV_IF_I_WANT_YOU,
 };
 
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 7f6a3123e1a4..633e6e41ba14 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -141,24 +141,56 @@
  */
 #define BATADV_TP_MAX_NUM 5
 
+/**
+ * enum batadv_mesh_state - State of a soft interface
+ */
 enum batadv_mesh_state {
+	/** @BATADV_MESH_INACTIVE: soft interface is not yet running */
 	BATADV_MESH_INACTIVE,
+
+	/** @BATADV_MESH_ACTIVE: interface is up and running */
 	BATADV_MESH_ACTIVE,
+
+	/** @BATADV_MESH_DEACTIVATING: interface is getting shut down */
 	BATADV_MESH_DEACTIVATING,
 };
 
 #define BATADV_BCAST_QUEUE_LEN		256
 #define BATADV_BATMAN_QUEUE_LEN	256
 
+/**
+ * enum batadv_uev_action - action type of uevent
+ */
 enum batadv_uev_action {
+	/** @BATADV_UEV_ADD: gateway was selected (after none was selected) */
 	BATADV_UEV_ADD = 0,
+
+	/**
+	 * @BATADV_UEV_DEL: selected gateway was removed and none is selected
+	 * anymore
+	 */
 	BATADV_UEV_DEL,
+
+	/**
+	 * @BATADV_UEV_CHANGE: a different gateway was selected as best gateway
+	 */
 	BATADV_UEV_CHANGE,
+
+	/**
+	 * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by
+	 * bridge loop avoidance
+	 */
 	BATADV_UEV_LOOPDETECT,
 };
 
+/**
+ * enum batadv_uev_type - Type of uevent
+ */
 enum batadv_uev_type {
+	/** @BATADV_UEV_GW: selected gateway was modified */
 	BATADV_UEV_GW = 0,
+
+	/** @BATADV_UEV_BLA: bridge loop avoidance event */
 	BATADV_UEV_BLA,
 };
 
-- 
cgit v1.2.3


From e57acf8e93fb65715af7595066d99d4c0c3f0235 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:52 +0100
Subject: batman-adv: Add kernel-doc to functions in headers

Externally visible functions should be documented with kernel-doc. This
usually refers to non-static functions but also static inline functions in
headers are visible in other files and should therefore be documented.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bitarray.h       |  7 +++-
 net/batman-adv/hard-interface.h |  6 +++
 net/batman-adv/hash.h           | 11 +++++-
 net/batman-adv/log.h            | 35 +++++++++++++++++-
 net/batman-adv/main.h           | 82 ++++++++++++++++++++++++++++++++++-------
 net/batman-adv/originator.h     |  9 ++++-
 6 files changed, 131 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0508353fa28d..ca9d0753dd6b 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -47,7 +47,12 @@ static inline bool batadv_test_bit(const unsigned long *seq_bits,
 	return test_bit(diff, seq_bits) != 0;
 }
 
-/* turn corresponding bit on, so we can remember that we got the packet */
+/**
+ * batadv_set_bit() - Turn corresponding bit on, so we can remember that we got
+ *  the packet
+ * @seq_bits: bitmap of the packet receive window
+ * @n: relative sequence number of newly received packet
+ */
 static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
 {
 	/* if too old, just drop it */
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 1e61aacac539..de5e9a374ece 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -130,6 +130,12 @@ static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
 	kref_put(&hard_iface->refcount, batadv_hardif_release);
 }
 
+/**
+ * batadv_primary_if_get_selected() - Get reference to primary interface
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: primary interface (with increased refcnt), otherwise NULL
+ */
 static inline struct batadv_hard_iface *
 batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
 {
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 65396b126f3b..4ce1b6d3ad5c 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -121,8 +121,15 @@ out:
 	return ret;
 }
 
-/* removes data from hash, if found. data could be the structure you use with
- * just the key filled, we just need the key for comparing.
+/**
+ * batadv_hash_remove() - Removes data from hash, if found
+ * @hash: hash table
+ * @compare: callback to determine if 2 hash elements are identical
+ * @choose: callback calculating the hash index
+ * @data: data passed to the aforementioned callbacks as argument
+ *
+ * Data could be the structure you use with just the key filled, we just need
+ * the key for comparing.
  *
  * Return: returns pointer do data on success, so you can remove the used
  * structure yourself, or NULL on error
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index dd22e17b84b4..35e02b2b9e72 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -79,7 +79,14 @@ enum batadv_dbg_level {
 int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
 __printf(2, 3);
 
-/* possibly ratelimited debug output */
+/**
+ * _batadv_dbg() - Store debug output with(out) ratelimiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ratelimited: whether output should be rate limited
+ * @fmt: format string
+ * @arg...: variable arguments
+ */
 #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...)		\
 	do {								\
 		struct batadv_priv *__batpriv = (bat_priv);		\
@@ -98,11 +105,30 @@ static inline void _batadv_dbg(int type __always_unused,
 }
 #endif
 
+/**
+ * batadv_dbg() - Store debug output without ratelimiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @arg...: format string and variable arguments
+ */
 #define batadv_dbg(type, bat_priv, arg...) \
 	_batadv_dbg(type, bat_priv, 0, ## arg)
+
+/**
+ * batadv_dbg_ratelimited() - Store debug output with ratelimiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @arg...: format string and variable arguments
+ */
 #define batadv_dbg_ratelimited(type, bat_priv, arg...) \
 	_batadv_dbg(type, bat_priv, 1, ## arg)
 
+/**
+ * batadv_info() - Store message in debug buffer and print it to kmsg buffer
+ * @net_dev: the soft interface net device
+ * @fmt: format string
+ * @arg...: variable arguments
+ */
 #define batadv_info(net_dev, fmt, arg...)				\
 	do {								\
 		struct net_device *_netdev = (net_dev);                 \
@@ -110,6 +136,13 @@ static inline void _batadv_dbg(int type __always_unused,
 		batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg);	\
 		pr_info("%s: " fmt, _netdev->name, ## arg);		\
 	} while (0)
+
+/**
+ * batadv_err() - Store error in debug buffer and print it to kmsg buffer
+ * @net_dev: the soft interface net device
+ * @fmt: format string
+ * @arg...: variable arguments
+ */
 #define batadv_err(net_dev, fmt, arg...)				\
 	do {								\
 		struct net_device *_netdev = (net_dev);                 \
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 633e6e41ba14..5ac86df48c42 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -298,40 +298,96 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
 	return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout));
 }
 
+/**
+ * batadv_atomic_dec_not_zero() - Decrease unless the number is 0
+ * @v: pointer of type atomic_t
+ *
+ * Return: non-zero if v was not 0, and zero otherwise.
+ */
 #define batadv_atomic_dec_not_zero(v)	atomic_add_unless((v), -1, 0)
 
-/* Returns the smallest signed integer in two's complement with the sizeof x */
+/**
+ * batadv_smallest_signed_int() - Returns the smallest signed integer in two's
+ *  complement with the sizeof x
+ * @x: type of integer
+ *
+ * Return: smallest signed integer of type
+ */
 #define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u)))
 
-/* Checks if a sequence number x is a predecessor/successor of y.
- * they handle overflows/underflows and can correctly check for a
- * predecessor/successor unless the variable sequence number has grown by
- * more then 2**(bitwidth(x)-1)-1.
+/**
+ * batadv_seq_before() - Checks if a sequence number x is a predecessor of y
+ * @x: potential predecessor of @y
+ * @y: value to compare @x against
+ *
+ * It handles overflows/underflows and can correctly check for a predecessor
+ * unless the variable sequence number has grown by more than
+ * 2**(bitwidth(x)-1)-1.
+ *
  * This means that for a u8 with the maximum value 255, it would think:
- *  - when adding nothing - it is neither a predecessor nor a successor
- *  - before adding more than 127 to the starting value - it is a predecessor,
- *  - when adding 128 - it is neither a predecessor nor a successor,
- *  - after adding more than 127 to the starting value - it is a successor
+ *
+ * * when adding nothing - it is neither a predecessor nor a successor
+ * * before adding more than 127 to the starting value - it is a predecessor,
+ * * when adding 128 - it is neither a predecessor nor a successor,
+ * * after adding more than 127 to the starting value - it is a successor
+ *
+ * Return: true when x is a predecessor of y, false otherwise
  */
 #define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \
 				 typeof(y)_d2 = (y); \
 				 typeof(x)_dummy = (_d1 - _d2); \
 				 (void)(&_d1 == &_d2); \
 				 _dummy > batadv_smallest_signed_int(_dummy); })
+
+/**
+ * batadv_seq_after() - Checks if a sequence number x is a successor of y
+ * @x: potential sucessor of @y
+ * @y: value to compare @x against
+ *
+ * It handles overflows/underflows and can correctly check for a successor
+ * unless the variable sequence number has grown by more than
+ * 2**(bitwidth(x)-1)-1.
+ *
+ * This means that for a u8 with the maximum value 255, it would think:
+ *
+ * * when adding nothing - it is neither a predecessor nor a successor
+ * * before adding more than 127 to the starting value - it is a predecessor,
+ * * when adding 128 - it is neither a predecessor nor a successor,
+ * * after adding more than 127 to the starting value - it is a successor
+ *
+ * Return: true when x is a successor of y, false otherwise
+ */
 #define batadv_seq_after(x, y) batadv_seq_before(y, x)
 
-/* Stop preemption on local cpu while incrementing the counter */
+/**
+ * batadv_add_counter() - Add to per cpu statistics counter of soft interface
+ * @bat_priv: the bat priv with all the soft interface information
+ * @idx: counter index which should be modified
+ * @count: value to increase counter by
+ *
+ * Stop preemption on local cpu while incrementing the counter
+ */
 static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
 				      size_t count)
 {
 	this_cpu_add(bat_priv->bat_counters[idx], count);
 }
 
+/**
+ * batadv_inc_counter() - Increase per cpu statistics counter of soft interface
+ * @b: the bat priv with all the soft interface information
+ * @i: counter index which should be modified
+ */
 #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
 
-/* Define a macro to reach the control buffer of the skb. The members of the
- * control buffer are defined in struct batadv_skb_cb in types.h.
- * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
+/**
+ * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer
+ * @__skb: skb holding the control buffer
+ *
+ * The members of the control buffer are defined in struct batadv_skb_cb in
+ * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
+ *
+ * Return: pointer to the batadv_skb_cb of the skb
  */
 #define BATADV_SKB_CB(__skb)       ((struct batadv_skb_cb *)&((__skb)->cb[0]))
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index b5d2164532c9..8e543a3cdc6c 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -84,8 +84,13 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
 			  unsigned short vid);
 void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan);
 
-/* hashfunction to choose an entry in a hash table of given size
- * hash algorithm from http://en.wikipedia.org/wiki/Hash_table
+/**
+ * batadv_choose_orig() - Return the index of the orig entry in the hash table
+ * @data: mac address of the originator node
+ * @size: the size of the hash table
+ *
+ * Return: the hash index where the object represented by @data should be
+ * stored at.
  */
 static inline u32 batadv_choose_orig(const void *data, u32 size)
 {
-- 
cgit v1.2.3


From ff15c27c97303fbe5abc49c25c73ea299ab72d31 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Dec 2017 19:51:53 +0100
Subject: batman-adv: Add kernel-doc to externally visible functions

According to the kernel-doc documentation, externally visible functions
should be documented. This refers to all non-static functions which can
(and will) be used by functions in other source files.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_algo.c          | 27 ++++++++++++++++++++++++
 net/batman-adv/bat_iv_ogm.c        |  5 +++++
 net/batman-adv/debugfs.c           | 16 ++++++++++++++
 net/batman-adv/gateway_client.c    | 38 +++++++++++++++++++++++++++++++++
 net/batman-adv/gateway_common.c    |  9 ++++++++
 net/batman-adv/hard-interface.c    | 36 ++++++++++++++++++++++++++++++-
 net/batman-adv/hash.c              | 17 +++++++++++++--
 net/batman-adv/icmp_socket.c       |  9 ++++++++
 net/batman-adv/log.c               | 17 +++++++++++++++
 net/batman-adv/main.c              | 31 +++++++++++++++++++++++++++
 net/batman-adv/originator.c        | 43 ++++++++++++++++++++++++++++++++++++++
 net/batman-adv/routing.c           | 29 +++++++++++++++++++++++++
 net/batman-adv/send.c              | 18 ++++++++++++++++
 net/batman-adv/soft-interface.c    | 20 ++++++++++++++++++
 net/batman-adv/sysfs.c             | 32 ++++++++++++++++++++++++++++
 net/batman-adv/translation-table.c | 37 ++++++++++++++++++++++++++++++++
 16 files changed, 381 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index aed7ced059df..80c72c7d3cad 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -61,6 +61,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name)
 	return bat_algo_ops;
 }
 
+/**
+ * batadv_algo_register() - Register callbacks for a mesh algorithm
+ * @bat_algo_ops: mesh algorithm callbacks to add
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
 {
 	struct batadv_algo_ops *bat_algo_ops_tmp;
@@ -90,6 +96,19 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
 	return 0;
 }
 
+/**
+ * batadv_algo_select() - Select algorithm of soft interface
+ * @bat_priv: the bat priv with all the soft interface information
+ * @name: name of the algorithm to select
+ *
+ * The algorithm callbacks for the soft interface will be set when the algorithm
+ * with the correct name was found. Any previous selected algorithm will not be
+ * deinitialized and the new selected algorithm will also not be initialized.
+ * It is therefore not allowed to call batadv_algo_select outside the creation
+ * function of the soft interface.
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
 {
 	struct batadv_algo_ops *bat_algo_ops;
@@ -104,6 +123,14 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
+/**
+ * batadv_algo_seq_print_text() - Print the supported algorithms in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct batadv_algo_ops *bat_algo_ops;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 0973e8c5a063..c9955f29a2bf 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2853,6 +2853,11 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
 	},
 };
 
+/**
+ * batadv_iv_init() - B.A.T.M.A.N. IV initialization function
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int __init batadv_iv_init(void)
 {
 	int ret;
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 97d6eb45cbf2..21d1189957a7 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -259,6 +259,9 @@ static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
 	NULL,
 };
 
+/**
+ * batadv_debugfs_init() - Initialize soft interface independent debugfs entries
+ */
 void batadv_debugfs_init(void)
 {
 	struct batadv_debuginfo **bat_debug;
@@ -289,6 +292,9 @@ err:
 	batadv_debugfs = NULL;
 }
 
+/**
+ * batadv_debugfs_destroy() - Remove all debugfs entries
+ */
 void batadv_debugfs_destroy(void)
 {
 	debugfs_remove_recursive(batadv_debugfs);
@@ -355,6 +361,12 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
 	}
 }
 
+/**
+ * batadv_debugfs_add_meshif() - Initialize interface dependent debugfs entries
+ * @dev: netdev struct of the soft interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_debugfs_add_meshif(struct net_device *dev)
 {
 	struct batadv_priv *bat_priv = netdev_priv(dev);
@@ -401,6 +413,10 @@ out:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
+ * @dev: netdev struct of the soft interface
+ */
 void batadv_debugfs_del_meshif(struct net_device *dev)
 {
 	struct batadv_priv *bat_priv = netdev_priv(dev);
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 6731f7dabeb9..2488e25d0eef 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -93,6 +93,12 @@ void batadv_gw_node_put(struct batadv_gw_node *gw_node)
 	kref_put(&gw_node->refcount, batadv_gw_node_release);
 }
 
+/**
+ * batadv_gw_get_selected_gw_node() - Get currently selected gateway
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: selected gateway (with increased refcnt), NULL on errors
+ */
 struct batadv_gw_node *
 batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
 {
@@ -111,6 +117,12 @@ out:
 	return gw_node;
 }
 
+/**
+ * batadv_gw_get_selected_orig() - Get originator of currently selected gateway
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: orig_node of selected gateway (with increased refcnt), NULL on errors
+ */
 struct batadv_orig_node *
 batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
 {
@@ -204,6 +216,10 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
 	batadv_gw_node_put(curr_gw);
 }
 
+/**
+ * batadv_gw_election() - Elect the best gateway
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_gw_election(struct batadv_priv *bat_priv)
 {
 	struct batadv_gw_node *curr_gw = NULL;
@@ -292,6 +308,11 @@ out:
 		batadv_neigh_ifinfo_put(router_ifinfo);
 }
 
+/**
+ * batadv_gw_check_election() - Elect orig node as best gateway when eligible
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be checked
+ */
 void batadv_gw_check_election(struct batadv_priv *bat_priv,
 			      struct batadv_orig_node *orig_node)
 {
@@ -460,6 +481,11 @@ out:
 		batadv_gw_node_put(gw_node);
 }
 
+/**
+ * batadv_gw_node_delete() - Remove orig_node from gateway list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is currently in process of being removed
+ */
 void batadv_gw_node_delete(struct batadv_priv *bat_priv,
 			   struct batadv_orig_node *orig_node)
 {
@@ -471,6 +497,10 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv,
 	batadv_gw_node_update(bat_priv, orig_node, &gateway);
 }
 
+/**
+ * batadv_gw_node_free() - Free gateway information from soft interface
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_gw_node_free(struct batadv_priv *bat_priv)
 {
 	struct batadv_gw_node *gw_node;
@@ -486,6 +516,14 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
+/**
+ * batadv_gw_client_seq_print_text() - Print the gateway table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 1c58727835ca..83bfeecf661c 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -165,6 +165,15 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
 	}
 }
 
+/**
+ * batadv_gw_bandwidth_set() - Parse and set download/upload gateway bandwidth
+ *  from supplied string buffer
+ * @net_dev: netdev struct of the soft interface
+ * @buff: the buffer containing the user data
+ * @count: number of bytes in the buffer
+ *
+ * Return: 'count' on success or a negative error code in case of failure
+ */
 ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
 				size_t count)
 {
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 33425a022026..13d04dba0b3a 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -67,6 +67,12 @@ void batadv_hardif_release(struct kref *ref)
 	kfree_rcu(hard_iface, rcu);
 }
 
+/**
+ * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device
+ * @net_dev: net_device to search for
+ *
+ * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors
+ */
 struct batadv_hard_iface *
 batadv_hardif_get_by_netdev(const struct net_device *net_dev)
 {
@@ -561,6 +567,13 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
 	soft_iface->needed_tailroom = lower_tailroom;
 }
 
+/**
+ * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface
+ * @soft_iface: netdev struct of the soft interface
+ *
+ * Return: MTU for the soft-interface (limited by the minimal MTU of all active
+ *  slave interfaces)
+ */
 int batadv_hardif_min_mtu(struct net_device *soft_iface)
 {
 	struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -607,7 +620,11 @@ out:
 	return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN);
 }
 
-/* adjusts the MTU if a new interface with a smaller MTU appeared. */
+/**
+ * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller
+ *  MTU appeared
+ * @soft_iface: netdev struct of the soft interface
+ */
 void batadv_update_min_mtu(struct net_device *soft_iface)
 {
 	soft_iface->mtu = batadv_hardif_min_mtu(soft_iface);
@@ -692,6 +709,14 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave,
 	return ret;
 }
 
+/**
+ * batadv_hardif_enable_interface() - Enslave hard interface to soft interface
+ * @hard_iface: hard interface to add to soft interface
+ * @net: the applicable net namespace
+ * @iface_name: name of the soft interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 				   struct net *net, const char *iface_name)
 {
@@ -803,6 +828,12 @@ err:
 	return ret;
 }
 
+/**
+ * batadv_hardif_disable_interface() - Remove hard interface from soft interface
+ * @hard_iface: hard interface to be removed
+ * @autodel: whether to delete soft interface when it doesn't contain any other
+ *  slave interfaces
+ */
 void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 				     enum batadv_hard_if_cleanup autodel)
 {
@@ -937,6 +968,9 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
 	batadv_hardif_put(hard_iface);
 }
 
+/**
+ * batadv_hardif_remove_interfaces() - Remove all hard interfaces
+ */
 void batadv_hardif_remove_interfaces(void)
 {
 	struct batadv_hard_iface *hard_iface, *hard_iface_tmp;
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 2ce0d5673f40..04d964358c98 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -34,7 +34,10 @@ static void batadv_hash_init(struct batadv_hashtable *hash)
 	}
 }
 
-/* free only the hashtable and the hash itself. */
+/**
+ * batadv_hash_destroy() - Free only the hashtable and the hash itself
+ * @hash: hash object to destroy
+ */
 void batadv_hash_destroy(struct batadv_hashtable *hash)
 {
 	kfree(hash->list_locks);
@@ -42,7 +45,12 @@ void batadv_hash_destroy(struct batadv_hashtable *hash)
 	kfree(hash);
 }
 
-/* allocates and clears the hash */
+/**
+ * batadv_hash_new() - Allocates and clears the hashtable
+ * @size: number of hash buckets to allocate
+ *
+ * Return: newly allocated hashtable, NULL on errors
+ */
 struct batadv_hashtable *batadv_hash_new(u32 size)
 {
 	struct batadv_hashtable *hash;
@@ -71,6 +79,11 @@ free_hash:
 	return NULL;
 }
 
+/**
+ * batadv_hash_set_lock_class() - Set specific lockdep class for hash spinlocks
+ * @hash: hash object to modify
+ * @key: lockdep class key address
+ */
 void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
 				struct lock_class_key *key)
 {
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 8af5d30e59b1..f2ef75b7fa73 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -57,6 +57,9 @@ static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
 				     struct batadv_icmp_header *icmph,
 				     size_t icmp_len);
 
+/**
+ * batadv_socket_init() - Initialize soft interface independent socket data
+ */
 void batadv_socket_init(void)
 {
 	memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash));
@@ -316,6 +319,12 @@ static const struct file_operations batadv_fops = {
 	.llseek = no_llseek,
 };
 
+/**
+ * batadv_socket_setup() - Create debugfs "socket" file
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_socket_setup(struct batadv_priv *bat_priv)
 {
 	struct dentry *d;
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 6fbcdd40a332..da004980ab8b 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -88,6 +88,13 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
 	return 0;
 }
 
+/**
+ * batadv_debug_log() - Add debug log entry
+ * @bat_priv: the bat priv with all the soft interface information
+ * @fmt: format string
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
 {
 	va_list args;
@@ -199,6 +206,12 @@ static const struct file_operations batadv_log_fops = {
 	.llseek         = no_llseek,
 };
 
+/**
+ * batadv_debug_log_setup() - Initialize debug log
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_debug_log_setup(struct batadv_priv *bat_priv)
 {
 	struct dentry *d;
@@ -224,6 +237,10 @@ err:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_debug_log_cleanup() - Destroy debug log
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
 {
 	kfree(bat_priv->debug_log);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index e6e1f5eae494..8bee4279d579 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -140,6 +140,12 @@ static void __exit batadv_exit(void)
 	batadv_tt_cache_destroy();
 }
 
+/**
+ * batadv_mesh_init() - Initialize soft interface
+ * @soft_iface: netdev struct of the soft interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_mesh_init(struct net_device *soft_iface)
 {
 	struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -217,6 +223,10 @@ err:
 	return ret;
 }
 
+/**
+ * batadv_mesh_free() - Deinitialize soft interface
+ * @soft_iface: netdev struct of the soft interface
+ */
 void batadv_mesh_free(struct net_device *soft_iface)
 {
 	struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -413,6 +423,16 @@ static int batadv_recv_unhandled_packet(struct sk_buff *skb,
 /* incoming packets with the batman ethertype received on any active hard
  * interface
  */
+
+/**
+ * batadv_batman_skb_recv() - Handle incoming message from a hard interface
+ * @skb: the received packet
+ * @dev: the net device that the packet was received on
+ * @ptype: packet type of incoming packet (ETH_P_BATMAN)
+ * @orig_dev: the original receive net device (e.g. bonded device)
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
 int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
 			   struct packet_type *ptype,
 			   struct net_device *orig_dev)
@@ -536,6 +556,13 @@ static void batadv_recv_handler_init(void)
 	batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet;
 }
 
+/**
+ * batadv_recv_handler_register() - Register handler for batman-adv packet type
+ * @packet_type: batadv_packettype which should be handled
+ * @recv_handler: receive handler for the packet type
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int
 batadv_recv_handler_register(u8 packet_type,
 			     int (*recv_handler)(struct sk_buff *,
@@ -553,6 +580,10 @@ batadv_recv_handler_register(u8 packet_type,
 	return 0;
 }
 
+/**
+ * batadv_recv_handler_unregister() - Unregister handler for packet type
+ * @packet_type: batadv_packettype which should no longer be handled
+ */
 void batadv_recv_handler_unregister(u8 packet_type)
 {
 	batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 412a603b2fda..58a7d9274435 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -58,6 +58,13 @@
 /* hash class keys */
 static struct lock_class_key batadv_orig_hash_lock_class_key;
 
+/**
+ * batadv_orig_hash_find() - Find and return originator from orig_hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @data: mac address of the originator
+ *
+ * Return: orig_node (with increased refcnt), NULL on errors
+ */
 struct batadv_orig_node *
 batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
 {
@@ -201,6 +208,12 @@ void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
 	kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
 }
 
+/**
+ * batadv_originator_init() - Initialize all originator structures
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_originator_init(struct batadv_priv *bat_priv)
 {
 	if (bat_priv->orig_hash)
@@ -959,6 +972,10 @@ void batadv_orig_node_put(struct batadv_orig_node *orig_node)
 	kref_put(&orig_node->refcount, batadv_orig_node_release);
 }
 
+/**
+ * batadv_originator_free() - Free all originator structures
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_originator_free(struct batadv_priv *bat_priv)
 {
 	struct batadv_hashtable *hash = bat_priv->orig_hash;
@@ -1374,12 +1391,24 @@ static void batadv_purge_orig(struct work_struct *work)
 			   msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
 }
 
+/**
+ * batadv_purge_orig_ref() - Purge all outdated originators
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
 {
 	_batadv_purge_orig(bat_priv);
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
+/**
+ * batadv_orig_seq_print_text() - Print the originator table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1532,6 +1561,13 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
 	return ret;
 }
 
+/**
+ * batadv_orig_hash_add_if() - Add interface to originators in orig_hash
+ * @hard_iface: hard interface to add (already slave of the soft interface)
+ * @max_if_num: new number of interfaces
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
 			    int max_if_num)
 {
@@ -1567,6 +1603,13 @@ err:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash
+ * @hard_iface: hard interface to remove (still slave of the soft interface)
+ * @max_if_num: new number of interfaces
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
 			    int max_if_num)
 {
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 01820be4ae5a..eb835bde502a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -181,6 +181,14 @@ bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
 	return false;
 }
 
+/**
+ * batadv_check_management_packet() - Check preconditions for management packets
+ * @skb: incoming packet buffer
+ * @hard_iface: incoming hard interface
+ * @header_len: minimal header length of packet type
+ *
+ * Return: true when management preconditions are met, false otherwise
+ */
 bool batadv_check_management_packet(struct sk_buff *skb,
 				    struct batadv_hard_iface *hard_iface,
 				    int header_len)
@@ -348,6 +356,13 @@ out:
 	return ret;
 }
 
+/**
+ * batadv_recv_icmp_packet() - Process incoming icmp packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
 int batadv_recv_icmp_packet(struct sk_buff *skb,
 			    struct batadv_hard_iface *recv_if)
 {
@@ -936,6 +951,13 @@ free_skb:
 	return NET_RX_DROP;
 }
 
+/**
+ * batadv_recv_unicast_packet() - Process incoming unicast packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
 int batadv_recv_unicast_packet(struct sk_buff *skb,
 			       struct batadv_hard_iface *recv_if)
 {
@@ -1156,6 +1178,13 @@ free_skb:
 	return ret;
 }
 
+/**
+ * batadv_recv_bcast_packet() - Process incoming broadcast packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
 int batadv_recv_bcast_packet(struct sk_buff *skb,
 			     struct batadv_hard_iface *recv_if)
 {
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 0700b3dfb595..2a5ab6f1076d 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -124,12 +124,30 @@ send_skb_err:
 	return NET_XMIT_DROP;
 }
 
+/**
+ * batadv_send_broadcast_skb() - Send broadcast packet via hard interface
+ * @skb: packet to be transmitted (with batadv header and no outer eth header)
+ * @hard_iface: outgoing interface
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ */
 int batadv_send_broadcast_skb(struct sk_buff *skb,
 			      struct batadv_hard_iface *hard_iface)
 {
 	return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
 }
 
+/**
+ * batadv_send_unicast_skb() - Send unicast packet to neighbor
+ * @skb: packet to be transmitted (with batadv header and no outer eth header)
+ * @neigh: neighbor which is used as next hop to destination
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ */
 int batadv_send_unicast_skb(struct sk_buff *skb,
 			    struct batadv_neigh_node *neigh)
 {
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 9b66e0edc741..1eb5555c5fe4 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -65,6 +65,13 @@
 #include "sysfs.h"
 #include "translation-table.h"
 
+/**
+ * batadv_skb_head_push() - Increase header size and move (push) head pointer
+ * @skb: packet buffer which should be modified
+ * @len: number of bytes to add
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
 {
 	int result;
@@ -1064,6 +1071,13 @@ static void batadv_softif_init_early(struct net_device *dev)
 	dev->ethtool_ops = &batadv_ethtool_ops;
 }
 
+/**
+ * batadv_softif_create() - Create and register soft interface
+ * @net: the applicable net namespace
+ * @name: name of the new soft interface
+ *
+ * Return: newly allocated soft_interface, NULL on errors
+ */
 struct net_device *batadv_softif_create(struct net *net, const char *name)
 {
 	struct net_device *soft_iface;
@@ -1141,6 +1155,12 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
 	unregister_netdevice_queue(soft_iface, head);
 }
 
+/**
+ * batadv_softif_is_valid() - Check whether device is a batadv soft interface
+ * @net_dev: device which should be checked
+ *
+ * Return: true when net_dev is a batman-adv interface, false otherwise
+ */
 bool batadv_softif_is_valid(const struct net_device *net_dev)
 {
 	if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 8e2b7c7d2358..56fb42551453 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -735,6 +735,12 @@ static struct batadv_attribute *batadv_vlan_attrs[] = {
 	NULL,
 };
 
+/**
+ * batadv_sysfs_add_meshif() - Add soft interface specific sysfs entries
+ * @dev: netdev struct of the soft interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_sysfs_add_meshif(struct net_device *dev)
 {
 	struct kobject *batif_kobject = &dev->dev.kobj;
@@ -775,6 +781,10 @@ out:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_sysfs_del_meshif() - Remove soft interface specific sysfs entries
+ * @dev: netdev struct of the soft interface
+ */
 void batadv_sysfs_del_meshif(struct net_device *dev)
 {
 	struct batadv_priv *bat_priv = netdev_priv(dev);
@@ -1132,6 +1142,13 @@ static struct batadv_attribute *batadv_batman_attrs[] = {
 	NULL,
 };
 
+/**
+ * batadv_sysfs_add_hardif() - Add hard interface specific sysfs entries
+ * @hardif_obj: address where to store the pointer to new sysfs folder
+ * @dev: netdev struct of the hard interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev)
 {
 	struct kobject *hardif_kobject = &dev->dev.kobj;
@@ -1166,6 +1183,11 @@ out:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_sysfs_del_hardif() - Remove hard interface specific sysfs entries
+ * @hardif_obj: address of the pointer which stores batman-adv sysfs folder
+ *  of the hard interface
+ */
 void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
 {
 	kobject_uevent(*hardif_obj, KOBJ_REMOVE);
@@ -1174,6 +1196,16 @@ void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
 	*hardif_obj = NULL;
 }
 
+/**
+ * batadv_throw_uevent() - Send an uevent with batman-adv specific env data
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: subsystem type of event. Stored in uevent's BATTYPE
+ * @action: action type of event. Stored in uevent's BATACTION
+ * @data: string with additional information to the event (ignored for
+ *  BATADV_UEV_DEL). Stored in uevent's BATDATA
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
 int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
 			enum batadv_uev_action action, const char *data)
 {
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8b583d3e86e6..0e53be3f8df0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1055,6 +1055,14 @@ container_register:
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
+/**
+ * batadv_tt_local_seq_print_text() - Print the local tt table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1927,6 +1935,13 @@ print_list:
 	}
 }
 
+/**
+ * batadv_tt_global_seq_print_text() - Print the global tt table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
@@ -3729,6 +3744,10 @@ static void batadv_tt_purge(struct work_struct *work)
 			   msecs_to_jiffies(BATADV_TT_WORK_PERIOD));
 }
 
+/**
+ * batadv_tt_free() - Free translation table of soft interface
+ * @bat_priv: the bat priv with all the soft interface information
+ */
 void batadv_tt_free(struct batadv_priv *bat_priv)
 {
 	batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1);
@@ -3876,6 +3895,15 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
 	spin_unlock_bh(&bat_priv->tt.commit_lock);
 }
 
+/**
+ * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: source mac address of packet
+ * @dst: destination mac address of packet
+ * @vid: vlan id of packet
+ *
+ * Return: true when src+dst(+vid) pair should be isolated, false otherwise
+ */
 bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
 			   unsigned short vid)
 {
@@ -4047,6 +4075,15 @@ out:
 	return ret;
 }
 
+/**
+ * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which the temporary entry should be associated with
+ * @addr: mac address of the client
+ * @vid: VLAN id of the new temporary global translation table
+ *
+ * Return: true when temporary tt entry could be added, false otherwise
+ */
 bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
 					  struct batadv_orig_node *orig_node,
 					  const unsigned char *addr,
-- 
cgit v1.2.3


From 1d7e2ed22f8d9171fa8b629754022f22115b3f03 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 13 Dec 2017 16:38:55 -0800
Subject: net: erspan: refactor existing erspan code

The patch refactors the existing erspan implementation in order
to support erspan version 2, which has additional metadata.  So,
instead of having one 'struct erspanhdr' holding erspan version 1, it
breaks it into 'struct erspan_base_hdr' and 'struct erspan_metadata'.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h           | 34 ++++++++++++++++++++++++----------
 net/ipv4/ip_gre.c              | 27 +++++++++++++++++----------
 net/ipv6/ip6_gre.c             | 25 ++++++++++++++++---------
 net/openvswitch/flow_netlink.c |  8 ++++----
 4 files changed, 61 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index 6e758d08c9ee..70c40c7c75b2 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -15,7 +15,7 @@
  *  s, Recur, Flags, Version fields only S (bit 03) is set to 1. The
  *  other fields are set to zero, so only a sequence number follows.
  *
- *  ERSPAN Type II header (8 octets [42:49])
+ *  ERSPAN Version 1 (Type II) header (8 octets [42:49])
  *  0                   1                   2                   3
  *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -27,7 +27,7 @@
  * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
  */
 
-#define ERSPAN_VERSION	0x1
+#define ERSPAN_VERSION	0x1	/* ERSPAN type II */
 
 #define VER_MASK	0xf000
 #define VLAN_MASK	0x0fff
@@ -44,20 +44,29 @@ enum erspan_encap_type {
 	ERSPAN_ENCAP_INFRAME = 0x3,	/* VLAN tag perserved in frame */
 };
 
+#define ERSPAN_V1_MDSIZE	4
+#define ERSPAN_V2_MDSIZE	8
 struct erspan_metadata {
-	__be32 index;   /* type II */
+	union {
+		__be32 index;	/* Version 1 (type II)*/
+	} u;
 };
 
-struct erspanhdr {
+struct erspan_base_hdr {
 	__be16 ver_vlan;
 #define VER_OFFSET  12
 	__be16 session_id;
 #define COS_OFFSET  13
 #define EN_OFFSET   11
 #define T_OFFSET    10
-	struct erspan_metadata md;
 };
 
+static inline int erspan_hdr_len(int version)
+{
+	return (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE) +
+	       sizeof(struct erspan_base_hdr); /* metadata + base header */
+}
+
 static inline u8 tos_to_cos(u8 tos)
 {
 	u8 dscp, cos;
@@ -73,7 +82,8 @@ static inline void erspan_build_header(struct sk_buff *skb,
 {
 	struct ethhdr *eth = eth_hdr(skb);
 	enum erspan_encap_type enc_type;
-	struct erspanhdr *ershdr;
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *ersmd;
 	struct qtag_prefix {
 		__be16 eth_type;
 		__be16 tci;
@@ -96,17 +106,21 @@ static inline void erspan_build_header(struct sk_buff *skb,
 		enc_type = ERSPAN_ENCAP_INFRAME;
 	}
 
-	skb_push(skb, sizeof(*ershdr));
-	ershdr = (struct erspanhdr *)skb->data;
-	memset(ershdr, 0, sizeof(*ershdr));
+	skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE);
+	ershdr = (struct erspan_base_hdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE);
 
+	/* Build base header */
 	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
 				 (ERSPAN_VERSION << VER_OFFSET));
 	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
 			   ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
 			   (enc_type << EN_OFFSET & EN_MASK) |
 			   ((truncate << T_OFFSET) & T_MASK));
-	ershdr->md.index = htonl(index & INDEX_MASK);
+
+	/* Build metadata */
+	ersmd = (struct erspan_metadata *)(ershdr + 1);
+	ersmd->u.index = htonl(index & INDEX_MASK);
 }
 
 #endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d828821d88d7..3e37402147f3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -256,34 +256,41 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 {
 	struct net *net = dev_net(skb->dev);
 	struct metadata_dst *tun_dst = NULL;
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *pkt_md;
 	struct ip_tunnel_net *itn;
 	struct ip_tunnel *tunnel;
-	struct erspanhdr *ershdr;
 	const struct iphdr *iph;
-	__be32 index;
+	int ver;
 	int len;
 
 	itn = net_generic(net, erspan_net_id);
 	len = gre_hdr_len + sizeof(*ershdr);
 
+	/* Check base hdr len */
 	if (unlikely(!pskb_may_pull(skb, len)))
 		return -ENOMEM;
 
 	iph = ip_hdr(skb);
-	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+	ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
 
 	/* The original GRE header does not have key field,
 	 * Use ERSPAN 10-bit session ID as key.
 	 */
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
-	index = ershdr->md.index;
+	pkt_md = (struct erspan_metadata *)(ershdr + 1);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 				  tpi->flags | TUNNEL_KEY,
 				  iph->saddr, iph->daddr, tpi->key);
 
 	if (tunnel) {
+		len = gre_hdr_len + erspan_hdr_len(ver);
+		if (unlikely(!pskb_may_pull(skb, len)))
+			return -ENOMEM;
+
 		if (__iptunnel_pull_header(skb,
-					   gre_hdr_len + sizeof(*ershdr),
+					   len,
 					   htons(ETH_P_TEB),
 					   false, false) < 0)
 			goto drop;
@@ -307,12 +314,12 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			if (!md)
 				return PACKET_REJECT;
 
-			md->index = index;
+			memcpy(md, pkt_md, sizeof(*md));
 			info = &tun_dst->u.tun_info;
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 		} else {
-			tunnel->index = ntohl(index);
+			tunnel->index = ntohl(pkt_md->u.index);
 		}
 
 		skb_reset_mac_header(skb);
@@ -571,7 +578,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	key = &tun_info->key;
 
 	/* ERSPAN has fixed 8 byte GRE header */
-	tunnel_hlen = 8 + sizeof(struct erspanhdr);
+	tunnel_hlen = 8 + sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
 
 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
 	if (!rt)
@@ -590,7 +597,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_rt;
 
 	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-			    ntohl(md->index), truncate, true);
+			    ntohl(md->u.index), truncate, true);
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -1238,7 +1245,7 @@ static int erspan_tunnel_init(struct net_device *dev)
 	tunnel->tun_hlen = 8;
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspanhdr);
+		       sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
 
 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4562579797d1..1303d0c44c36 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -501,25 +501,32 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 			 struct tnl_ptk_info *tpi)
 {
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *pkt_md;
 	const struct ipv6hdr *ipv6h;
-	struct erspanhdr *ershdr;
 	struct ip6_tnl *tunnel;
-	__be32 index;
+	u8 ver;
 
 	ipv6h = ipv6_hdr(skb);
-	ershdr = (struct erspanhdr *)skb->data;
+	ershdr = (struct erspan_base_hdr *)skb->data;
 
 	if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
 		return PACKET_REJECT;
 
+	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
-	index = ershdr->md.index;
+	pkt_md = (struct erspan_metadata *)(ershdr + 1);
 
 	tunnel = ip6gre_tunnel_lookup(skb->dev,
 				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
 				      tpi->proto);
 	if (tunnel) {
-		if (__iptunnel_pull_header(skb, sizeof(*ershdr),
+		int len = erspan_hdr_len(ver);
+
+		if (unlikely(!pskb_may_pull(skb, len)))
+			return -ENOMEM;
+
+		if (__iptunnel_pull_header(skb, len,
 					   htons(ETH_P_TEB),
 					   false, false) < 0)
 			return PACKET_REJECT;
@@ -545,14 +552,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 			if (!md)
 				return PACKET_REJECT;
 
-			md->index = index;
+			memcpy(md, pkt_md, sizeof(*md));
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 
 			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 
 		} else {
-			tunnel->parms.index = ntohl(index);
+			tunnel->parms.index = ntohl(pkt_md->u.index);
 			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 		}
 
@@ -921,7 +928,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			goto tx_err;
 
 		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-				    ntohl(md->index), truncate, false);
+				    ntohl(md->u.index), truncate, false);
 
 	} else {
 		switch (skb->protocol) {
@@ -1657,7 +1664,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 	tunnel->tun_hlen = 8;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspanhdr);
+		       sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
 	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
 
 	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624ea74353dd..bce1f78b0de5 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -644,12 +644,12 @@ static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
 	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
 
 	memset(&opts, 0, sizeof(opts));
-	opts.index = nla_get_be32(attr);
+	opts.u.index = nla_get_be32(attr);
 
 	/* Index has only 20-bit */
-	if (ntohl(opts.index) & ~INDEX_MASK) {
+	if (ntohl(opts.u.index) & ~INDEX_MASK) {
 		OVS_NLERR(log, "ERSPAN index number %x too large.",
-			  ntohl(opts.index));
+			  ntohl(opts.u.index));
 		return -EINVAL;
 	}
 
@@ -907,7 +907,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 			return -EMSGSIZE;
 		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
 			 nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
-				      ((struct erspan_metadata *)tun_opts)->index))
+				      ((struct erspan_metadata *)tun_opts)->u.index))
 			return -EMSGSIZE;
 	}
 
-- 
cgit v1.2.3


From f551c91de262ba36b20c3ac19538afb4f4507441 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 13 Dec 2017 16:38:56 -0800
Subject: net: erspan: introduce erspan v2 for ip_gre

The patch adds support for erspan version 2.  Not all features are
supported in this patch.  The SGT (security group tag), GRA (timestamp
granularity) and FT (frame type) are set to fixed values.  Only hardware
ID and direction are configurable.  The optional subheader is also not
supported.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h           | 120 ++++++++++++++++++++++++++++++++++++++++-
 include/net/ip_tunnels.h       |   5 +-
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_tunnel.h |   3 ++
 net/ipv4/ip_gre.c              | 105 ++++++++++++++++++++++++++++++------
 5 files changed, 216 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index 70c40c7c75b2..acdf6843095d 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -24,11 +24,29 @@
  * |      Reserved         |                  Index                |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
+ *
+ *  ERSPAN Version 2 (Type III) header (12 octets [42:53])
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Ver  |          VLAN         | COS |BSO|T|     Session ID    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                          Timestamp                            |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |             SGT               |P|    FT   |   Hw ID   |D|Gra|O|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *      Platform Specific SubHeader (8 octets, optional)
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Platf ID |               Platform Specific Info              |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                  Platform Specific Info                       |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
  * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
  */
 
 #define ERSPAN_VERSION	0x1	/* ERSPAN type II */
-
 #define VER_MASK	0xf000
 #define VLAN_MASK	0x0fff
 #define COS_MASK	0xe000
@@ -37,6 +55,28 @@
 #define ID_MASK		0x03ff
 #define INDEX_MASK	0xfffff
 
+#define ERSPAN_VERSION2	0x2	/* ERSPAN type III*/
+#define BSO_MASK	EN_MASK
+#define SGT_MASK	0xffff0000
+#define P_MASK		0x8000
+#define FT_MASK		0x7c00
+#define HWID_MASK	0x03f0
+#define DIR_MASK	0x0008
+#define GRA_MASK	0x0006
+#define O_MASK		0x0001
+
+/* ERSPAN version 2 (type III) metadata, placed right after the base header */
+struct erspan_md2 {
+	__be32 timestamp;	/* set from erspan_get_timestamp() on build */
+	__be16 sgt;	/* security group tag */
+	__be16 flags;	/* P, FT, Hw ID, D, Gra and O bits; shifts below */
+#define P_OFFSET	15
+#define FT_OFFSET	10
+#define HWID_OFFSET	4
+#define DIR_OFFSET	3
+#define GRA_OFFSET	1
+};
+
 enum erspan_encap_type {
 	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
 	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
@@ -48,8 +88,10 @@ enum erspan_encap_type {
 #define ERSPAN_V2_MDSIZE	8
 struct erspan_metadata {
 	union {
-		__be32 index;	/* Version 1 (type II)*/
+		__be32 index;		/* Version 1 (type II)*/
+		struct erspan_md2 md2;	/* Version 2 (type III) */
 	} u;
+	int version;
 };
 
 struct erspan_base_hdr {
@@ -58,6 +100,7 @@ struct erspan_base_hdr {
 	__be16 session_id;
 #define COS_OFFSET  13
 #define EN_OFFSET   11
+#define BSO_OFFSET  EN_OFFSET
 #define T_OFFSET    10
 };
 
@@ -123,4 +166,77 @@ static inline void erspan_build_header(struct sk_buff *skb,
 	ersmd->u.index = htonl(index & INDEX_MASK);
 }
 
+/* ERSPAN GRA: timestamp granularity
+ *   00b --> granularity = 100 microseconds
+ *   01b --> granularity = 100 nanoseconds
+ *   10b --> granularity = IEEE 1588
+ * Only 100 microseconds is supported; returns ktime_get_real() in that unit.
+ */
+static inline __be32 erspan_get_timestamp(void)
+{
+	u64 h_usecs;	/* time in units of 100 usec */
+	ktime_t kt;
+
+	kt = ktime_get_real();
+	h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC);
+
+	/* The timestamp field is only 32 bits wide, so in units of
+	 * 100 usec it wraps about every 5 days (2^32 * 100 usec).
+	 */
+	return htonl((u32)h_usecs);
+}
+
+static inline void erspan_build_header_v2(struct sk_buff *skb,
+					  __be32 id, u8 direction, u16 hwid,
+					  bool truncate, bool is_ipv4)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *md;
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+	u16 session_id;
+	u8 gra = 0; /* timestamp granularity: 100 usec */
+	u8 bso = 0; /* Bad/Short/Oversized, always written as 0 */
+	u8 sgt = 0; /* security group tag, fixed to 0 */
+	u8 tos; /* IPv4 tos, or IPv6 priority/flow_lbl[0] nibbles combined */
+
+	tos = is_ipv4 ? ip_hdr(skb)->tos :
+			(ipv6_hdr(skb)->priority << 4) +
+			(ipv6_hdr(skb)->flow_lbl[0] >> 4);
+
+	/* Unlike v1, v2 has no En field, so only the VLAN TCI is taken
+	 * from an 802.1Q-tagged frame; it stays 0 for any other frame.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+	}
+
+	skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
+	ershdr = (struct erspan_base_hdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
+
+	/* Build base header: version 2, VLAN, COS, BSO, T and session ID */
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION2 << VER_OFFSET));
+	session_id = (u16)(ntohl(id) & ID_MASK) |
+		     ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
+		     (bso << BSO_OFFSET & BSO_MASK) |
+		     ((truncate << T_OFFSET) & T_MASK);
+	ershdr->session_id = htons(session_id);
+
+	/* Build metadata: timestamp, SGT, then P (set)/Hw ID/D/Gra flags */
+	md = (struct erspan_metadata *)(ershdr + 1);
+	md->u.md2.timestamp = erspan_get_timestamp();
+	md->u.md2.sgt = htons(sgt);
+	md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) |
+				((hwid << HWID_OFFSET) & HWID_MASK) |
+				((direction << DIR_OFFSET) & DIR_MASK) |
+				((gra << GRA_OFFSET) & GRA_MASK));
+}
+
 #endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 24628f6b09bf..1f16773cfd76 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -116,8 +116,11 @@ struct ip_tunnel {
 	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
-	/* This field used only by ERSPAN */
+	/* These four fields used only by ERSPAN */
 	u32		index;		/* ERSPAN type II index */
+	u8		erspan_ver;	/* ERSPAN version */
+	u8		dir;		/* ERSPAN direction */
+	u16		hwid;		/* ERSPAN hardware ID */
 
 	struct dst_cache dst_cache;
 
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3ee3bf7c8526..87b7529fcdfe 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -47,6 +47,7 @@
 #define ETH_P_PUP	0x0200		/* Xerox PUP packet		*/
 #define ETH_P_PUPAT	0x0201		/* Xerox PUP Addr Trans packet	*/
 #define ETH_P_TSN	0x22F0		/* TSN (IEEE 1722) packet	*/
+#define ETH_P_ERSPAN2	0x22EB		/* ERSPAN version 2 (type III)	*/
 #define ETH_P_IP	0x0800		/* Internet Protocol packet	*/
 #define ETH_P_X25	0x0805		/* CCITT X.25			*/
 #define ETH_P_ARP	0x0806		/* Address Resolution packet	*/
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index e68dadbd6d45..1b3d148c4560 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -137,6 +137,9 @@ enum {
 	IFLA_GRE_IGNORE_DF,
 	IFLA_GRE_FWMARK,
 	IFLA_GRE_ERSPAN_INDEX,
+	IFLA_GRE_ERSPAN_VER,
+	IFLA_GRE_ERSPAN_DIR,
+	IFLA_GRE_ERSPAN_HWID,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 3e37402147f3..004800b923c6 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -315,11 +315,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return PACKET_REJECT;
 
 			memcpy(md, pkt_md, sizeof(*md));
+			md->version = ver;
+
 			info = &tun_dst->u.tun_info;
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 		} else {
-			tunnel->index = ntohl(pkt_md->u.index);
+			tunnel->erspan_ver = ver;
+			if (ver == 1) {
+				tunnel->index = ntohl(pkt_md->u.index);
+			} else {
+				u16 md2_flags;
+				u16 dir, hwid;
+
+				md2_flags = ntohs(pkt_md->u.md2.flags);
+				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+				tunnel->dir = dir;
+				tunnel->hwid = hwid;
+			}
+
 		}
 
 		skb_reset_mac_header(skb);
@@ -413,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
-	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 			return 0;
 	}
@@ -568,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	bool truncate = false;
 	struct flowi4 fl;
 	int tunnel_hlen;
+	int version;
 	__be16 df;
 
 	tun_info = skb_tunnel_info(skb);
@@ -576,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_skb;
 
 	key = &tun_info->key;
+	md = ip_tunnel_info_opts(tun_info);
+	if (!md)
+		goto err_free_rt;
 
 	/* ERSPAN has fixed 8 byte GRE header */
-	tunnel_hlen = 8 + sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
+	version = md->version;
+	tunnel_hlen = 8 + erspan_hdr_len(version);
 
 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
 	if (!rt)
@@ -592,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		truncate = true;
 	}
 
-	md = ip_tunnel_info_opts(tun_info);
-	if (!md)
-		goto err_free_rt;
+	if (version == 1) {
+		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+				    ntohl(md->u.index), truncate, true);
+	} else if (version == 2) {
+		u16 md2_flags;
+		u8 direction;
+		u16 hwid;
 
-	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-			    ntohl(md->u.index), truncate, true);
+		md2_flags = ntohs(md->u.md2.flags);
+		direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+		hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+		erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
+				       direction, hwid,	truncate, true);
+	} else {
+		goto err_free_rt;
+	}
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -699,8 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 	}
 
 	/* Push ERSPAN header */
-	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
-			    truncate, true);
+	if (tunnel->erspan_ver == 1)
+		erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+				    truncate, true);
+	else
+		erspan_build_header_v2(skb, tunnel->parms.o_key,
+				       tunnel->dir, tunnel->hwid,
+				       truncate, true);
+
 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
 	return NETDEV_TX_OK;
@@ -1172,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
-	if (data[IFLA_GRE_ERSPAN_INDEX]) {
-		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
 
-		if (t->index & ~INDEX_MASK)
+		if (t->erspan_ver != 1 && t->erspan_ver != 2)
 			return -EINVAL;
 	}
 
+	if (t->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+			if (t->index & ~INDEX_MASK)
+				return -EINVAL;
+		}
+	} else if (t->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+				return -EINVAL;
+		}
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+				return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1245,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev)
 	tunnel->tun_hlen = 8;
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
+		       erspan_hdr_len(tunnel->erspan_ver);
 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
 
 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
@@ -1375,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_GRE_ERSPAN_INDEX */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_VER */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_DIR */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_HWID */
+		nla_total_size(2) +
 		0;
 }
 
@@ -1417,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
-	if (t->index)
+	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+		goto nla_put_failure;
+
+	if (t->erspan_ver == 1) {
 		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
 			goto nla_put_failure;
+	} else if (t->erspan_ver == 2) {
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+			goto nla_put_failure;
+		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+			goto nla_put_failure;
+	}
 
 	return 0;
 
@@ -1455,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
 	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
-- 
cgit v1.2.3


From 94d7d8f2928701ef9b82527f889e0220dba11fa2 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 13 Dec 2017 16:38:57 -0800
Subject: ip6_gre: add erspan v2 support

Similar to support for ipv4 erspan, this patch adds
erspan v2 to ip6erspan tunnel.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |   3 ++
 net/ipv6/ip6_gre.c       | 120 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 107 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 109a5a8877ef..236e40ba06bf 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -37,6 +37,9 @@ struct __ip6_tnl_parm {
 
 	__u32			fwmark;
 	__u32			index;	/* ERSPAN type II index */
+	__u8			erspan_ver;	/* ERSPAN version */
+	__u8			dir;	/* direction */
+	__u16			hwid;	/* hwid */
 };
 
 /* IPv6 tunnel */
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 1303d0c44c36..5c9c65f1d5c2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -553,13 +553,28 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 				return PACKET_REJECT;
 
 			memcpy(md, pkt_md, sizeof(*md));
+			md->version = ver;
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 
 			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 
 		} else {
-			tunnel->parms.index = ntohl(pkt_md->u.index);
+			tunnel->parms.erspan_ver = ver;
+
+			if (ver == 1) {
+				tunnel->parms.index = ntohl(pkt_md->u.index);
+			} else {
+				u16 md2_flags;
+				u16 dir, hwid;
+
+				md2_flags = ntohs(pkt_md->u.md2.flags);
+				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+				tunnel->parms.dir = dir;
+				tunnel->parms.hwid = hwid;
+			}
+
 			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 		}
 
@@ -582,7 +597,8 @@ static int gre_rcv(struct sk_buff *skb)
 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
 		goto drop;
 
-	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
 			return 0;
 		goto drop;
@@ -927,9 +943,24 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		if (!md)
 			goto tx_err;
 
-		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-				    ntohl(md->u.index), truncate, false);
-
+		if (md->version == 1) {
+			erspan_build_header(skb,
+					    tunnel_id_to_key32(key->tun_id),
+					    ntohl(md->u.index), truncate,
+					    false);
+		} else if (md->version == 2) {
+			u16 md2_flags;
+			u16 dir, hwid;
+
+			md2_flags = ntohs(md->u.md2.flags);
+			dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+			hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+			erspan_build_header_v2(skb,
+					       tunnel_id_to_key32(key->tun_id),
+					       dir, hwid, truncate,
+					       false);
+		}
 	} else {
 		switch (skb->protocol) {
 		case htons(ETH_P_IP):
@@ -949,8 +980,15 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			break;
 		}
 
-		erspan_build_header(skb, t->parms.o_key, t->parms.index,
-				    truncate, false);
+		if (t->parms.erspan_ver == 1)
+			erspan_build_header(skb, t->parms.o_key,
+					    t->parms.index,
+					    truncate, false);
+		else
+			erspan_build_header_v2(skb, t->parms.o_key,
+					       t->parms.dir,
+					       t->parms.hwid,
+					       truncate, false);
 		fl6.daddr = t->parms.raddr;
 	}
 
@@ -1514,7 +1552,7 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
 				  struct netlink_ext_ack *extack)
 {
 	__be16 flags = 0;
-	int ret;
+	int ret, ver = 0;
 
 	if (!data)
 		return 0;
@@ -1543,12 +1581,35 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
 	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
 		return -EINVAL;
 
-	if (data[IFLA_GRE_ERSPAN_INDEX]) {
-		u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
-
-		if (index & ~INDEX_MASK)
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+		if (ver != 1 && ver != 2)
 			return -EINVAL;
 	}
+
+	if (ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+			if (index & ~INDEX_MASK)
+				return -EINVAL;
+		}
+	} else if (ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+
+			if (dir & ~(DIR_MASK >> DIR_OFFSET))
+				return -EINVAL;
+		}
+
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+
+			if (hwid & ~(HWID_MASK >> HWID_OFFSET))
+				return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1598,11 +1659,21 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 	if (data[IFLA_GRE_FWMARK])
 		parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
-	if (data[IFLA_GRE_ERSPAN_INDEX])
-		parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
-
 	if (data[IFLA_GRE_COLLECT_METADATA])
 		parms->collect_md = true;
+
+	if (data[IFLA_GRE_ERSPAN_VER])
+		parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+	if (parms->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX])
+			parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+	} else if (parms->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR])
+			parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+		if (data[IFLA_GRE_ERSPAN_HWID])
+			parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+	}
 }
 
 static int ip6gre_tap_init(struct net_device *dev)
@@ -1664,7 +1735,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 	tunnel->tun_hlen = 8;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
+		       erspan_hdr_len(tunnel->parms.erspan_ver);
 	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
 
 	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
@@ -1932,6 +2003,19 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
+	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
+		goto nla_put_failure;
+
+	if (p->erspan_ver == 1) {
+		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+			goto nla_put_failure;
+	} else if (p->erspan_ver == 2) {
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
+			goto nla_put_failure;
+		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -1957,6 +2041,9 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
 	[IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
 	[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
 };
 
 static void ip6erspan_tap_setup(struct net_device *dev)
@@ -2078,4 +2165,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
 MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
 MODULE_ALIAS_RTNL_LINK("ip6gre");
 MODULE_ALIAS_RTNL_LINK("ip6gretap");
+MODULE_ALIAS_RTNL_LINK("ip6erspan");
 MODULE_ALIAS_NETDEV("ip6gre0");
-- 
cgit v1.2.3


From 2d07a49aded490a0a4a2748e64030a0f59b6b8be Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:25 +0800
Subject: sctp: add basic structures and make chunk function for ifwdtsn

sctp_ifwdtsn_skip, sctp_ifwdtsn_hdr and sctp_ifwdtsn_chunk are used to
define and parse I-FWD TSN chunk format, and sctp_make_ifwdtsn is a
function to build the chunk.

The I-FORWARD-TSN Chunk Format is defined in section 2.3.1 of RFC8260.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       | 17 +++++++++++++++++
 include/net/sctp/sm.h      |  3 +++
 include/net/sctp/structs.h |  1 +
 net/sctp/sm_make_chunk.c   | 24 ++++++++++++++++++++++++
 4 files changed, 45 insertions(+)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 38e2cf66195f..b36c76635f18 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -110,6 +110,7 @@ enum sctp_cid {
 
 	/* Use hex, as defined in ADDIP sec. 3.1 */
 	SCTP_CID_ASCONF			= 0xC1,
+	SCTP_CID_I_FWD_TSN		= 0xC2,
 	SCTP_CID_ASCONF_ACK		= 0x80,
 	SCTP_CID_RECONF			= 0x82,
 }; /* enum */
@@ -616,6 +617,22 @@ struct sctp_fwdtsn_chunk {
 	struct sctp_fwdtsn_hdr fwdtsn_hdr;
 };
 
+struct sctp_ifwdtsn_skip {	/* one I-FORWARD-TSN skip entry */
+	__be16 stream;	/* stream identifier */
+	__u8 reserved;
+	__u8 flags;	/* presumably the U bit of RFC 8260 2.3.1 - confirm */
+	__be32 mid;	/* presumably the message identifier - confirm */
+};
+
+struct sctp_ifwdtsn_hdr {
+	__be32 new_cum_tsn;	/* new cumulative TSN, network byte order */
+	struct sctp_ifwdtsn_skip skip[0];	/* skip entries follow, if any */
+};
+
+struct sctp_ifwdtsn_chunk {
+	struct sctp_chunkhdr chunk_hdr;
+	struct sctp_ifwdtsn_hdr fwdtsn_hdr;	/* same name as in sctp_fwdtsn_chunk */
+};
 
 /* ADDIP
  * Section 3.1.1 Address Configuration Change Chunk (ASCONF)
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 0993b4953b3a..2883c43c5258 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -199,6 +199,9 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 				 const struct sctp_chunk *chunk);
 struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
 				   __u8 flags, int paylen, gfp_t gfp);
+struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
+				     __u32 new_cum_tsn, size_t nstreams,
+				     struct sctp_ifwdtsn_skip *skiplist);
 struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
 					    int len, __u8 flags, gfp_t gfp);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 8ef638d966f1..a5c3cf41e693 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -599,6 +599,7 @@ struct sctp_chunk {
 		struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 		struct sctp_authhdr *auth_hdr;
 		struct sctp_idatahdr *idata_hdr;
+		struct sctp_ifwdtsn_hdr *ifwdtsn_hdr;
 	} subh;
 
 	__u8 *chunk_end;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 23a7313d7972..b9b269cf615e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3536,6 +3536,30 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 	return retval;
 }
 
+struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
+				     __u32 new_cum_tsn, size_t nstreams,
+				     struct sctp_ifwdtsn_skip *skiplist)
+{
+	struct sctp_chunk *retval = NULL;
+	struct sctp_ifwdtsn_hdr ftsn_hdr;
+	size_t hint;
+
+	hint = (nstreams + 1) * sizeof(__u32);
+
+	retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint,
+				   GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
+	retval->subh.ifwdtsn_hdr =
+		sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);
+
+	sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist);
+
+	return retval;
+}
+
 /* RE-CONFIG 3.1 (RE-CONFIG chunk)
  *   0                   1                   2                   3
  *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-- 
cgit v1.2.3


From 8e0c3b73cec1b943affde91b3c412ad8266b4694 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:26 +0800
Subject: sctp: implement generate_ftsn for sctp_stream_interleave

generate_ftsn is added as a member of sctp_stream_interleave, used to
create fwdtsn or ifwdtsn chunk according to abandoned chunks, called
in sctp_retransmit and sctp_outq_sack.

sctp_generate_iftsn works for ifwdtsn, and sctp_generate_fwdtsn is
still used for making fwdtsn.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  2 +
 include/net/sctp/structs.h           |  1 +
 net/sctp/outqueue.c                  | 12 +++---
 net/sctp/stream_interleave.c         | 75 ++++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 501b2be049a3..66267dbcecba 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -47,6 +47,8 @@ struct sctp_stream_interleave {
 				 struct sctp_chunk *chunk, gfp_t gfp);
 	void	(*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
 	void	(*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
+	/* (I-)FORWARD-TSN process */
+	void	(*generate_ftsn)(struct sctp_outq *q, __u32 ctsn);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a5c3cf41e693..b7720d65a975 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1100,6 +1100,7 @@ void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
 void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
 void sctp_prsctp_prune(struct sctp_association *asoc,
 		       struct sctp_sndrcvinfo *sinfo, int msg_len);
+void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
 /* Uncork and flush an outqueue.  */
 static inline void sctp_outq_cork(struct sctp_outq *q)
 {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7d67feeeffc1..af9b5ebcae50 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -67,8 +67,6 @@ static void sctp_mark_missing(struct sctp_outq *q,
 			      __u32 highest_new_tsn,
 			      int count_of_newacks);
 
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
@@ -591,7 +589,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	 * following the procedures outlined in C1 - C5.
 	 */
 	if (reason == SCTP_RTXR_T3_RTX)
-		sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+		q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point);
 
 	/* Flush the queues only on timeout, since fast_rtx is only
 	 * triggered during sack processing and the queue
@@ -942,6 +940,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_ECN_ECNE:
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
+		case SCTP_CID_I_FWD_TSN:
 		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
@@ -956,7 +955,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			 * sender MUST assure that at least one T3-rtx
 			 * timer is running.
 			 */
-			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
+			    chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
 				sctp_transport_reset_t3_rtx(transport);
 				transport->last_time_sent = jiffies;
 			}
@@ -1372,7 +1372,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 
 	asoc->peer.rwnd = sack_a_rwnd;
 
-	sctp_generate_fwdtsn(q, sack_ctsn);
+	asoc->stream.si->generate_ftsn(q, sack_ctsn);
 
 	pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn);
 	pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, "
@@ -1795,7 +1795,7 @@ static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
 }
 
 /* Create and add a fwdtsn chunk to the outq's control queue if needed. */
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
+void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
 {
 	struct sctp_association *asoc = q->asoc;
 	struct sctp_chunk *ftsn_chunk = NULL;
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 87b9417c9892..2ead372e7a76 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1082,6 +1082,77 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 	sctp_ulpq_flush(ulpq);
 }
 
+static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist,
+				    int nskips, __be16 stream, __u8 flags)
+{
+	int i;
+
+	for (i = 0; i < nskips; i++)
+		if (skiplist[i].stream == stream &&
+		    skiplist[i].flags == flags)
+			return i;
+
+	return i;
+}
+
+#define SCTP_FTSN_U_BIT	0x1
+static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
+{
+	struct sctp_ifwdtsn_skip ftsn_skip_arr[10];
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_chunk *ftsn_chunk = NULL;
+	struct list_head *lchunk, *temp;
+	int nskips = 0, skip_pos;
+	struct sctp_chunk *chunk;
+	__u32 tsn;
+
+	if (!asoc->peer.prsctp_capable)
+		return;
+
+	if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
+		asoc->adv_peer_ack_point = ctsn;
+
+	list_for_each_safe(lchunk, temp, &q->abandoned) {
+		chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list);
+		tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+		if (TSN_lte(tsn, ctsn)) {
+			list_del_init(lchunk);
+			sctp_chunk_free(chunk);
+		} else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) {
+			__be16 sid = chunk->subh.idata_hdr->stream;
+			__be32 mid = chunk->subh.idata_hdr->mid;
+			__u8 flags = 0;
+
+			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+				flags |= SCTP_FTSN_U_BIT;
+
+			asoc->adv_peer_ack_point = tsn;
+			skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips,
+						     sid, flags);
+			ftsn_skip_arr[skip_pos].stream = sid;
+			ftsn_skip_arr[skip_pos].reserved = 0;
+			ftsn_skip_arr[skip_pos].flags = flags;
+			ftsn_skip_arr[skip_pos].mid = mid;
+			if (skip_pos == nskips)
+				nskips++;
+			if (nskips == 10)
+				break;
+		} else {
+			break;
+		}
+	}
+
+	if (asoc->adv_peer_ack_point > ctsn)
+		ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point,
+					       nskips, &ftsn_skip_arr[0]);
+
+	if (ftsn_chunk) {
+		list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
+		SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+	}
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
@@ -1093,6 +1164,8 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.renege_events		= sctp_ulpq_renege,
 	.start_pd		= sctp_ulpq_partial_delivery,
 	.abort_pd		= sctp_ulpq_abort_pd,
+	/* FORWARD-TSN process functions */
+	.generate_ftsn		= sctp_generate_fwdtsn,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -1106,6 +1179,8 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.renege_events		= sctp_renege_events,
 	.start_pd		= sctp_intl_start_pd,
 	.abort_pd		= sctp_intl_abort_pd,
+	/* I-FORWARD-TSN process functions */
+	.generate_ftsn		= sctp_generate_iftsn,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From 0fc2ea922c8ad5520c80f03facbf396c81dce802 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:27 +0800
Subject: sctp: implement validate_ftsn for sctp_stream_interleave

validate_ftsn is added as a member of sctp_stream_interleave, used to
validate ssn/chunk type for fwdtsn or mid (message id)/chunk type for
ifwdtsn, called in sctp_sf_eat_fwd_tsn, just as validate_data.

If this check fails, an abort packet will be sent, as said in section
2.3.1 of RFC8260.

As ifwdtsn and fwdtsn chunks have different lengths, it also defines
ftsn_chunk_len for sctp_stream_interleave to describe the chunk size.
Then it replaces all sizeof(struct sctp_fwdtsn_chunk) with
sctp_ftsnchk_len.

It also adds the processing for ifwdtsn in the rx path. As Marcelo
pointed out, there's no need to add an event table for ifwdtsn; it can
just share prsctp_chunk_event_table with fwdtsn. A fwdtsn chunk is
dropped when ifwdtsn is in use, and an ifwdtsn chunk is dropped when
fwdtsn is in use, by calling validate_ftsn in sctp_sf_eat_fwd_tsn.

After this patch, the ifwdtsn can be accepted.

Note that this patch also removes the sctp.intl_enable check for
idata chunks in sctp_chunk_event_lookup, as it will do this check
in validate_data later.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  2 ++
 include/net/sctp/structs.h           | 10 ++++++++
 net/sctp/sm_statefuns.c              | 24 +++++++-------------
 net/sctp/sm_statetable.c             |  4 ++--
 net/sctp/stream_interleave.c         | 44 ++++++++++++++++++++++++++++++++++++
 5 files changed, 66 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 66267dbcecba..0db15b50c5e6 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -33,6 +33,7 @@
 
 struct sctp_stream_interleave {
 	__u16	data_chunk_len;
+	__u16	ftsn_chunk_len;
 	/* (I-)DATA process */
 	struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
@@ -49,6 +50,7 @@ struct sctp_stream_interleave {
 	void	(*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
 	/* (I-)FORWARD-TSN process */
 	void	(*generate_ftsn)(struct sctp_outq *q, __u32 ctsn);
+	bool	(*validate_ftsn)(struct sctp_chunk *chunk);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index b7720d65a975..8ac4d5cdbfed 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1443,6 +1443,16 @@ static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream)
 	return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr);
 }
 
+static inline __u16 sctp_ftsnchk_len(const struct sctp_stream *stream)
+{
+	return stream->si->ftsn_chunk_len;
+}
+
+static inline __u16 sctp_ftsnhdr_len(const struct sctp_stream *stream)
+{
+	return stream->si->ftsn_chunk_len - sizeof(struct sctp_chunkhdr);
+}
+
 /* SCTP_GET_ASSOC_STATS counters */
 struct sctp_priv_assoc_stats {
 	/* Maximum observed rto in the association during subsequent
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index c609c5409910..541f34735346 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3957,7 +3957,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 {
 	struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 	struct sctp_chunk *chunk = arg;
-	struct sctp_fwdtsn_skip *skip;
 	__u16 len;
 	__u32 tsn;
 
@@ -3971,7 +3970,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
 
 	/* Make sure that the FORWARD_TSN chunk has valid length.  */
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3990,14 +3989,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 	if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
 		goto discard_noforce;
 
-	/* Silently discard the chunk if stream-id is not valid */
-	sctp_walk_fwdtsn(skip, chunk) {
-		if (ntohs(skip->stream) >= asoc->stream.incnt)
-			goto discard_noforce;
-	}
+	if (!asoc->stream.si->validate_ftsn(chunk))
+		goto discard_noforce;
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
-	if (len > sizeof(struct sctp_fwdtsn_hdr))
+	if (len > sctp_ftsnhdr_len(&asoc->stream))
 		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
 				SCTP_CHUNK(chunk));
 
@@ -4028,7 +4024,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 {
 	struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 	struct sctp_chunk *chunk = arg;
-	struct sctp_fwdtsn_skip *skip;
 	__u16 len;
 	__u32 tsn;
 
@@ -4042,7 +4037,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
 
 	/* Make sure that the FORWARD_TSN chunk has a valid length.  */
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -4061,14 +4056,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 	if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
 		goto gen_shutdown;
 
-	/* Silently discard the chunk if stream-id is not valid */
-	sctp_walk_fwdtsn(skip, chunk) {
-		if (ntohs(skip->stream) >= asoc->stream.incnt)
-			goto gen_shutdown;
-	}
+	if (!asoc->stream.si->validate_ftsn(chunk))
+		goto gen_shutdown;
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
-	if (len > sizeof(struct sctp_fwdtsn_hdr))
+	if (len > sctp_ftsnhdr_len(&asoc->stream))
 		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
 				SCTP_CHUNK(chunk));
 
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8c9bb4109f47..691d9dc620e3 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -985,14 +985,14 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
 	if (state > SCTP_STATE_MAX)
 		return &bug;
 
-	if (net->sctp.intl_enable && cid == SCTP_CID_I_DATA)
+	if (cid == SCTP_CID_I_DATA)
 		cid = SCTP_CID_DATA;
 
 	if (cid <= SCTP_CID_BASE_MAX)
 		return &chunk_event_table[cid][state];
 
 	if (net->sctp.prsctp_enable) {
-		if (cid == SCTP_CID_FWD_TSN)
+		if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN)
 			return &prsctp_chunk_event_table[0][state];
 	}
 
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 2ead372e7a76..cc4a5e320145 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1153,8 +1153,49 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
 	}
 }
 
+#define _sctp_walk_ifwdtsn(pos, chunk, end) \
+	for (pos = chunk->subh.ifwdtsn_hdr->skip; \
+	     (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++)
+
+#define sctp_walk_ifwdtsn(pos, ch) \
+	_sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \
+					sizeof(struct sctp_ifwdtsn_chunk))
+
+static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk)
+{
+	struct sctp_fwdtsn_skip *skip;
+	__u16 incnt;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN)
+		return false;
+
+	incnt = chunk->asoc->stream.incnt;
+	sctp_walk_fwdtsn(skip, chunk)
+		if (ntohs(skip->stream) >= incnt)
+			return false;
+
+	return true;
+}
+
+static bool sctp_validate_iftsn(struct sctp_chunk *chunk)
+{
+	struct sctp_ifwdtsn_skip *skip;
+	__u16 incnt;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN)
+		return false;
+
+	incnt = chunk->asoc->stream.incnt;
+	sctp_walk_ifwdtsn(skip, chunk)
+		if (ntohs(skip->stream) >= incnt)
+			return false;
+
+	return true;
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
+	.ftsn_chunk_len		= sizeof(struct sctp_fwdtsn_chunk),
 	/* DATA process functions */
 	.make_datafrag		= sctp_make_datafrag_empty,
 	.assign_number		= sctp_chunk_assign_ssn,
@@ -1166,10 +1207,12 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.abort_pd		= sctp_ulpq_abort_pd,
 	/* FORWARD-TSN process functions */
 	.generate_ftsn		= sctp_generate_fwdtsn,
+	.validate_ftsn		= sctp_validate_fwdtsn,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.data_chunk_len		= sizeof(struct sctp_idata_chunk),
+	.ftsn_chunk_len		= sizeof(struct sctp_ifwdtsn_chunk),
 	/* I-DATA process functions */
 	.make_datafrag		= sctp_make_idatafrag_empty,
 	.assign_number		= sctp_chunk_assign_mid,
@@ -1181,6 +1224,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.abort_pd		= sctp_intl_abort_pd,
 	/* I-FORWARD-TSN process functions */
 	.generate_ftsn		= sctp_generate_iftsn,
+	.validate_ftsn		= sctp_validate_iftsn,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From 47b20a88566f89dd0cc80c46f59ce0a12259d404 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:28 +0800
Subject: sctp: implement report_ftsn for sctp_stream_interleave

report_ftsn is added as a member of sctp_stream_interleave, used to
skip tsn from tsnmap, remove old events from reasm or lobby queue,
and abort pd for data or idata, called for SCTP_CMD_REPORT_FWDTSN
cmd and asoc reset.

sctp_report_iftsn works for ifwdtsn, and sctp_report_fwdtsn works
for fwdtsn. Note that sctp_report_iftsn doesn't do asoc abort_pd,
as stream abort_pd will be done when handling ifwdtsn. But when
ftsn is equal to the max tsn seen, which means asoc reset, asoc
abort_pd has to be done.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  1 +
 net/sctp/sm_sideeffect.c             |  9 +------
 net/sctp/stream.c                    |  6 ++---
 net/sctp/stream_interleave.c         | 48 ++++++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 0db15b50c5e6..0b55c70ac5af 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -51,6 +51,7 @@ struct sctp_stream_interleave {
 	/* (I-)FORWARD-TSN process */
 	void	(*generate_ftsn)(struct sctp_outq *q, __u32 ctsn);
 	bool	(*validate_ftsn)(struct sctp_chunk *chunk);
+	void	(*report_ftsn)(struct sctp_ulpq *ulpq, __u32 ftsn);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 8adde71fdb31..be7c6dbdb283 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1368,14 +1368,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_REPORT_FWDTSN:
-			/* Move the Cumulattive TSN Ack ahead. */
-			sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32);
-
-			/* purge the fragmentation queue */
-			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32);
-
-			/* Abort any in progress partial delivery. */
-			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32);
 			break;
 
 		case SCTP_CMD_PROCESS_FWDTSN:
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 8370e6cfe897..b3a9f37c1598 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -754,8 +754,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 	 *     performed.
 	 */
 	max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
-	sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen);
-	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+	asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen);
 
 	/* G1: Compute an appropriate value for the Receiver's Next TSN -- the
 	 *     TSN that the peer should use to send the next DATA chunk.  The
@@ -1024,8 +1023,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 						&asoc->peer.tsn_map);
 			LIST_HEAD(temp);
 
-			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn);
-			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn);
 
 			sctp_tsnmap_init(&asoc->peer.tsn_map,
 					 SCTP_TSN_MAP_INITIAL,
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index cc4a5e320145..f62771ccaf5d 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1193,6 +1193,52 @@ static bool sctp_validate_iftsn(struct sctp_chunk *chunk)
 	return true;
 }
 
+static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	/* Move the Cumulattive TSN Ack ahead. */
+	sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+	/* purge the fragmentation queue */
+	sctp_ulpq_reasm_flushtsn(ulpq, ftsn);
+	/* Abort any in progress partial delivery. */
+	sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC);
+}
+
+static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	struct sk_buff *pos, *tmp;
+
+	skb_queue_walk_safe(&ulpq->reasm, pos, tmp) {
+		struct sctp_ulpevent *event = sctp_skb2event(pos);
+		__u32 tsn = event->tsn;
+
+		if (TSN_lte(tsn, ftsn)) {
+			__skb_unlink(pos, &ulpq->reasm);
+			sctp_ulpevent_free(event);
+		}
+	}
+
+	skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) {
+		struct sctp_ulpevent *event = sctp_skb2event(pos);
+		__u32 tsn = event->tsn;
+
+		if (TSN_lte(tsn, ftsn)) {
+			__skb_unlink(pos, &ulpq->reasm_uo);
+			sctp_ulpevent_free(event);
+		}
+	}
+}
+
+static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	/* Move the Cumulattive TSN Ack ahead. */
+	sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+	/* purge the fragmentation queue */
+	sctp_intl_reasm_flushtsn(ulpq, ftsn);
+	/* abort only when it's for all data */
+	if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map))
+		sctp_intl_abort_pd(ulpq, GFP_ATOMIC);
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	.ftsn_chunk_len		= sizeof(struct sctp_fwdtsn_chunk),
@@ -1208,6 +1254,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	/* FORWARD-TSN process functions */
 	.generate_ftsn		= sctp_generate_fwdtsn,
 	.validate_ftsn		= sctp_validate_fwdtsn,
+	.report_ftsn		= sctp_report_fwdtsn,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -1225,6 +1272,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	/* I-FORWARD-TSN process functions */
 	.generate_ftsn		= sctp_generate_iftsn,
 	.validate_ftsn		= sctp_validate_iftsn,
+	.report_ftsn		= sctp_report_iftsn,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From de60fe9105431f504de9f8793b1da237a7d7f7ed Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:29 +0800
Subject: sctp: implement handle_ftsn for sctp_stream_interleave

handle_ftsn is added as a member of sctp_stream_interleave, used to skip
ssn for data or mid for idata, called for SCTP_CMD_PROCESS_FWDTSN cmd.

sctp_handle_iftsn works for ifwdtsn, and sctp_handle_fwdtsn works for
fwdtsn. Note that, unlike sctp_handle_fwdtsn, sctp_handle_iftsn can
also abort stream pd.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  2 ++
 net/sctp/sm_sideeffect.c             | 15 ++---------
 net/sctp/stream_interleave.c         | 49 ++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 0b55c70ac5af..6657711c8bc4 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -52,6 +52,8 @@ struct sctp_stream_interleave {
 	void	(*generate_ftsn)(struct sctp_outq *q, __u32 ctsn);
 	bool	(*validate_ftsn)(struct sctp_chunk *chunk);
 	void	(*report_ftsn)(struct sctp_ulpq *ulpq, __u32 ftsn);
+	void	(*handle_ftsn)(struct sctp_ulpq *ulpq,
+			       struct sctp_chunk *chunk);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index be7c6dbdb283..16ddf2ca1438 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1007,18 +1007,6 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 	}
 }
 
-/* Process variable FWDTSN chunk information. */
-static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq,
-				    struct sctp_chunk *chunk)
-{
-	struct sctp_fwdtsn_skip *skip;
-
-	/* Walk through all the skipped SSNs */
-	sctp_walk_fwdtsn(skip, chunk) {
-		sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
-	}
-}
-
 /* Helper function to remove the association non-primary peer
  * transports.
  */
@@ -1372,7 +1360,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_PROCESS_FWDTSN:
-			sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk);
+			asoc->stream.si->handle_ftsn(&asoc->ulpq,
+						     cmd->obj.chunk);
 			break;
 
 		case SCTP_CMD_GEN_SACK:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index f62771ccaf5d..8c7cf8f08711 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1239,6 +1239,53 @@ static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn)
 		sctp_intl_abort_pd(ulpq, GFP_ATOMIC);
 }
 
+static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+	struct sctp_fwdtsn_skip *skip;
+
+	/* Walk through all the skipped SSNs */
+	sctp_walk_fwdtsn(skip, chunk)
+		sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
+}
+
+static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
+			   __u8 flags)
+{
+	struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid);
+	struct sctp_stream *stream  = &ulpq->asoc->stream;
+
+	if (flags & SCTP_FTSN_U_BIT) {
+		if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) {
+			sin->pd_mode_uo = 0;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1,
+						  GFP_ATOMIC);
+		}
+		return;
+	}
+
+	if (MID_lt(mid, sctp_mid_peek(stream, in, sid)))
+		return;
+
+	if (sin->pd_mode) {
+		sin->pd_mode = 0;
+		sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC);
+	}
+
+	sctp_mid_skip(stream, in, sid, mid);
+
+	sctp_intl_reap_ordered(ulpq, sid);
+}
+
+static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+	struct sctp_ifwdtsn_skip *skip;
+
+	/* Walk through all the skipped MIDs and abort stream pd if possible */
+	sctp_walk_ifwdtsn(skip, chunk)
+		sctp_intl_skip(ulpq, ntohs(skip->stream),
+			       ntohl(skip->mid), skip->flags);
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	.ftsn_chunk_len		= sizeof(struct sctp_fwdtsn_chunk),
@@ -1255,6 +1302,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.generate_ftsn		= sctp_generate_fwdtsn,
 	.validate_ftsn		= sctp_validate_fwdtsn,
 	.report_ftsn		= sctp_report_fwdtsn,
+	.handle_ftsn		= sctp_handle_fwdtsn,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -1273,6 +1321,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.generate_ftsn		= sctp_generate_iftsn,
 	.validate_ftsn		= sctp_validate_iftsn,
 	.report_ftsn		= sctp_report_iftsn,
+	.handle_ftsn		= sctp_handle_iftsn,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
-- 
cgit v1.2.3


From ef4775e3402b7d45b06dddd79f860a5c4d5fc1cf Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:30 +0800
Subject: sctp: add stream interleave support in stream scheduler

As Marcelo said in the stream scheduler patch:

  Support for I-DATA chunks, also described in RFC8260, with user message
  interleaving is straightforward as it just requires the schedulers to
  probe for the feature and ignore datamsg boundaries when dequeueing.

All that is needed is to ignore datamsg boundaries when dequeueing.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream_sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index d8c162a4089c..f5fcd425232a 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -242,7 +242,8 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
 
 void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
 {
-	if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) {
+	if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
+	    !q->asoc->intl_enable) {
 		struct sctp_stream_out *sout;
 		__u16 sid;
 
-- 
cgit v1.2.3


From 107e242569541795a84264f94375e987ba04d309 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:31 +0800
Subject: sctp: update mid instead of ssn when doing stream and asoc reset

When using idata and doing stream and asoc reset, setting ssn to 0
would only clear the first 16 bits of mid.

So to make this work for both data and idata, it sets mid to 0
instead of ssn. mid_uo for unordered idata also needs to be cleared,
as said in section 2.3.2 of RFC8260.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index b3a9f37c1598..06b644dd858c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -216,11 +216,13 @@ void sctp_stream_clear(struct sctp_stream *stream)
 {
 	int i;
 
-	for (i = 0; i < stream->outcnt; i++)
-		stream->out[i].ssn = 0;
+	for (i = 0; i < stream->outcnt; i++) {
+		stream->out[i].mid = 0;
+		stream->out[i].mid_uo = 0;
+	}
 
 	for (i = 0; i < stream->incnt; i++)
-		stream->in[i].ssn = 0;
+		stream->in[i].mid = 0;
 }
 
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
@@ -607,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 		}
 
 		for (i = 0; i < nums; i++)
-			stream->in[ntohs(str_p[i])].ssn = 0;
+			stream->in[ntohs(str_p[i])].mid = 0;
 	} else {
 		for (i = 0; i < stream->incnt; i++)
-			stream->in[i].ssn = 0;
+			stream->in[i].mid = 0;
 	}
 
 	result = SCTP_STRRESET_PERFORMED;
@@ -783,10 +785,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 	/* G5:  The next expected and outgoing SSNs MUST be reset to 0 for all
 	 *      incoming and outgoing streams.
 	 */
-	for (i = 0; i < stream->outcnt; i++)
-		stream->out[i].ssn = 0;
+	for (i = 0; i < stream->outcnt; i++) {
+		stream->out[i].mid = 0;
+		stream->out[i].mid_uo = 0;
+	}
 	for (i = 0; i < stream->incnt; i++)
-		stream->in[i].ssn = 0;
+		stream->in[i].mid = 0;
 
 	result = SCTP_STRRESET_PERFORMED;
 
@@ -976,11 +980,15 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 		if (result == SCTP_STRRESET_PERFORMED) {
 			if (nums) {
-				for (i = 0; i < nums; i++)
-					stream->out[ntohs(str_p[i])].ssn = 0;
+				for (i = 0; i < nums; i++) {
+					stream->out[ntohs(str_p[i])].mid = 0;
+					stream->out[ntohs(str_p[i])].mid_uo = 0;
+				}
 			} else {
-				for (i = 0; i < stream->outcnt; i++)
-					stream->out[i].ssn = 0;
+				for (i = 0; i < stream->outcnt; i++) {
+					stream->out[i].mid = 0;
+					stream->out[i].mid_uo = 0;
+				}
 			}
 
 			flags = SCTP_STREAM_RESET_OUTGOING_SSN;
@@ -1041,10 +1049,12 @@ struct sctp_chunk *sctp_process_strreset_resp(
 			asoc->ctsn_ack_point = asoc->next_tsn - 1;
 			asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
 
-			for (i = 0; i < stream->outcnt; i++)
-				stream->out[i].ssn = 0;
+			for (i = 0; i < stream->outcnt; i++) {
+				stream->out[i].mid = 0;
+				stream->out[i].mid_uo = 0;
+			}
 			for (i = 0; i < stream->incnt; i++)
-				stream->in[i].ssn = 0;
+				stream->in[i].mid = 0;
 		}
 
 		for (i = 0; i < stream->outcnt; i++)
-- 
cgit v1.2.3


From 463118c34a3585768fa9715a2faca2b3697b1cf5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:32 +0800
Subject: sctp: support sysctl to allow users to use stream interleave

This is the last patch adding support for stream interleave. After this
patch, users can enable stream interleave with
sysctl -w net.sctp.intl_enable=1.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sysctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ef7ca44d6e6a..33ca5b73cdb3 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -288,6 +288,13 @@ static struct ctl_table sctp_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_sctp_do_auth,
 	},
+	{
+		.procname	= "intl_enable",
+		.data		= &init_net.sctp.intl_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "addr_scope_policy",
 		.data		= &init_net.sctp.scope_policy,
-- 
cgit v1.2.3


From 75e8e15635e08f2598ecd20f4f71f4d043dd6e68 Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Fri, 15 Dec 2017 16:16:40 +1100
Subject: net/ncsi: Don't take any action on HNCDSC AEN

The current HNCDSC handler takes the status flag from the AEN packet and
will update or change the current channel based on this flag and the
current channel status.

However the flag from the HNCDSC packet merely represents the host link
state. While the state of the host interface is potentially interesting
information it should not affect the state of the NCSI link. Indeed the
NCSI specification makes no mention of any recommended action related to
the host network controller driver state.

Update the HNCDSC handler to record the host network driver status but
take no other action.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Acked-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-aen.c | 35 +++--------------------------------
 1 file changed, 3 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 67e708e98ccf..e7b05de1e6d1 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -143,43 +143,14 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp,
 	if (!nc)
 		return -ENODEV;
 
-	/* If the channel is active one, we need reconfigure it */
 	spin_lock_irqsave(&nc->lock, flags);
 	ncm = &nc->modes[NCSI_MODE_LINK];
 	hncdsc = (struct ncsi_aen_hncdsc_pkt *)h;
 	ncm->data[3] = ntohl(hncdsc->status);
-	netdev_info(ndp->ndev.dev, "NCSI: HNCDSC AEN - channel %u state %s\n",
-		    nc->id, ncm->data[3] & 0x3 ? "up" : "down");
-	if (!list_empty(&nc->link) ||
-	    nc->state != NCSI_CHANNEL_ACTIVE) {
-		spin_unlock_irqrestore(&nc->lock, flags);
-		return 0;
-	}
-
-	spin_unlock_irqrestore(&nc->lock, flags);
-	if (!(ndp->flags & NCSI_DEV_HWA) && !(ncm->data[3] & 0x1))
-		ndp->flags |= NCSI_DEV_RESHUFFLE;
-
-	/* If this channel is the active one and the link doesn't
-	 * work, we have to choose another channel to be active one.
-	 * The logic here is exactly similar to what we do when link
-	 * is down on the active channel.
-	 *
-	 * On the other hand, we need configure it when host driver
-	 * state on the active channel becomes ready.
-	 */
-	ncsi_stop_channel_monitor(nc);
-
-	spin_lock_irqsave(&nc->lock, flags);
-	nc->state = (ncm->data[3] & 0x1) ? NCSI_CHANNEL_INACTIVE :
-					   NCSI_CHANNEL_ACTIVE;
 	spin_unlock_irqrestore(&nc->lock, flags);
-
-	spin_lock_irqsave(&ndp->lock, flags);
-	list_add_tail_rcu(&nc->link, &ndp->channel_queue);
-	spin_unlock_irqrestore(&ndp->lock, flags);
-
-	ncsi_process_next_channel(ndp);
+	netdev_printk(KERN_DEBUG, ndp->ndev.dev,
+		      "NCSI: host driver %srunning on channel %u\n",
+		      ncm->data[3] & 0x1 ? "" : "not ", nc->id);
 
 	return 0;
 }
-- 
cgit v1.2.3


From ae3e13373b879670d873e5657a903bd208f0dc40 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 15 Dec 2017 14:27:43 -0800
Subject: net: erspan: fix wrong return value

If pskb_may_pull() fails, return PACKET_REJECT instead of -ENOMEM.

Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support")
Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre")
Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Acked-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 2 +-
 net/ipv6/ip6_gre.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9a80d84fc182..b3a32a980240 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -287,7 +287,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	if (tunnel) {
 		len = gre_hdr_len + erspan_hdr_len(ver);
 		if (unlikely(!pskb_may_pull(skb, len)))
-			return -ENOMEM;
+			return PACKET_REJECT;
 
 		if (__iptunnel_pull_header(skb,
 					   len,
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 5c9c65f1d5c2..b3e4e0384f36 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -524,7 +524,7 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 		int len = erspan_hdr_len(ver);
 
 		if (unlikely(!pskb_may_pull(skb, len)))
-			return -ENOMEM;
+			return PACKET_REJECT;
 
 		if (__iptunnel_pull_header(skb, len,
 					   htons(ETH_P_TEB),
-- 
cgit v1.2.3


From d91e8db5b629a3c8c81db4dc317a66c7b5591821 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 15 Dec 2017 14:27:44 -0800
Subject: net: erspan: reload pointer after pskb_may_pull

pskb_may_pull() can change skb->data, so we need to re-load pkt_md
and ershdr at the right place.

Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support")
Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre")
Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 4 +++-
 net/ipv6/ip6_gre.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b3a32a980240..fd4d6e96da7e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -279,7 +279,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	 * Use ERSPAN 10-bit session ID as key.
 	 */
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
-	pkt_md = (struct erspan_metadata *)(ershdr + 1);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 				  tpi->flags | TUNNEL_KEY,
 				  iph->saddr, iph->daddr, tpi->key);
@@ -289,6 +288,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 		if (unlikely(!pskb_may_pull(skb, len)))
 			return PACKET_REJECT;
 
+		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+		pkt_md = (struct erspan_metadata *)(ershdr + 1);
+
 		if (__iptunnel_pull_header(skb,
 					   len,
 					   htons(ETH_P_TEB),
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b3e4e0384f36..87b9892dfa23 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -515,7 +515,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 
 	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
-	pkt_md = (struct erspan_metadata *)(ershdr + 1);
 
 	tunnel = ip6gre_tunnel_lookup(skb->dev,
 				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
@@ -526,6 +525,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 		if (unlikely(!pskb_may_pull(skb, len)))
 			return PACKET_REJECT;
 
+		ershdr = (struct erspan_base_hdr *)skb->data;
+		pkt_md = (struct erspan_metadata *)(ershdr + 1);
+
 		if (__iptunnel_pull_header(skb, len,
 					   htons(ETH_P_TEB),
 					   false, false) < 0)
-- 
cgit v1.2.3


From c060bc6115b6a204cb60e6b03fe64135731bc6c8 Mon Sep 17 00:00:00 2001
From: Xiongwei Song <sxwjean@gmail.com>
Date: Tue, 19 Dec 2017 07:17:15 +0800
Subject: bpf: make function xdp_do_generic_redirect_map() static

The function xdp_do_generic_redirect_map() is only used in this file, so
make it static.

Clean up sparse warning:
net/core/filter.c:2687:5: warning: no previous prototype
for 'xdp_do_generic_redirect_map' [-Wmissing-prototypes]

Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 754abe1041b7..130b842c3a15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2684,8 +2684,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
 	return 0;
 }
 
-int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
-				struct bpf_prog *xdp_prog)
+static int xdp_do_generic_redirect_map(struct net_device *dev,
+				       struct sk_buff *skb,
+				       struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
-- 
cgit v1.2.3


From 0973dd45ecefd746569d414406f5733062fe2817 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Dec 2017 10:10:48 +0100
Subject: Revert "mac80211: Add airtime account and scheduling to TXQs"

This reverts commit b0d52ad821843a6c5badebd80feef9f871904fa6.

We need to revert the TXQ scheduling API due to conflicts
with a new driver, and this depends on that API.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 24 ------------------------
 net/mac80211/debugfs.c     |  1 -
 net/mac80211/debugfs_sta.c | 29 -----------------------------
 net/mac80211/ieee80211_i.h |  8 ++------
 net/mac80211/main.c        |  3 +--
 net/mac80211/rx.c          |  8 --------
 net/mac80211/sta_info.c    |  2 --
 net/mac80211/sta_info.h    |  7 -------
 net/mac80211/status.c      | 16 ----------------
 net/mac80211/tx.c          | 31 +++++--------------------------
 10 files changed, 8 insertions(+), 121 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 531b526a10db..45155803c875 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1188,8 +1188,6 @@ enum mac80211_rx_encoding {
  *	HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT)
  * @nss: number of streams (VHT and HE only)
  * @flag: %RX_FLAG_\*
- * @airtime: Duration of frame in usec. See @IEEE80211_HW_AIRTIME_ACCOUNTING for
- *       how to use this.
  * @encoding: &enum mac80211_rx_encoding
  * @bw: &enum rate_info_bw
  * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags
@@ -1204,7 +1202,6 @@ struct ieee80211_rx_status {
 	u32 device_timestamp;
 	u32 ampdu_reference;
 	u32 flag;
-	u16 airtime;
 	u16 freq;
 	u8 enc_flags;
 	u8 encoding:2, bw:3;
@@ -2069,26 +2066,6 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
  *	TDLS links.
  *
- * @IEEE80211_HW_AIRTIME_ACCOUNTING: Hardware supports accounting the airtime
- *      usage of other stations and reports it in the @tx_time and/or @airtime
- *      fields of the TX/RX status structs.
- *      When setting this flag, the driver should ensure that the respective
- *      fields in the TX and RX status structs are always either zero or
- *      contains a valid duration for the frame in usec. The driver can choose
- *      to report either or both of TX and RX airtime, but it is recommended to
- *      report both.
- *      The reported airtime should as a minimum include all time that is spent
- *      transmitting to the remote station, including overhead and padding, but
- *      not including time spent waiting for a TXOP. If the time is not reported
- *      by the hardware it can in some cases be calculated from the rate and
- *      known frame composition. When possible, the time should include any
- *      failed transmission attempts.
- *      For aggregated frames, there are two possible strategies to report the
- *      airtime: Either include the airtime of the entire aggregate in the first
- *      (or last) frame and leave the others at zero. Alternatively, include the
- *      overhead of the full aggregate in the first or last frame and report the
- *      time of each frame + padding not including the full aggregate overhead.
- *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2132,7 +2109,6 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_REPORTS_LOW_ACK,
 	IEEE80211_HW_SUPPORTS_TX_FRAG,
 	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
-	IEEE80211_HW_AIRTIME_ACCOUNTING,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index d6b87a4ec3e9..1f466d12a6bc 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -212,7 +212,6 @@ static const char *hw_flag_names[] = {
 	FLAG(REPORTS_LOW_ACK),
 	FLAG(SUPPORTS_TX_FRAG),
 	FLAG(SUPPORTS_TDLS_BUFFER_STA),
-	FLAG(AIRTIME_ACCOUNTING),
 #undef FLAG
 };
 
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 40dba446836f..b15412c21ac9 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -188,32 +188,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 }
 STA_OPS(aqm);
 
-static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
-				size_t count, loff_t *ppos)
-{
-	struct sta_info *sta = file->private_data;
-	size_t bufsz = 200;
-	char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
-	ssize_t rv;
-
-	if (!buf)
-		return -ENOMEM;
-
-	spin_lock_bh(&sta->lock);
-
-	p += scnprintf(p, bufsz + buf - p,
-		"RX: %llu us\nTX: %llu us\nDeficit: %lld us\n",
-		sta->airtime_stats.rx_airtime,
-		sta->airtime_stats.tx_airtime,
-		sta->airtime_deficit);
-
-	spin_unlock_bh(&sta->lock);
-	rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
-	kfree(buf);
-	return rv;
-}
-STA_OPS(airtime);
-
 static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
 					size_t count, loff_t *ppos)
 {
@@ -568,9 +542,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	if (local->ops->wake_tx_queue)
 		DEBUGFS_ADD(aqm);
 
-	if (ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING))
-		DEBUGFS_ADD(airtime);
-
 	if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
 		debugfs_create_x32("driver_buffered_tids", 0400,
 				   sta->debugfs_dir,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 120c516851cf..4155838c7bef 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -90,9 +90,6 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
 
 #define IEEE80211_MAX_NAN_INSTANCE_ID 255
 
-/* How much to increase airtime deficit on each scheduling round */
-#define IEEE80211_AIRTIME_QUANTUM        1000 /* usec */
-
 struct ieee80211_fragment_entry {
 	struct sk_buff_head skb_list;
 	unsigned long first_frag_time;
@@ -1126,10 +1123,9 @@ struct ieee80211_local {
 	struct codel_vars *cvars;
 	struct codel_params cparams;
 
-	/* protects active_txqs_{new,old} and txqi->schedule_order */
+	/* protects active_txqs and txqi->schedule_order */
 	spinlock_t active_txq_lock;
-	struct list_head active_txqs_new;
-	struct list_head active_txqs_old;
+	struct list_head active_txqs;
 
 	const struct ieee80211_ops *ops;
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index b7142f8491d0..935d6e2491b1 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -619,8 +619,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	spin_lock_init(&local->rx_path_lock);
 	spin_lock_init(&local->queue_stop_reason_lock);
 
-	INIT_LIST_HEAD(&local->active_txqs_new);
-	INIT_LIST_HEAD(&local->active_txqs_old);
+	INIT_LIST_HEAD(&local->active_txqs);
 	spin_lock_init(&local->active_txq_lock);
 
 	INIT_LIST_HEAD(&local->chanctx_list);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 808f41fb536a..b3cff69bfd66 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1630,14 +1630,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 	if (ieee80211_vif_is_mesh(&rx->sdata->vif))
 		ieee80211_mps_rx_h_sta_process(sta, hdr);
 
-	/* airtime accounting */
-	if (status->airtime) {
-		spin_lock_bh(&sta->lock);
-		sta->airtime_stats.rx_airtime += status->airtime;
-		sta->airtime_deficit -= status->airtime;
-		spin_unlock_bh(&sta->lock);
-	}
-
 	/*
 	 * Drop (qos-)data::nullfunc frames silently, since they
 	 * are used only to control station power saving mode.
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index ed5500e8aafb..e0bcf16df494 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -425,8 +425,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	sta->cparams.interval = MS2TIME(100);
 	sta->cparams.ecn = true;
 
-	sta->airtime_deficit = IEEE80211_AIRTIME_QUANTUM;
-
 	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
 
 	return sta;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index e356f2f85e12..cd53619435b6 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -559,13 +559,6 @@ struct sta_info {
 	} tx_stats;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
 
-	/* Airtime stats and deficit, protected by lock */
-	struct {
-		u64 rx_airtime;
-		u64 tx_airtime;
-	} airtime_stats;
-	s64 airtime_deficit;
-
 	/*
 	 * Aggregation information, locked with lock.
 	 */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index b044dbed2bb1..da7427a41529 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -823,14 +823,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 				ieee80211_lost_packet(sta, info);
 			}
 		}
-
-		if (info->status.tx_time &&
-		    ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING)) {
-			spin_lock_bh(&sta->lock);
-			sta->airtime_stats.tx_airtime += info->status.tx_time;
-			sta->airtime_deficit -= info->status.tx_time;
-			spin_unlock_bh(&sta->lock);
-		}
 	}
 
 	/* SNMP counters
@@ -955,14 +947,6 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 			sta->status_stats.retry_failed++;
 		sta->status_stats.retry_count += retry_count;
 
-		if (info->status.tx_time &&
-		    ieee80211_hw_check(&local->hw, AIRTIME_ACCOUNTING)) {
-			spin_lock_bh(&sta->lock);
-			sta->airtime_stats.tx_airtime += info->status.tx_time;
-			sta->airtime_deficit -= info->status.tx_time;
-			spin_unlock_bh(&sta->lock);
-		}
-
 		if (acked) {
 			sta->status_stats.last_ack = jiffies;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 18381581b5e9..842881ca8f20 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3566,7 +3566,7 @@ bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
 	spin_lock_bh(&local->active_txq_lock);
 
 	if (list_empty(&txqi->schedule_order)) {
-		list_add_tail(&txqi->schedule_order, &local->active_txqs_new);
+		list_add_tail(&txqi->schedule_order, &local->active_txqs);
 		ret = true;
 	}
 
@@ -3580,35 +3580,14 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = NULL;
-	struct list_head *head;
 
 	spin_lock_bh(&local->active_txq_lock);
 
-begin:
-	head = &local->active_txqs_new;
-	if (list_empty(head)) {
-		head = &local->active_txqs_old;
-		if (list_empty(head))
-			goto out;
-	}
-
-	txqi = list_first_entry(head, struct txq_info, schedule_order);
-
-	if (txqi->txq.sta) {
-		struct sta_info *sta = container_of(txqi->txq.sta,
-						struct sta_info, sta);
-
-		spin_lock_bh(&sta->lock);
-		if (sta->airtime_deficit < 0) {
-			sta->airtime_deficit += IEEE80211_AIRTIME_QUANTUM;
-			list_move_tail(&txqi->schedule_order,
-				       &local->active_txqs_old);
-			spin_unlock_bh(&sta->lock);
-			goto begin;
-		}
-		spin_unlock_bh(&sta->lock);
-	}
+	if (list_empty(&local->active_txqs))
+		goto out;
 
+	txqi = list_first_entry(&local->active_txqs,
+				struct txq_info, schedule_order);
 	list_del_init(&txqi->schedule_order);
 
 out:
-- 
cgit v1.2.3


From e7881bd5942df7df2fc450fd2aaa753fc4c4e125 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Dec 2017 10:11:54 +0100
Subject: Revert "mac80211: Add TXQ scheduling API"

This reverts commit e937b8da5a591f141fe41aa48a2e898df9888c95.

Turns out that a new driver (mt76) is coming in through
Kalle's tree, and will conflict with this. It also has some
conflicting requirements, so we'll revisit this later.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/core.c |   2 +
 drivers/net/wireless/ath/ath10k/core.h |   4 +
 drivers/net/wireless/ath/ath10k/mac.c  |  55 ++++++---
 drivers/net/wireless/ath/ath9k/ath9k.h |   9 +-
 drivers/net/wireless/ath/ath9k/main.c  |   2 +-
 drivers/net/wireless/ath/ath9k/recv.c  |   2 +
 drivers/net/wireless/ath/ath9k/xmit.c  | 210 +++++++++++++++++++++++++--------
 include/net/mac80211.h                 |  37 +-----
 net/mac80211/agg-tx.c                  |   6 +-
 net/mac80211/driver-ops.h              |  12 +-
 net/mac80211/ieee80211_i.h             |   5 -
 net/mac80211/main.c                    |   3 -
 net/mac80211/sta_info.c                |   7 +-
 net/mac80211/trace.h                   |  32 ++++-
 net/mac80211/tx.c                      |  49 +-------
 15 files changed, 262 insertions(+), 173 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 90d16a38475f..b29fdbd21ead 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2574,7 +2574,9 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev,
 
 	mutex_init(&ar->conf_mutex);
 	spin_lock_init(&ar->data_lock);
+	spin_lock_init(&ar->txqs_lock);
 
+	INIT_LIST_HEAD(&ar->txqs);
 	INIT_LIST_HEAD(&ar->peers);
 	init_waitqueue_head(&ar->peer_mapping_wq);
 	init_waitqueue_head(&ar->htt.empty_tx_wq);
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index 4a79fdce9a08..643041ef3271 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -347,6 +347,7 @@ struct ath10k_peer {
 };
 
 struct ath10k_txq {
+	struct list_head list;
 	unsigned long num_fw_queued;
 	unsigned long num_push_allowed;
 };
@@ -894,7 +895,10 @@ struct ath10k {
 
 	/* protects shared structure data */
 	spinlock_t data_lock;
+	/* protects: ar->txqs, artxq->list */
+	spinlock_t txqs_lock;
 
+	struct list_head txqs;
 	struct list_head arvifs;
 	struct list_head peers;
 	struct ath10k_peer *peer_map[ATH10K_MAX_NUM_PEER_IDS];
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index cca4cd82853b..0a947eef348d 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3830,10 +3830,12 @@ static void ath10k_mac_txq_init(struct ieee80211_txq *txq)
 		return;
 
 	artxq = (void *)txq->drv_priv;
+	INIT_LIST_HEAD(&artxq->list);
 }
 
 static void ath10k_mac_txq_unref(struct ath10k *ar, struct ieee80211_txq *txq)
 {
+	struct ath10k_txq *artxq;
 	struct ath10k_skb_cb *cb;
 	struct sk_buff *msdu;
 	int msdu_id;
@@ -3841,6 +3843,12 @@ static void ath10k_mac_txq_unref(struct ath10k *ar, struct ieee80211_txq *txq)
 	if (!txq)
 		return;
 
+	artxq = (void *)txq->drv_priv;
+	spin_lock_bh(&ar->txqs_lock);
+	if (!list_empty(&artxq->list))
+		list_del_init(&artxq->list);
+	spin_unlock_bh(&ar->txqs_lock);
+
 	spin_lock_bh(&ar->htt.tx_lock);
 	idr_for_each_entry(&ar->htt.pending_tx, msdu, msdu_id) {
 		cb = ATH10K_SKB_CB(msdu);
@@ -3970,17 +3978,23 @@ int ath10k_mac_tx_push_txq(struct ieee80211_hw *hw,
 void ath10k_mac_tx_push_pending(struct ath10k *ar)
 {
 	struct ieee80211_hw *hw = ar->hw;
-	struct ieee80211_txq *txq, *first = NULL;
+	struct ieee80211_txq *txq;
+	struct ath10k_txq *artxq;
+	struct ath10k_txq *last;
 	int ret;
 	int max;
 
 	if (ar->htt.num_pending_tx >= (ar->htt.max_num_pending_tx / 2))
 		return;
 
+	spin_lock_bh(&ar->txqs_lock);
 	rcu_read_lock();
 
-	txq = ieee80211_next_txq(hw);
-	while (txq) {
+	last = list_last_entry(&ar->txqs, struct ath10k_txq, list);
+	while (!list_empty(&ar->txqs)) {
+		artxq = list_first_entry(&ar->txqs, struct ath10k_txq, list);
+		txq = container_of((void *)artxq, struct ieee80211_txq,
+				   drv_priv);
 
 		/* Prevent aggressive sta/tid taking over tx queue */
 		max = 16;
@@ -3991,21 +4005,18 @@ void ath10k_mac_tx_push_pending(struct ath10k *ar)
 				break;
 		}
 
+		list_del_init(&artxq->list);
 		if (ret != -ENOENT)
-			ieee80211_schedule_txq(hw, txq);
+			list_add_tail(&artxq->list, &ar->txqs);
 
 		ath10k_htt_tx_txq_update(hw, txq);
 
-		if (first == txq || (ret < 0 && ret != -ENOENT))
+		if (artxq == last || (ret < 0 && ret != -ENOENT))
 			break;
-
-		if (!first)
-			first = txq;
-
-		txq = ieee80211_next_txq(hw);
 	}
 
 	rcu_read_unlock();
+	spin_unlock_bh(&ar->txqs_lock);
 }
 
 /************/
@@ -4239,22 +4250,34 @@ static void ath10k_mac_op_tx(struct ieee80211_hw *hw,
 	}
 }
 
-static void ath10k_mac_op_wake_tx_queue(struct ieee80211_hw *hw)
+static void ath10k_mac_op_wake_tx_queue(struct ieee80211_hw *hw,
+					struct ieee80211_txq *txq)
 {
-	struct ieee80211_txq *txq;
+	struct ath10k *ar = hw->priv;
+	struct ath10k_txq *artxq = (void *)txq->drv_priv;
+	struct ieee80211_txq *f_txq;
+	struct ath10k_txq *f_artxq;
 	int ret = 0;
 	int max = 16;
 
-	txq = ieee80211_next_txq(hw);
+	spin_lock_bh(&ar->txqs_lock);
+	if (list_empty(&artxq->list))
+		list_add_tail(&artxq->list, &ar->txqs);
+
+	f_artxq = list_first_entry(&ar->txqs, struct ath10k_txq, list);
+	f_txq = container_of((void *)f_artxq, struct ieee80211_txq, drv_priv);
+	list_del_init(&f_artxq->list);
 
-	while (ath10k_mac_tx_can_push(hw, txq) && max--) {
-		ret = ath10k_mac_tx_push_txq(hw, txq);
+	while (ath10k_mac_tx_can_push(hw, f_txq) && max--) {
+		ret = ath10k_mac_tx_push_txq(hw, f_txq);
 		if (ret)
 			break;
 	}
 	if (ret != -ENOENT)
-		ieee80211_schedule_txq(hw, txq);
+		list_add_tail(&f_artxq->list, &ar->txqs);
+	spin_unlock_bh(&ar->txqs_lock);
 
+	ath10k_htt_tx_txq_update(hw, f_txq);
 	ath10k_htt_tx_txq_update(hw, txq);
 }
 
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index face2bb7f357..ef0de4f1312c 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -246,8 +246,12 @@ struct ath_atx_tid {
 	s8 bar_index;
 	bool active;
 	bool clear_ps_filter;
+	bool has_queued;
 };
 
+void __ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid);
+void ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid);
+
 struct ath_node {
 	struct ath_softc *sc;
 	struct ieee80211_sta *sta; /* station struct we're part of */
@@ -587,7 +591,8 @@ bool ath_drain_all_txq(struct ath_softc *sc);
 void ath_draintxq(struct ath_softc *sc, struct ath_txq *txq);
 void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an);
 void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an);
-void ath_txq_schedule(struct ath_softc *sc);
+void ath_txq_schedule(struct ath_softc *sc, struct ath_txq *txq);
+void ath_txq_schedule_all(struct ath_softc *sc);
 int ath_tx_init(struct ath_softc *sc, int nbufs);
 int ath_txq_update(struct ath_softc *sc, int qnum,
 		   struct ath9k_tx_queue_info *q);
@@ -613,7 +618,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
 				   u16 tids, int nframes,
 				   enum ieee80211_frame_release_type reason,
 				   bool more_data);
-void ath9k_wake_tx_queue(struct ieee80211_hw *hw);
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue);
 
 /********/
 /* VIFs */
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index f7dfcdf508ce..a3be8add56e1 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -266,7 +266,7 @@ static bool ath_complete_reset(struct ath_softc *sc, bool start)
 		}
 	work:
 		ath_restart_work(sc);
-		ath_txq_schedule(sc);
+		ath_txq_schedule_all(sc);
 	}
 
 	sc->gtt_cnt = 0;
diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index a768e841524d..2197aee2bb72 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -1057,6 +1057,8 @@ static void ath_rx_count_airtime(struct ath_softc *sc,
  	if (!!(sc->airtime_flags & AIRTIME_USE_RX)) {
 		spin_lock_bh(&acq->lock);
 		an->airtime_deficit[acno] -= airtime;
+		if (an->airtime_deficit[acno] <= 0)
+			__ath_tx_queue_tid(sc, ATH_AN_2_TID(an, tidno));
 		spin_unlock_bh(&acq->lock);
 	}
 	ath_debug_airtime(sc, an, airtime, 0);
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index bd438062a6db..396bf05c6bf6 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -112,11 +112,62 @@ void ath_txq_unlock_complete(struct ath_softc *sc, struct ath_txq *txq)
 		ath_tx_status(hw, skb);
 }
 
-void ath9k_wake_tx_queue(struct ieee80211_hw *hw)
+void __ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
+{
+	struct ath_vif *avp = (struct ath_vif *) tid->an->vif->drv_priv;
+	struct ath_chanctx *ctx = avp->chanctx;
+	struct ath_acq *acq;
+	struct list_head *tid_list;
+	u8 acno = TID_TO_WME_AC(tid->tidno);
+
+	if (!ctx || !list_empty(&tid->list))
+		return;
+
+
+	acq = &ctx->acq[acno];
+	if ((sc->airtime_flags & AIRTIME_USE_NEW_QUEUES) &&
+	    tid->an->airtime_deficit[acno] > 0)
+		tid_list = &acq->acq_new;
+	else
+		tid_list = &acq->acq_old;
+
+	list_add_tail(&tid->list, tid_list);
+}
+
+void ath_tx_queue_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
+{
+	struct ath_vif *avp = (struct ath_vif *) tid->an->vif->drv_priv;
+	struct ath_chanctx *ctx = avp->chanctx;
+	struct ath_acq *acq;
+
+	if (!ctx || !list_empty(&tid->list))
+		return;
+
+	acq = &ctx->acq[TID_TO_WME_AC(tid->tidno)];
+	spin_lock_bh(&acq->lock);
+	__ath_tx_queue_tid(sc, tid);
+	spin_unlock_bh(&acq->lock);
+}
+
+
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue)
 {
 	struct ath_softc *sc = hw->priv;
+	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	struct ath_atx_tid *tid = (struct ath_atx_tid *) queue->drv_priv;
+	struct ath_txq *txq = tid->txq;
+
+	ath_dbg(common, QUEUE, "Waking TX queue: %pM (%d)\n",
+		queue->sta ? queue->sta->addr : queue->vif->addr,
+		tid->tidno);
+
+	ath_txq_lock(sc, txq);
 
-	ath_txq_schedule(sc);
+	tid->has_queued = true;
+	ath_tx_queue_tid(sc, tid);
+	ath_txq_schedule(sc, txq);
+
+	ath_txq_unlock(sc, txq);
 }
 
 static struct ath_frame_info *get_frame_info(struct sk_buff *skb)
@@ -179,9 +230,14 @@ ath_tid_pull(struct ath_atx_tid *tid)
 	struct ath_frame_info *fi;
 	int q;
 
+	if (!tid->has_queued)
+		return NULL;
+
 	skb = ieee80211_tx_dequeue(hw, txq);
-	if (!skb)
+	if (!skb) {
+		tid->has_queued = false;
 		return NULL;
+	}
 
 	if (ath_tx_prepare(hw, skb, &txctl)) {
 		ieee80211_free_txskb(hw, skb);
@@ -198,6 +254,12 @@ ath_tid_pull(struct ath_atx_tid *tid)
 	return skb;
  }
 
+
+static bool ath_tid_has_buffered(struct ath_atx_tid *tid)
+{
+	return !skb_queue_empty(&tid->retry_q) || tid->has_queued;
+}
+
 static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
 {
 	struct sk_buff *skb;
@@ -609,10 +671,7 @@ static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq,
 
 		skb_queue_splice_tail(&bf_pending, &tid->retry_q);
 		if (!an->sleeping) {
-			struct ieee80211_txq *queue = container_of(
-				(void *)tid, struct ieee80211_txq, drv_priv);
-
-			ieee80211_schedule_txq(sc->hw, queue);
+			ath_tx_queue_tid(sc, tid);
 
 			if (ts->ts_status & (ATH9K_TXERR_FILT | ATH9K_TXERR_XRETRY))
 				tid->clear_ps_filter = true;
@@ -660,6 +719,8 @@ static void ath_tx_count_airtime(struct ath_softc *sc, struct ath_node *an,
 
 		spin_lock_bh(&acq->lock);
 		an->airtime_deficit[q] -= airtime;
+		if (an->airtime_deficit[q] <= 0)
+			__ath_tx_queue_tid(sc, tid);
 		spin_unlock_bh(&acq->lock);
 	}
 	ath_debug_airtime(sc, an, 0, airtime);
@@ -709,6 +770,8 @@ static void ath_tx_process_buffer(struct ath_softc *sc, struct ath_txq *txq,
 	} else
 		ath_tx_complete_aggr(sc, txq, bf, bf_head, sta, tid, ts, txok);
 
+	if (!flush)
+		ath_txq_schedule(sc, txq);
 }
 
 static bool ath_lookup_legacy(struct ath_buf *bf)
@@ -1443,8 +1506,8 @@ ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
 	} while (1);
 }
 
-static int ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
-			     struct ath_atx_tid *tid)
+static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
+			      struct ath_atx_tid *tid)
 {
 	struct ath_buf *bf;
 	struct ieee80211_tx_info *tx_info;
@@ -1452,18 +1515,21 @@ static int ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 	int aggr_len = 0;
 	bool aggr;
 
+	if (!ath_tid_has_buffered(tid))
+		return false;
+
 	INIT_LIST_HEAD(&bf_q);
 
 	bf = ath_tx_get_tid_subframe(sc, txq, tid);
 	if (!bf)
-		return -ENOENT;
+		return false;
 
 	tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
 	aggr = !!(tx_info->flags & IEEE80211_TX_CTL_AMPDU);
 	if ((aggr && txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH) ||
 	    (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
 		__skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
-		return -ENOBUFS;
+		return false;
 	}
 
 	ath_set_rates(tid->an->vif, tid->an->sta, bf);
@@ -1473,7 +1539,7 @@ static int ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 		ath_tx_form_burst(sc, txq, tid, &bf_q, bf);
 
 	if (list_empty(&bf_q))
-		return -ENOENT;
+		return false;
 
 	if (tid->clear_ps_filter || tid->an->no_ps_filter) {
 		tid->clear_ps_filter = false;
@@ -1482,7 +1548,7 @@ static int ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
 
 	ath_tx_fill_desc(sc, bf, txq, aggr_len);
 	ath_tx_txqaddbuf(sc, txq, &bf_q, false);
-	return 0;
+	return true;
 }
 
 int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
@@ -1545,49 +1611,52 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	struct ath_atx_tid *tid;
-	struct ieee80211_txq *queue;
+	struct ath_txq *txq;
 	int tidno;
 
 	ath_dbg(common, XMIT, "%s called\n", __func__);
 
 	for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
 		tid = ath_node_to_tid(an, tidno);
-		queue = container_of((void *)tid,
-				     struct ieee80211_txq, drv_priv);
+		txq = tid->txq;
+
+		ath_txq_lock(sc, txq);
+
+		if (list_empty(&tid->list)) {
+			ath_txq_unlock(sc, txq);
+			continue;
+		}
 
 		if (!skb_queue_empty(&tid->retry_q))
 			ieee80211_sta_set_buffered(sta, tid->tidno, true);
 
+		list_del_init(&tid->list);
+
+		ath_txq_unlock(sc, txq);
 	}
 }
 
 void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an)
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
-	struct ieee80211_txq *queue;
 	struct ath_atx_tid *tid;
 	struct ath_txq *txq;
 	int tidno;
-	bool sched, wake = false;
 
 	ath_dbg(common, XMIT, "%s called\n", __func__);
 
 	for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
 		tid = ath_node_to_tid(an, tidno);
 		txq = tid->txq;
-		queue = container_of((void *)tid,
-				     struct ieee80211_txq, drv_priv);
 
 		ath_txq_lock(sc, txq);
 		tid->clear_ps_filter = true;
-		sched = !skb_queue_empty(&tid->retry_q);
-		ath_txq_unlock(sc, txq);
-
-		if (sched && ieee80211_schedule_txq(sc->hw, queue))
-			wake = true;
+		if (ath_tid_has_buffered(tid)) {
+			ath_tx_queue_tid(sc, tid);
+			ath_txq_schedule(sc, txq);
+		}
+		ath_txq_unlock_complete(sc, txq);
 	}
-	if (wake)
-		ath_txq_schedule(sc);
 }
 
 void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
@@ -1879,44 +1948,86 @@ void ath_tx_cleanupq(struct ath_softc *sc, struct ath_txq *txq)
 /* For each acq entry, for each tid, try to schedule packets
  * for transmit until ampdu_depth has reached min Q depth.
  */
-void ath_txq_schedule(struct ath_softc *sc)
+void ath_txq_schedule(struct ath_softc *sc, struct ath_txq *txq)
 {
-	struct ieee80211_hw *hw = sc->hw;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
-	struct ieee80211_txq *queue;
 	struct ath_atx_tid *tid;
-	struct ath_txq *txq;
-	int ret = 0;
+	struct list_head *tid_list;
+	struct ath_acq *acq;
+	bool active = AIRTIME_ACTIVE(sc->airtime_flags);
 
-	if (test_bit(ATH_OP_HW_RESET, &common->op_flags))
+	if (txq->mac80211_qnum < 0)
 		return;
 
-	queue = ieee80211_next_txq(hw);
-	if (!queue)
+	if (test_bit(ATH_OP_HW_RESET, &common->op_flags))
 		return;
 
-	tid = (struct ath_atx_tid *)queue->drv_priv;
-	txq = tid->txq;
+	spin_lock_bh(&sc->chan_lock);
+	rcu_read_lock();
+	acq = &sc->cur_chan->acq[txq->mac80211_qnum];
 
-	ath_txq_lock(sc, txq);
-	if (txq->mac80211_qnum < 0)
+	if (sc->cur_chan->stopped)
 		goto out;
 
-	spin_lock_bh(&sc->chan_lock);
-	rcu_read_lock();
+begin:
+	tid_list = &acq->acq_new;
+	if (list_empty(tid_list)) {
+		tid_list = &acq->acq_old;
+		if (list_empty(tid_list))
+			goto out;
+	}
+	tid = list_first_entry(tid_list, struct ath_atx_tid, list);
 
-	if (!sc->cur_chan->stopped)
-		ret = ath_tx_sched_aggr(sc, txq, tid);
+	if (active && tid->an->airtime_deficit[txq->mac80211_qnum] <= 0) {
+		spin_lock_bh(&acq->lock);
+		tid->an->airtime_deficit[txq->mac80211_qnum] += ATH_AIRTIME_QUANTUM;
+		list_move_tail(&tid->list, &acq->acq_old);
+		spin_unlock_bh(&acq->lock);
+		goto begin;
+	}
 
+	if (!ath_tid_has_buffered(tid)) {
+		spin_lock_bh(&acq->lock);
+		if ((tid_list == &acq->acq_new) && !list_empty(&acq->acq_old))
+			list_move_tail(&tid->list, &acq->acq_old);
+		else {
+			list_del_init(&tid->list);
+		}
+		spin_unlock_bh(&acq->lock);
+		goto begin;
+	}
+
+
+	/*
+	 * If we succeed in scheduling something, immediately restart to make
+	 * sure we keep the HW busy.
+	 */
+	if(ath_tx_sched_aggr(sc, txq, tid)) {
+		if (!active) {
+			spin_lock_bh(&acq->lock);
+			list_move_tail(&tid->list, &acq->acq_old);
+			spin_unlock_bh(&acq->lock);
+		}
+		goto begin;
+	}
+
+out:
 	rcu_read_unlock();
 	spin_unlock_bh(&sc->chan_lock);
+}
 
-out:
+void ath_txq_schedule_all(struct ath_softc *sc)
+{
+	struct ath_txq *txq;
+	int i;
 
-	if (ret != -ENOENT)
-		ieee80211_schedule_txq(hw, queue);
+	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+		txq = sc->tx.txq_map[i];
 
-	ath_txq_unlock(sc, txq);
+		spin_lock_bh(&txq->axq_lock);
+		ath_txq_schedule(sc, txq);
+		spin_unlock_bh(&txq->axq_lock);
+	}
 }
 
 /***********/
@@ -2534,6 +2645,7 @@ static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
 
 		if (list_empty(&txq->axq_q)) {
 			txq->axq_link = NULL;
+			ath_txq_schedule(sc, txq);
 			break;
 		}
 		bf = list_first_entry(&txq->axq_q, struct ath_buf, list);
@@ -2585,7 +2697,6 @@ static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
 		ath_tx_process_buffer(sc, txq, &ts, bf, &bf_head);
 	}
 	ath_txq_unlock_complete(sc, txq);
-	ath_txq_schedule(sc);
 }
 
 void ath_tx_tasklet(struct ath_softc *sc)
@@ -2600,7 +2711,6 @@ void ath_tx_tasklet(struct ath_softc *sc)
 			ath_tx_processq(sc, &sc->tx.txq[i]);
 	}
 	rcu_read_unlock();
-	ath_txq_schedule(sc);
 }
 
 void ath_tx_edma_tasklet(struct ath_softc *sc)
@@ -2686,7 +2796,6 @@ void ath_tx_edma_tasklet(struct ath_softc *sc)
 		ath_txq_unlock_complete(sc, txq);
 	}
 	rcu_read_unlock();
-	ath_txq_schedule(sc);
 }
 
 /*****************/
@@ -2766,6 +2875,7 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
 		tid->baw_head  = tid->baw_tail = 0;
 		tid->active	   = false;
 		tid->clear_ps_filter = true;
+		tid->has_queued  = false;
 		__skb_queue_head_init(&tid->retry_q);
 		INIT_LIST_HEAD(&tid->list);
 		acno = TID_TO_WME_AC(tidno);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 45155803c875..906e90223066 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -105,12 +105,9 @@
  * The driver is expected to initialize its private per-queue data for stations
  * and interfaces in the .add_interface and .sta_add ops.
  *
- * The driver can't access the queue directly. To obtain the next queue to pull
- * frames from, the driver calls ieee80211_next_txq(). To dequeue a frame from a
- * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a
- * queue, it calls the .wake_tx_queue driver op. The driver is expected to
- * re-schedule the txq using ieee80211_schedule_txq() if it is still active
- * after the driver has finished pulling packets from it.
+ * The driver can't access the queue directly. To dequeue a frame, it calls
+ * ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a queue, it
+ * calls the .wake_tx_queue driver op.
  *
  * For AP powersave TIM handling, the driver only needs to indicate if it has
  * buffered packets in the driver specific data structures by calling
@@ -3734,7 +3731,8 @@ struct ieee80211_ops {
 					 struct ieee80211_vif *vif,
 					 struct ieee80211_tdls_ch_sw_params *params);
 
-	void (*wake_tx_queue)(struct ieee80211_hw *hw);
+	void (*wake_tx_queue)(struct ieee80211_hw *hw,
+			      struct ieee80211_txq *txq);
 	void (*sync_rx_queues)(struct ieee80211_hw *hw);
 
 	int (*start_nan)(struct ieee80211_hw *hw,
@@ -5885,36 +5883,13 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);
  * ieee80211_tx_dequeue - dequeue a packet from a software tx queue
  *
  * @hw: pointer as obtained from ieee80211_alloc_hw()
- * @txq: pointer obtained from ieee80211_next_txq()
+ * @txq: pointer obtained from station or virtual interface
  *
  * Returns the skb if successful, %NULL if no frame was available.
  */
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 				     struct ieee80211_txq *txq);
 
-/**
- * ieee80211_schedule_txq - add txq to scheduling loop
- *
- * @hw: pointer as obtained from ieee80211_alloc_hw()
- * @txq: pointer obtained from station or virtual interface
- *
- * Returns %true if the txq was actually added to the scheduling,
- * %false otherwise.
- */
-bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
-			    struct ieee80211_txq *txq);
-
-/**
- * ieee80211_next_txq - get next tx queue to pull packets from
- *
- * @hw: pointer as obtained from ieee80211_alloc_hw()
- *
- * Returns the next txq if successful, %NULL if no queue is eligible. If a txq
- * is returned, it will have been removed from the scheduler queue and needs to
- * be re-scheduled with ieee80211_schedule_txq() to continue to be active.
- */
-struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw);
-
 /**
  * ieee80211_txq_get_depth - get pending frame/byte count of given txq
  *
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 6c6cad98ce92..595c662a61e8 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -226,13 +226,9 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable)
 		clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags);
 
 	clear_bit(IEEE80211_TXQ_STOP, &txqi->flags);
-
-	if (!ieee80211_schedule_txq(&sta->sdata->local->hw, txq))
-		return;
-
 	local_bh_disable();
 	rcu_read_lock();
-	drv_wake_tx_queue(sta->sdata->local);
+	drv_wake_tx_queue(sta->sdata->local, txqi);
 	rcu_read_unlock();
 	local_bh_enable();
 }
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index cdd76306cb8f..c7f93fd9ca7a 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1158,10 +1158,16 @@ drv_tdls_recv_channel_switch(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
-static inline void drv_wake_tx_queue(struct ieee80211_local *local)
+static inline void drv_wake_tx_queue(struct ieee80211_local *local,
+				     struct txq_info *txq)
 {
-	trace_drv_wake_tx_queue(local);
-	local->ops->wake_tx_queue(&local->hw);
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif);
+
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	trace_drv_wake_tx_queue(local, sdata, txq);
+	local->ops->wake_tx_queue(&local->hw, &txq->txq);
 }
 
 static inline int drv_start_nan(struct ieee80211_local *local,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4155838c7bef..26900025de2f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -832,7 +832,6 @@ struct txq_info {
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
 	struct sk_buff_head frags;
-	struct list_head schedule_order;
 	unsigned long flags;
 
 	/* keep last! */
@@ -1123,10 +1122,6 @@ struct ieee80211_local {
 	struct codel_vars *cvars;
 	struct codel_params cparams;
 
-	/* protects active_txqs and txqi->schedule_order */
-	spinlock_t active_txq_lock;
-	struct list_head active_txqs;
-
 	const struct ieee80211_ops *ops;
 
 	/*
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 935d6e2491b1..0785d04a80bc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -619,9 +619,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	spin_lock_init(&local->rx_path_lock);
 	spin_lock_init(&local->queue_stop_reason_lock);
 
-	INIT_LIST_HEAD(&local->active_txqs);
-	spin_lock_init(&local->active_txq_lock);
-
 	INIT_LIST_HEAD(&local->chanctx_list);
 	mutex_init(&local->chanctx_mtx);
 
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index e0bcf16df494..0c5627f8a104 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1237,17 +1237,12 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 		drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
 
 	if (sta->sta.txq[0]) {
-		bool wake = false;
-
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
 			if (!txq_has_queue(sta->sta.txq[i]))
 				continue;
 
-			if (ieee80211_schedule_txq(&local->hw, sta->sta.txq[i]))
-				wake = true;
+			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
 		}
-		if (wake)
-			drv_wake_tx_queue(local);
 	}
 
 	skb_queue_head_init(&pending);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 08eaad85942e..591ad02e1fa4 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2550,9 +2550,35 @@ TRACE_EVENT(drv_tdls_recv_channel_switch,
 	)
 );
 
-DEFINE_EVENT(local_only_evt, drv_wake_tx_queue,
-	     TP_PROTO(struct ieee80211_local *local),
-	     TP_ARGS(local)
+TRACE_EVENT(drv_wake_tx_queue,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct txq_info *txq),
+
+	TP_ARGS(local, sdata, txq),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u8, ac)
+		__field(u8, tid)
+	),
+
+	TP_fast_assign(
+		struct ieee80211_sta *sta = txq->txq.sta;
+
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->ac = txq->txq.ac;
+		__entry->tid = txq->txq.tid;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ac:%d tid:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid
+	)
 );
 
 #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 842881ca8f20..25904af38839 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1439,7 +1439,6 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
 	__skb_queue_head_init(&txqi->frags);
-	INIT_LIST_HEAD(&txqi->schedule_order);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1463,7 +1462,6 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
 	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
-	list_del_init(&txqi->schedule_order);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1560,8 +1558,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
 	ieee80211_txq_enqueue(local, txqi, skb);
 	spin_unlock_bh(&fq->lock);
 
-	if (ieee80211_schedule_txq(&local->hw, &txqi->txq))
-		drv_wake_tx_queue(local);
+	drv_wake_tx_queue(local, txqi);
 
 	return true;
 }
@@ -3556,50 +3553,6 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
-bool ieee80211_schedule_txq(struct ieee80211_hw *hw,
-			    struct ieee80211_txq *txq)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = to_txq_info(txq);
-	bool ret = false;
-
-	spin_lock_bh(&local->active_txq_lock);
-
-	if (list_empty(&txqi->schedule_order)) {
-		list_add_tail(&txqi->schedule_order, &local->active_txqs);
-		ret = true;
-	}
-
-	spin_unlock_bh(&local->active_txq_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL(ieee80211_schedule_txq);
-
-struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = NULL;
-
-	spin_lock_bh(&local->active_txq_lock);
-
-	if (list_empty(&local->active_txqs))
-		goto out;
-
-	txqi = list_first_entry(&local->active_txqs,
-				struct txq_info, schedule_order);
-	list_del_init(&txqi->schedule_order);
-
-out:
-	spin_unlock_bh(&local->active_txq_lock);
-
-	if (!txqi)
-		return NULL;
-
-	return &txqi->txq;
-}
-EXPORT_SYMBOL(ieee80211_next_txq);
-
 void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 				  struct net_device *dev,
 				  u32 info_flags)
-- 
cgit v1.2.3


From 86b6c465727cc917ec8367fef29b20ccf8ffa081 Mon Sep 17 00:00:00 2001
From: David Spinadel <david.spinadel@intel.com>
Date: Mon, 18 Dec 2017 12:14:05 +0200
Subject: nl80211: send deauth reason if locally generated

Send disconnection reason code to user space even if it's locally
generated, since some tests that check reason code may fail because of
the current behavior.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e4522ad5f770..e4dddfb64ced 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13963,7 +13963,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
-	    (from_ap && reason &&
+	    (reason &&
 	     nla_put_u16(msg, NL80211_ATTR_REASON_CODE, reason)) ||
 	    (from_ap &&
 	     nla_put_flag(msg, NL80211_ATTR_DISCONNECTED_BY_AP)) ||
-- 
cgit v1.2.3


From 983dafaab799511e092ffd006f3a064b37ccbccf Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@qti.qualcomm.com>
Date: Wed, 13 Dec 2017 19:51:36 +0200
Subject: cfg80211: Scan results to also report the per chain signal strength

This commit enhances the scan results to report the per chain signal
strength based on the latest BSS update. This provides similar
information to what is already available through STA information.

Signed-off-by: Sunil Dutt <usdutt@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 8 ++++++++
 include/uapi/linux/nl80211.h | 4 ++++
 net/wireless/nl80211.c       | 5 +++++
 net/wireless/scan.c          | 5 +++++
 4 files changed, 22 insertions(+)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d7f8e7b96bcb..3a4a1a903a4d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1773,6 +1773,8 @@ enum cfg80211_signal_type {
  *	by %parent_bssid.
  * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to
  *	the BSS that requested the scan in which the beacon/probe was received.
+ * @chains: bitmask for filled values in @chain_signal.
+ * @chain_signal: per-chain signal strength of last received BSS in dBm.
  */
 struct cfg80211_inform_bss {
 	struct ieee80211_channel *chan;
@@ -1781,6 +1783,8 @@ struct cfg80211_inform_bss {
 	u64 boottime_ns;
 	u64 parent_tsf;
 	u8 parent_bssid[ETH_ALEN] __aligned(2);
+	u8 chains;
+	s8 chain_signal[IEEE80211_MAX_CHAINS];
 };
 
 /**
@@ -1824,6 +1828,8 @@ struct cfg80211_bss_ies {
  *	that holds the beacon data. @beacon_ies is still valid, of course, and
  *	points to the same data as hidden_beacon_bss->beacon_ies in that case.
  * @signal: signal strength value (type depends on the wiphy's signal_type)
+ * @chains: bitmask for filled values in @chain_signal.
+ * @chain_signal: per-chain signal strength of last received BSS in dBm.
  * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes
  */
 struct cfg80211_bss {
@@ -1842,6 +1848,8 @@ struct cfg80211_bss {
 	u16 capability;
 
 	u8 bssid[ETH_ALEN];
+	u8 chains;
+	s8 chain_signal[IEEE80211_MAX_CHAINS];
 
 	u8 priv[0] __aligned(sizeof(void *));
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f882fe1f9709..c587a61c32bf 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3862,6 +3862,9 @@ enum nl80211_bss_scan_width {
  *	@NL80211_BSS_PARENT_BSSID. (u64).
  * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF
  *	is set.
+ * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update.
+ *	Contains a nested array of signal strength attributes (u8, dBm),
+ *	using the nesting index as the antenna number.
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -3885,6 +3888,7 @@ enum nl80211_bss {
 	NL80211_BSS_PAD,
 	NL80211_BSS_PARENT_TSF,
 	NL80211_BSS_PARENT_BSSID,
+	NL80211_BSS_CHAIN_SIGNAL,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e4dddfb64ced..b3f8970c3a47 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7839,6 +7839,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			      intbss->ts_boottime, NL80211_BSS_PAD))
 		goto nla_put_failure;
 
+	if (!nl80211_put_signal(msg, intbss->pub.chains,
+				intbss->pub.chain_signal,
+				NL80211_BSS_CHAIN_SIGNAL))
+		goto nla_put_failure;
+
 	switch (rdev->wiphy.signal_type) {
 	case CFG80211_SIGNAL_TYPE_MBM:
 		if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index f6c5fe482506..d36c3eb7b931 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -981,6 +981,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 		found->ts = tmp->ts;
 		found->ts_boottime = tmp->ts_boottime;
 		found->parent_tsf = tmp->parent_tsf;
+		found->pub.chains = tmp->pub.chains;
+		memcpy(found->pub.chain_signal, tmp->pub.chain_signal,
+		       IEEE80211_MAX_CHAINS);
 		ether_addr_copy(found->parent_bssid, tmp->parent_bssid);
 	} else {
 		struct cfg80211_internal_bss *new;
@@ -1233,6 +1236,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
 	tmp.ts_boottime = data->boottime_ns;
 	tmp.parent_tsf = data->parent_tsf;
+	tmp.pub.chains = data->chains;
+	memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS);
 	ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
 
 	signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
-- 
cgit v1.2.3


From 08fc7f8140730d2f8499c91b5abad44581b74635 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 14 Dec 2017 05:51:57 -0800
Subject: sock: Change the netns_core member name.

Rename the netns_core member 'inuse' to 'prot_inuse' to make the code
more readable. The next patch builds on this rename.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c          | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 0ad4d0c71228..45cfb5dc76c7 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,7 +11,7 @@ struct netns_core {
 
 	int	sysctl_somaxconn;
 
-	struct prot_inuse __percpu *inuse;
+	struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f17412..c2dd2d339db7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3045,7 +3045,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 	int res = 0;
 
 	for_each_possible_cpu(cpu)
-		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
 	return res >= 0 ? res : 0;
 }
@@ -3063,13 +3063,13 @@ EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-	net->core.inuse = alloc_percpu(struct prot_inuse);
-	return net->core.inuse ? 0 : -ENOMEM;
+	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+	return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-	free_percpu(net->core.inuse);
+	free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
cgit v1.2.3


From 648845ab7e200993dccd3948c719c858368c91e7 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 14 Dec 2017 05:51:58 -0800
Subject: sock: Move the socket inuse to namespace.

In some cases, we want to know how many sockets are in use in
different _net_ namespaces. It's a key resource metric.

This patch adds a member to struct netns_core: a counter of the
sockets in use in the _net_ namespace. The counter is incremented
in sk_alloc and sk_clone_lock and decremented in __sk_free.

This patch does not count sockets created by the kernel.
It's not very useful for userspace to know how many kernel
sockets have been created.

The main reasons for doing this are that:

1. When Linux calls 'do_exit' for a process that is exiting, the functions
'exit_task_namespaces' and 'exit_task_work' are called sequentially.
'exit_task_namespaces' may have destroyed the _net_ namespace, but
'sock_release', called in 'exit_task_work', may use the _net_ namespace
if we count the sockets in use in sock_release.

2. socket and sock come in pairs. More importantly, sock holds the _net_
namespace. We count the sockets in use in sock to avoid holding the
_net_ namespace again in socket. This keeps the code easy to maintain.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/core.h |  3 +++
 include/net/sock.h       |  1 +
 net/core/sock.c          | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 net/socket.c             | 21 ++-------------------
 4 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 45cfb5dc76c7..a5e8a66c57b4 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,9 @@ struct netns_core {
 
 	int	sysctl_somaxconn;
 
+#ifdef CONFIG_PROC_FS
+	int __percpu *sock_inuse;
+#endif
 	struct prot_inuse __percpu *prot_inuse;
 };
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9a9047268d37..0a32f3ce381c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1262,6 +1262,7 @@ proto_memory_pressure(struct proto *prot)
 /* Called with local bh disabled */
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
 int sock_prot_inuse_get(struct net *net, struct proto *proto);
+int sock_inuse_get(struct net *net);
 #else
 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
 		int inc)
diff --git a/net/core/sock.c b/net/core/sock.c
index c2dd2d339db7..72d14b221784 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
 		sk->sk_net_refcnt = kern ? 0 : 1;
-		if (likely(sk->sk_net_refcnt))
+		if (likely(sk->sk_net_refcnt)) {
 			get_net(net);
+			sock_inuse_add(net, 1);
+		}
+
 		sock_net_set(sk, net);
 		refcount_set(&sk->sk_wmem_alloc, 1);
 
@@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+	if (likely(sk->sk_net_refcnt))
+		sock_inuse_add(sock_net(sk), -1);
+
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
 		sock_diag_broadcast_destroy(sk);
 	else
@@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
+		if (likely(newsk->sk_net_refcnt))
+			sock_inuse_add(sock_net(newsk), 1);
 
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
@@ -3061,15 +3071,44 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+	this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+	int cpu, res = 0;
+
+	for_each_possible_cpu(cpu)
+		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+	return res;
+}
+
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-	return net->core.prot_inuse ? 0 : -ENOMEM;
+	if (net->core.prot_inuse == NULL)
+		return -ENOMEM;
+
+	net->core.sock_inuse = alloc_percpu(int);
+	if (net->core.sock_inuse == NULL)
+		goto out;
+
+	return 0;
+
+out:
+	free_percpu(net->core.prot_inuse);
+	return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
 	free_percpu(net->core.prot_inuse);
+	free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
@@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot)
 static inline void release_proto_idx(struct proto *prot)
 {
 }
+
+static void sock_inuse_add(struct net *net, int val)
+{
+}
 #endif
 
 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
diff --git a/net/socket.c b/net/socket.c
index 05f361faec45..bbd2e9ceb692 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -162,12 +162,6 @@ static const struct file_operations socket_file_ops = {
 static DEFINE_SPINLOCK(net_family_lock);
 static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 
-/*
- *	Statistics counters of the socket lists
- */
-
-static DEFINE_PER_CPU(int, sockets_in_use);
-
 /*
  * Support routines.
  * Move socket addresses back and forth across the kernel/user
@@ -578,7 +572,6 @@ struct socket *sock_alloc(void)
 	inode->i_gid = current_fsgid();
 	inode->i_op = &sockfs_inode_ops;
 
-	this_cpu_add(sockets_in_use, 1);
 	return sock;
 }
 EXPORT_SYMBOL(sock_alloc);
@@ -605,7 +598,6 @@ void sock_release(struct socket *sock)
 	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
-	this_cpu_sub(sockets_in_use, 1);
 	if (!sock->file) {
 		iput(SOCK_INODE(sock));
 		return;
@@ -2622,17 +2614,8 @@ core_initcall(sock_init);	/* early initcall */
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-	int cpu;
-	int counter = 0;
-
-	for_each_possible_cpu(cpu)
-	    counter += per_cpu(sockets_in_use, cpu);
-
-	/* It can be negative, by the way. 8) */
-	if (counter < 0)
-		counter = 0;
-
-	seq_printf(seq, "sockets: used %d\n", counter);
+	seq_printf(seq, "sockets: used %d\n",
+		   sock_inuse_get(seq->private));
 }
 #endif				/* CONFIG_PROC_FS */
 
-- 
cgit v1.2.3


From fb1f5f79ae96331a0201b4080d34f3bc3b5c0b1d Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Sat, 16 Dec 2017 03:09:40 -0500
Subject: net: Introduce NETIF_F_GRO_HW.

Introduce NETIF_F_GRO_HW feature flag for NICs that support hardware
GRO.  With this flag, we can now independently turn on or off hardware
GRO when GRO is on.  Previously, drivers were using NETIF_F_GRO to
control hardware GRO and so it cannot be independently turned on or
off without affecting GRO.

Hardware GRO (just like GRO) guarantees that packets can be re-segmented
by TSO/GSO to reconstruct the original packet stream.  Logically,
GRO_HW should depend on GRO since it is a subset, but we will let
individual drivers enforce this dependency as they see fit.

Since NETIF_F_GRO is not propagated between upper and lower devices,
NETIF_F_GRO_HW should follow suit since it is a subset of GRO.  In other
words, a lower device can independently have GRO/GRO_HW enabled or disabled
and no feature propagation is required.  This will preserve the current
GRO behavior.  This can be changed later if we decide to propagate GRO/
GRO_HW/RXCSUM from upper to lower devices.

Cc: Ariel Elior <Ariel.Elior@cavium.com>
Cc: everest-linux-l2@cavium.com
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-features.txt |  9 +++++++++
 include/linux/netdev_features.h              |  3 +++
 net/core/dev.c                               | 12 ++++++++++++
 net/core/ethtool.c                           |  1 +
 4 files changed, 25 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt
index 7413eb05223b..c77f9d57eb91 100644
--- a/Documentation/networking/netdev-features.txt
+++ b/Documentation/networking/netdev-features.txt
@@ -163,3 +163,12 @@ This requests that the NIC receive all possible frames, including errored
 frames (such as bad FCS, etc).  This can be helpful when sniffing a link with
 bad packets on it.  Some NICs may receive more packets if also put into normal
 PROMISC mode.
+
+*  rx-gro-hw
+
+This requests that the NIC enables Hardware GRO (generic receive offload).
+Hardware GRO is basically the exact reverse of TSO, and is generally
+stricter than Hardware LRO.  A packet stream merged by Hardware GRO must
+be re-segmentable by GSO or TSO back to the exact original packet stream.
+Hardware GRO is dependent on RXCSUM since every packet successfully merged
+by hardware must also have the checksum verified by hardware.
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index b1b0ca7ccb2b..db84c516bcfb 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -78,6 +78,8 @@ enum {
 	NETIF_F_HW_ESP_TX_CSUM_BIT,	/* ESP with TX checksum offload */
 	NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
 
+	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
+
 	/*
 	 * Add your fresh new feature above and remember to update
 	 * netdev_features_strings[] in net/core/ethtool.c and maybe
@@ -97,6 +99,7 @@ enum {
 #define NETIF_F_FRAGLIST	__NETIF_F(FRAGLIST)
 #define NETIF_F_FSO		__NETIF_F(FSO)
 #define NETIF_F_GRO		__NETIF_F(GRO)
+#define NETIF_F_GRO_HW		__NETIF_F(GRO_HW)
 #define NETIF_F_GSO		__NETIF_F(GSO)
 #define NETIF_F_GSO_ROBUST	__NETIF_F(GSO_ROBUST)
 #define NETIF_F_HIGHDMA		__NETIF_F(HIGHDMA)
diff --git a/net/core/dev.c b/net/core/dev.c
index b0eee49a2489..4b43f5dcabcd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7424,6 +7424,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		features &= ~dev->gso_partial_features;
 	}
 
+	if (!(features & NETIF_F_RXCSUM)) {
+		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
+		 * successfully merged by hardware must also have the
+		 * checksum verified by hardware.  If the user does not
+		 * want to enable RXCSUM, logically, we should disable GRO_HW.
+		 */
+		if (features & NETIF_F_GRO_HW) {
+			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
+			features &= ~NETIF_F_GRO_HW;
+		}
+	}
+
 	return features;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f8fcf450a36e..50a79203043b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_LLTX_BIT] =             "tx-lockless",
 	[NETIF_F_NETNS_LOCAL_BIT] =      "netns-local",
 	[NETIF_F_GRO_BIT] =              "rx-gro",
+	[NETIF_F_GRO_HW_BIT] =           "rx-gro-hw",
 	[NETIF_F_LRO_BIT] =              "rx-lro",
 
 	[NETIF_F_TSO_BIT] =              "tx-tcp-segmentation",
-- 
cgit v1.2.3


From 56f5aa77cdad1076bea0ae8ddeb74ba68ddc9502 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Sat, 16 Dec 2017 03:09:41 -0500
Subject: net: Disable GRO_HW when generic XDP is installed on a device.

Hardware should not aggregate any packets when generic XDP is installed.

Cc: Ariel Elior <Ariel.Elior@cavium.com>
Cc: everest-linux-l2@cavium.com
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4b43f5dcabcd..c7db39926769 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1542,6 +1542,23 @@ void dev_disable_lro(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_disable_lro);
 
+/**
+ *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
+ *	@dev: device
+ *
+ *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
+ *	called under RTNL.  This is needed if Generic XDP is installed on
+ *	the device.
+ */
+static void dev_disable_gro_hw(struct net_device *dev)
+{
+	dev->wanted_features &= ~NETIF_F_GRO_HW;
+	netdev_update_features(dev);
+
+	if (unlikely(dev->features & NETIF_F_GRO_HW))
+		netdev_WARN(dev, "failed to disable GRO_HW!\n");
+}
+
 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 				   struct net_device *dev)
 {
@@ -4564,6 +4581,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		} else if (new && !old) {
 			static_key_slow_inc(&generic_xdp_needed);
 			dev_disable_lro(dev);
+			dev_disable_gro_hw(dev);
 		}
 		break;
 
-- 
cgit v1.2.3


From 1df94c3c5dadbce3df6cc0e989d8c85d43a903d6 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 18 Dec 2017 14:34:26 -0800
Subject: net_sched: properly check for empty skb array on error path

First, the check of &q->ring.queue against NULL is wrong, it
is always false. We should check the value rather than the address.

Secondly, we need the same check in pfifo_fast_reset() too,
as both ->reset() and ->destroy() are called in qdisc_destroy().

Fixes: c5ad119fb6c0 ("net: sched: pfifo_fast use skb_array")
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 981c08fe810b..876fab2604b8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -659,6 +659,12 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
 		struct skb_array *q = band2list(priv, band);
 		struct sk_buff *skb;
 
+		/* NULL ring is possible if destroy path is due to a failed
+		 * skb_array_init() in pfifo_fast_init() case.
+		 */
+		if (!q->ring.queue)
+			continue;
+
 		while ((skb = skb_array_consume_bh(q)) != NULL)
 			kfree_skb(skb);
 	}
@@ -719,7 +725,7 @@ static void pfifo_fast_destroy(struct Qdisc *sch)
 		/* NULL ring is possible if destroy path is due to a failed
 		 * skb_array_init() in pfifo_fast_init() case.
 		 */
-		if (!&q->ring.queue)
+		if (!q->ring.queue)
 			continue;
 		/* Destroy ring but no need to kfree_skb because a call to
 		 * pfifo_fast_reset() has already done that work.
-- 
cgit v1.2.3


From 3dca3f38cfb8efb8571040568cac7d0025fa5bb1 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:31 +0100
Subject: xfrm: Separate ESP handling from segmentation for GRO packets.

We change the ESP GSO handlers to only segment the packets.
The ESP handling and encryption is deferred to validate_xmit_xfrm()
where this is done for non GRO packets too. This makes the code
more robust and prepares for asynchronous crypto handling.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h           |  6 +--
 net/core/dev.c               |  5 +--
 net/ipv4/esp4_offload.c      | 73 +++++++++++--------------------------
 net/ipv4/xfrm4_mode_tunnel.c |  5 +--
 net/ipv6/esp6_offload.c      | 80 ++++++++++++----------------------------
 net/ipv6/xfrm6_mode_tunnel.c |  5 +--
 net/xfrm/xfrm_device.c       | 87 +++++++++++++++++++++++++++++++++++++++-----
 7 files changed, 129 insertions(+), 132 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1ec0c4760646..df7f3d0ac4a1 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1888,7 +1888,7 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
 void __net_init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features);
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		       struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
@@ -1929,9 +1929,9 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 	}
 }
 #else
-static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
+static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
 {
-	return 0;
+	return skb;
 }
 
 static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo)
diff --git a/net/core/dev.c b/net/core/dev.c
index c7db39926769..fb7a24a373d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3083,9 +3083,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		    __skb_linearize(skb))
 			goto out_kfree_skb;
 
-		if (validate_xmit_xfrm(skb, features))
-			goto out_kfree_skb;
-
 		/* If packet is not checksummed and device does not
 		 * support checksumming for this protocol, complete
 		 * checksumming here.
@@ -3102,6 +3099,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		}
 	}
 
+	skb = validate_xmit_xfrm(skb, features);
+
 	return skb;
 
 out_kfree_skb:
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index f8b918c766b0..c359f3cfeec3 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -108,75 +108,36 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
 static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 				        netdev_features_t features)
 {
-	__u32 seq;
-	int err = 0;
-	struct sk_buff *skb2;
 	struct xfrm_state *x;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
-	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	netdev_features_t esp_features = features;
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (!xo)
-		goto out;
-
-	seq = xo->seq.low;
+		return ERR_PTR(-EINVAL);
 
 	x = skb->sp->xvec[skb->sp->len - 1];
 	aead = x->data;
 	esph = ip_esp_hdr(skb);
 
 	if (esph->spi != x->id.spi)
-		goto out;
+		return ERR_PTR(-EINVAL);
 
 	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
-		goto out;
+		return ERR_PTR(-EINVAL);
 
 	__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
 
 	skb->encap_hdr_csum = 1;
 
-	if (!(features & NETIF_F_HW_ESP))
+	if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+	    (x->xso.dev != skb->dev))
 		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
 
-	segs = x->outer_mode->gso_segment(x, skb, esp_features);
-	if (IS_ERR_OR_NULL(segs))
-		goto out;
-
-	__skb_pull(skb, skb->data - skb_mac_header(skb));
-
-	skb2 = segs;
-	do {
-		struct sk_buff *nskb = skb2->next;
-
-		xo = xfrm_offload(skb2);
-		xo->flags |= XFRM_GSO_SEGMENT;
-		xo->seq.low = seq;
-		xo->seq.hi = xfrm_replay_seqhi(x, seq);
+	xo->flags |= XFRM_GSO_SEGMENT;
 
-		if(!(features & NETIF_F_HW_ESP))
-			xo->flags |= CRYPTO_FALLBACK;
-
-		x->outer_mode->xmit(x, skb2);
-
-		err = x->type_offload->xmit(x, skb2, esp_features);
-		if (err) {
-			kfree_skb_list(segs);
-			return ERR_PTR(err);
-		}
-
-		if (!skb_is_gso(skb2))
-			seq++;
-		else
-			seq += skb_shinfo(skb2)->gso_segs;
-
-		skb_push(skb2, skb2->mac_len);
-		skb2 = nskb;
-	} while (skb2);
-
-out:
-	return segs;
+	return x->outer_mode->gso_segment(x, skb, esp_features);
 }
 
 static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
@@ -203,6 +164,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_
 	struct crypto_aead *aead;
 	struct esp_info esp;
 	bool hw_offload = true;
+	__u32 seq;
 
 	esp.inplace = true;
 
@@ -241,23 +203,30 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_
 			return esp.nfrags;
 	}
 
+	seq = xo->seq.low;
+
 	esph = esp.esph;
 	esph->spi = x->id.spi;
 
 	skb_push(skb, -skb_network_offset(skb));
 
 	if (xo->flags & XFRM_GSO_SEGMENT) {
-		esph->seq_no = htonl(xo->seq.low);
-	} else {
-		ip_hdr(skb)->tot_len = htons(skb->len);
-		ip_send_check(ip_hdr(skb));
+		esph->seq_no = htonl(seq);
+
+		if (!skb_is_gso(skb))
+			xo->seq.low++;
+		else
+			xo->seq.low += skb_shinfo(skb)->gso_segs;
 	}
 
+	esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+
+	ip_hdr(skb)->tot_len = htons(skb->len);
+	ip_send_check(ip_hdr(skb));
+
 	if (hw_offload)
 		return 0;
 
-	esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
-
 	err = esp_output_tail(x, skb, &esp);
 	if (err)
 		return err;
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 7d885a44dc9d..8affc6d83d58 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -105,18 +105,15 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
 {
 	__skb_push(skb, skb->mac_len);
 	return skb_mac_gso_segment(skb, features);
-
 }
 
 static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
-	if (xo->flags & XFRM_GSO_SEGMENT) {
-		skb->network_header = skb->network_header - x->props.header_len;
+	if (xo->flags & XFRM_GSO_SEGMENT)
 		skb->transport_header = skb->network_header +
 					sizeof(struct iphdr);
-	}
 
 	skb_reset_mac_len(skb);
 	pskb_pull(skb, skb->mac_len + x->props.header_len);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 333a478aa161..0bb7d54cf2cb 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -135,75 +135,36 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
 static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
 				        netdev_features_t features)
 {
-	__u32 seq;
-	int err = 0;
-	struct sk_buff *skb2;
 	struct xfrm_state *x;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
-	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	netdev_features_t esp_features = features;
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (!xo)
-		goto out;
-
-	seq = xo->seq.low;
+		return ERR_PTR(-EINVAL);
 
 	x = skb->sp->xvec[skb->sp->len - 1];
 	aead = x->data;
 	esph = ip_esp_hdr(skb);
 
 	if (esph->spi != x->id.spi)
-		goto out;
+		return ERR_PTR(-EINVAL);
 
 	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
-		goto out;
+		return ERR_PTR(-EINVAL);
 
 	__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
 
 	skb->encap_hdr_csum = 1;
 
-	if (!(features & NETIF_F_HW_ESP))
+	if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+	    (x->xso.dev != skb->dev))
 		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
 
-	segs = x->outer_mode->gso_segment(x, skb, esp_features);
-	if (IS_ERR_OR_NULL(segs))
-		goto out;
-
-	__skb_pull(skb, skb->data - skb_mac_header(skb));
-
-	skb2 = segs;
-	do {
-		struct sk_buff *nskb = skb2->next;
-
-		xo = xfrm_offload(skb2);
-		xo->flags |= XFRM_GSO_SEGMENT;
-		xo->seq.low = seq;
-		xo->seq.hi = xfrm_replay_seqhi(x, seq);
-
-		if(!(features & NETIF_F_HW_ESP))
-			xo->flags |= CRYPTO_FALLBACK;
-
-		x->outer_mode->xmit(x, skb2);
-
-		err = x->type_offload->xmit(x, skb2, esp_features);
-		if (err) {
-			kfree_skb_list(segs);
-			return ERR_PTR(err);
-		}
-
-		if (!skb_is_gso(skb2))
-			seq++;
-		else
-			seq += skb_shinfo(skb2)->gso_segs;
-
-		skb_push(skb2, skb2->mac_len);
-		skb2 = nskb;
-	} while (skb2);
+	xo->flags |= XFRM_GSO_SEGMENT;
 
-out:
-	return segs;
+	return x->outer_mode->gso_segment(x, skb, esp_features);
 }
 
 static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
@@ -222,6 +183,7 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
 
 static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_t features)
 {
+	int len;
 	int err;
 	int alen;
 	int blksize;
@@ -230,6 +192,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features
 	struct crypto_aead *aead;
 	struct esp_info esp;
 	bool hw_offload = true;
+	__u32 seq;
 
 	esp.inplace = true;
 
@@ -265,28 +228,33 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features
 			return esp.nfrags;
 	}
 
+	seq = xo->seq.low;
+
 	esph = ip_esp_hdr(skb);
 	esph->spi = x->id.spi;
 
 	skb_push(skb, -skb_network_offset(skb));
 
 	if (xo->flags & XFRM_GSO_SEGMENT) {
-		esph->seq_no = htonl(xo->seq.low);
-	} else {
-		int len;
-
-		len = skb->len - sizeof(struct ipv6hdr);
-		if (len > IPV6_MAXPLEN)
-			len = 0;
+		esph->seq_no = htonl(seq);
 
-		ipv6_hdr(skb)->payload_len = htons(len);
+		if (!skb_is_gso(skb))
+			xo->seq.low++;
+		else
+			xo->seq.low += skb_shinfo(skb)->gso_segs;
 	}
 
+	esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
+
+	len = skb->len - sizeof(struct ipv6hdr);
+	if (len > IPV6_MAXPLEN)
+		len = 0;
+
+	ipv6_hdr(skb)->payload_len = htons(len);
+
 	if (hw_offload)
 		return 0;
 
-	esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
-
 	err = esp6_output_tail(x, skb, &esp);
 	if (err)
 		return err;
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index e66b94f46532..4e12859bc2ee 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -105,17 +105,14 @@ static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x,
 {
 	__skb_push(skb, skb->mac_len);
 	return skb_mac_gso_segment(skb, features);
-
 }
 
 static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
-	if (xo->flags & XFRM_GSO_SEGMENT) {
-		skb->network_header = skb->network_header - x->props.header_len;
+	if (xo->flags & XFRM_GSO_SEGMENT)
 		skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
-	}
 
 	skb_reset_mac_len(skb);
 	pskb_pull(skb, skb->mac_len + x->props.header_len);
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 00641b611aed..a5a7a716c465 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -23,32 +23,99 @@
 #include <linux/notifier.h>
 
 #ifdef CONFIG_XFRM_OFFLOAD
-int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
 {
 	int err;
+	__u32 seq;
 	struct xfrm_state *x;
+	struct sk_buff *skb2;
+	netdev_features_t esp_features = features;
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
-	if (skb_is_gso(skb))
-		return 0;
+	if (!xo)
+		return skb;
 
-	if (xo) {
-		x = skb->sp->xvec[skb->sp->len - 1];
-		if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND)
-			return 0;
+	if (!(features & NETIF_F_HW_ESP))
+		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+
+	x = skb->sp->xvec[skb->sp->len - 1];
+	if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND)
+		return skb;
+
+	if (skb_is_gso(skb)) {
+		struct net_device *dev = skb->dev;
+
+		if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) {
+			struct sk_buff *segs;
+
+			/* Packet got rerouted, fixup features and segment it. */
+			esp_features = esp_features & ~(NETIF_F_HW_ESP
+							| NETIF_F_GSO_ESP);
 
+			segs = skb_gso_segment(skb, esp_features);
+			if (IS_ERR(segs)) {
+				XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+				kfree_skb(skb);
+				return NULL;
+			} else {
+				consume_skb(skb);
+				skb = segs;
+			}
+		} else {
+			return skb;
+		}
+	}
+
+	if (!skb->next) {
 		x->outer_mode->xmit(x, skb);
 
-		err = x->type_offload->xmit(x, skb, features);
+		err = x->type_offload->xmit(x, skb, esp_features);
 		if (err) {
 			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
-			return err;
+			kfree_skb(skb);
+			return NULL;
 		}
 
 		skb_push(skb, skb->data - skb_mac_header(skb));
+
+		return skb;
 	}
 
-	return 0;
+	skb2 = skb;
+	seq = xo->seq.low;
+
+	do {
+		struct sk_buff *nskb = skb2->next;
+
+		xo = xfrm_offload(skb2);
+		xo->flags |= XFRM_GSO_SEGMENT;
+		xo->seq.low = seq;
+		xo->seq.hi = xfrm_replay_seqhi(x, seq);
+
+		if(!(features & NETIF_F_HW_ESP))
+			xo->flags |= CRYPTO_FALLBACK;
+
+		x->outer_mode->xmit(x, skb2);
+
+		err = x->type_offload->xmit(x, skb2, esp_features);
+		if (err) {
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			skb2->next = nskb;
+			kfree_skb_list(skb2);
+			return NULL;
+		}
+
+		if (!skb_is_gso(skb2))
+			seq++;
+		else
+			seq += skb_shinfo(skb2)->gso_segs;
+
+		skb_push(skb2, skb2->data - skb_mac_header(skb2));
+
+		skb2 = nskb;
+	} while (skb2);
+
+	return skb;
 }
 EXPORT_SYMBOL_GPL(validate_xmit_xfrm);
 
-- 
cgit v1.2.3


From f53c723902d1ac5f0b0a11d7c9dcbff748dde74e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:36 +0100
Subject: net: Add asynchronous callbacks for xfrm on layer 2.

This patch implements asynchronous crypto callbacks
and a backlog handler that can be used when IPsec
is done at layer 2 in the TX path. It also extends
the skb validate functions so that we can update
the driver transmit return codes based on async
crypto operation or to indicate that we queued the
packet in a backlog queue.

Joint work with: Aviv Heller <avivh@mellanox.com>

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h |   6 ++-
 include/net/xfrm.h        |  22 ++++++++--
 net/core/dev.c            |  16 +++++---
 net/ipv4/esp4.c           |  24 +++++++++--
 net/ipv6/esp6.c           |  24 +++++++++--
 net/packet/af_packet.c    |   3 +-
 net/sched/sch_generic.c   |  16 +++++++-
 net/xfrm/xfrm_device.c    | 100 +++++++++++++++++++++++++++++++++++++---------
 8 files changed, 175 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cc4ce7456e38..c82d207ebc97 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2793,7 +2793,9 @@ struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct Qdisc		**output_queue_tailp;
 	struct sk_buff		*completion_queue;
-
+#ifdef CONFIG_XFRM_OFFLOAD
+	struct sk_buff_head	xfrm_backlog;
+#endif
 #ifdef CONFIG_RPS
 	/* input_queue_head should be written by cpu owning this struct,
 	 * and only read by other cpus. Worth using a cache line.
@@ -3325,7 +3327,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index df7f3d0ac4a1..2517c4f7781a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1051,6 +1051,7 @@ struct xfrm_offload {
 #define	XFRM_GSO_SEGMENT	16
 #define	XFRM_GRO		32
 #define	XFRM_ESP_NO_TRAILER	64
+#define	XFRM_DEV_RESUME		128
 
 	__u32			status;
 #define CRYPTO_SUCCESS				1
@@ -1874,21 +1875,28 @@ static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
 {
 	return skb->sp->xvec[skb->sp->len - 1];
 }
+#endif
+
 static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
 {
+#ifdef CONFIG_XFRM
 	struct sec_path *sp = skb->sp;
 
 	if (!sp || !sp->olen || sp->len != sp->olen)
 		return NULL;
 
 	return &sp->ovec[sp->olen - 1];
-}
+#else
+	return NULL;
 #endif
+}
 
 void __net_init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features);
+void xfrm_dev_resume(struct sk_buff *skb);
+void xfrm_dev_backlog(struct softnet_data *sd);
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		       struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
@@ -1929,7 +1937,15 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 	}
 }
 #else
-static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
+static inline void xfrm_dev_resume(struct sk_buff *skb)
+{
+}
+
+static inline void xfrm_dev_backlog(struct softnet_data *sd)
+{
+}
+
+static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
 {
 	return skb;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index fb7a24a373d1..821dd8cb7169 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3059,7 +3059,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_csum_hwoffload_help);
 
-static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 {
 	netdev_features_t features;
 
@@ -3099,7 +3099,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		}
 	}
 
-	skb = validate_xmit_xfrm(skb, features);
+	skb = validate_xmit_xfrm(skb, features, again);
 
 	return skb;
 
@@ -3110,7 +3110,7 @@ out_null:
 	return NULL;
 }
 
-struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 {
 	struct sk_buff *next, *head = NULL, *tail;
 
@@ -3121,7 +3121,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 		/* in case skb wont be segmented, point to itself */
 		skb->prev = skb;
 
-		skb = validate_xmit_skb(skb, dev);
+		skb = validate_xmit_skb(skb, dev, again);
 		if (!skb)
 			continue;
 
@@ -3448,6 +3448,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	struct netdev_queue *txq;
 	struct Qdisc *q;
 	int rc = -ENOMEM;
+	bool again = false;
 
 	skb_reset_mac_header(skb);
 
@@ -3509,7 +3510,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 				     XMIT_RECURSION_LIMIT))
 				goto recursion_alert;
 
-			skb = validate_xmit_skb(skb, dev);
+			skb = validate_xmit_skb(skb, dev, &again);
 			if (!skb)
 				goto out;
 
@@ -4193,6 +4194,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 				spin_unlock(root_lock);
 		}
 	}
+
+	xfrm_dev_backlog(sd);
 }
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
@@ -8874,6 +8877,9 @@ static int __init net_dev_init(void)
 
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
+#ifdef CONFIG_XFRM_OFFLOAD
+		skb_queue_head_init(&sd->xfrm_backlog);
+#endif
 		INIT_LIST_HEAD(&sd->poll_list);
 		sd->output_queue_tailp = &sd->output_queue;
 #ifdef CONFIG_RPS
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d57aa64fa7c7..7948833dc204 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,14 +121,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	void *tmp;
-	struct dst_entry *dst = skb_dst(skb);
-	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_state *x;
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME))
+		x = skb->sp->xvec[skb->sp->len - 1];
+	else
+		x = skb_dst(skb)->xfrm;
 
 	tmp = ESP_SKB_CB(skb)->tmp;
 	esp_ssg_unref(x, tmp);
 	kfree(tmp);
-	xfrm_output_resume(skb, err);
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		if (err) {
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			kfree_skb(skb);
+			return;
+		}
+
+		skb_push(skb, skb->data - skb_mac_header(skb));
+		secpath_reset(skb);
+		xfrm_dev_resume(skb);
+	} else {
+		xfrm_output_resume(skb, err);
+	}
 }
 
 /* Move ESP header back into place. */
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index a902ff8f59be..08a424fa8009 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -141,14 +141,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	void *tmp;
-	struct dst_entry *dst = skb_dst(skb);
-	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_state *x;
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME))
+		x = skb->sp->xvec[skb->sp->len - 1];
+	else
+		x = skb_dst(skb)->xfrm;
 
 	tmp = ESP_SKB_CB(skb)->tmp;
 	esp_ssg_unref(x, tmp);
 	kfree(tmp);
-	xfrm_output_resume(skb, err);
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		if (err) {
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			kfree_skb(skb);
+			return;
+		}
+
+		skb_push(skb, skb->data - skb_mac_header(skb));
+		secpath_reset(skb);
+		xfrm_dev_resume(skb);
+	} else {
+		xfrm_output_resume(skb, err);
+	}
 }
 
 /* Move ESP header back into place. */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index da215e5c1399..ee7aa0ba3a67 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -247,12 +247,13 @@ static int packet_direct_xmit(struct sk_buff *skb)
 	struct sk_buff *orig_skb = skb;
 	struct netdev_queue *txq;
 	int ret = NETDEV_TX_BUSY;
+	bool again = false;
 
 	if (unlikely(!netif_running(dev) ||
 		     !netif_carrier_ok(dev)))
 		goto drop;
 
-	skb = validate_xmit_skb_list(skb, dev);
+	skb = validate_xmit_skb_list(skb, dev, &again);
 	if (skb != orig_skb)
 		goto drop;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 876fab2604b8..f9a8761f0ff2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -32,6 +32,7 @@
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 #include <trace/events/qdisc.h>
+#include <net/xfrm.h>
 
 /* Qdisc to use by default */
 const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
@@ -230,6 +231,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 
 		/* skb in gso_skb were already validated */
 		*validate = false;
+		if (xfrm_offload(skb))
+			*validate = true;
 		/* check the reason of requeuing without tx lock first */
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
@@ -285,6 +288,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		     spinlock_t *root_lock, bool validate)
 {
 	int ret = NETDEV_TX_BUSY;
+	bool again = false;
 
 	/* And release qdisc */
 	if (root_lock)
@@ -292,7 +296,17 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
 	if (validate)
-		skb = validate_xmit_skb_list(skb, dev);
+		skb = validate_xmit_skb_list(skb, dev, &again);
+
+#ifdef CONFIG_XFRM_OFFLOAD
+	if (unlikely(again)) {
+		if (root_lock)
+			spin_lock(root_lock);
+
+		dev_requeue_skb(skb, q);
+		return false;
+	}
+#endif
 
 	if (likely(skb)) {
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index a5a7a716c465..fc8ab9f71127 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -23,12 +23,13 @@
 #include <linux/notifier.h>
 
 #ifdef CONFIG_XFRM_OFFLOAD
-struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
 {
 	int err;
-	__u32 seq;
+	unsigned long flags;
 	struct xfrm_state *x;
 	struct sk_buff *skb2;
+	struct softnet_data *sd;
 	netdev_features_t esp_features = features;
 	struct xfrm_offload *xo = xfrm_offload(skb);
 
@@ -42,6 +43,16 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 	if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND)
 		return skb;
 
+	local_irq_save(flags);
+	sd = this_cpu_ptr(&softnet_data);
+	err = !skb_queue_empty(&sd->xfrm_backlog);
+	local_irq_restore(flags);
+
+	if (err) {
+		*again = true;
+		return skb;
+	}
+
 	if (skb_is_gso(skb)) {
 		struct net_device *dev = skb->dev;
 
@@ -54,23 +65,26 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 
 			segs = skb_gso_segment(skb, esp_features);
 			if (IS_ERR(segs)) {
-				XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
 				kfree_skb(skb);
+				atomic_long_inc(&dev->tx_dropped);
 				return NULL;
 			} else {
 				consume_skb(skb);
 				skb = segs;
 			}
-		} else {
-			return skb;
 		}
 	}
 
 	if (!skb->next) {
 		x->outer_mode->xmit(x, skb);
 
+		xo->flags |= XFRM_DEV_RESUME;
+
 		err = x->type_offload->xmit(x, skb, esp_features);
 		if (err) {
+			if (err == -EINPROGRESS)
+				return NULL;
+
 			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
 			kfree_skb(skb);
 			return NULL;
@@ -82,36 +96,37 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 	}
 
 	skb2 = skb;
-	seq = xo->seq.low;
 
 	do {
 		struct sk_buff *nskb = skb2->next;
+		skb2->next = NULL;
 
 		xo = xfrm_offload(skb2);
-		xo->flags |= XFRM_GSO_SEGMENT;
-		xo->seq.low = seq;
-		xo->seq.hi = xfrm_replay_seqhi(x, seq);
-
-		if(!(features & NETIF_F_HW_ESP))
-			xo->flags |= CRYPTO_FALLBACK;
+		xo->flags |= XFRM_DEV_RESUME;
 
 		x->outer_mode->xmit(x, skb2);
 
 		err = x->type_offload->xmit(x, skb2, esp_features);
-		if (err) {
+		if (!err) {
+			skb2->next = nskb;
+		} else if (err != -EINPROGRESS) {
 			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
 			skb2->next = nskb;
 			kfree_skb_list(skb2);
 			return NULL;
-		}
+		} else {
+			if (skb == skb2)
+				skb = nskb;
+
+			if (!skb)
+				return NULL;
 
-		if (!skb_is_gso(skb2))
-			seq++;
-		else
-			seq += skb_shinfo(skb2)->gso_segs;
+			goto skip_push;
+		}
 
 		skb_push(skb2, skb2->data - skb_mac_header(skb2));
 
+skip_push:
 		skb2 = nskb;
 	} while (skb2);
 
@@ -207,6 +222,55 @@ ok:
 	return true;
 }
 EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);
+
+void xfrm_dev_resume(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	int ret = NETDEV_TX_BUSY;
+	struct netdev_queue *txq;
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	rcu_read_lock();
+	txq = netdev_pick_tx(dev, skb, NULL);
+
+	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	if (!netif_xmit_frozen_or_stopped(txq))
+		skb = dev_hard_start_xmit(skb, dev, txq, &ret);
+	HARD_TX_UNLOCK(dev, txq);
+
+	if (!dev_xmit_complete(ret)) {
+		local_irq_save(flags);
+		sd = this_cpu_ptr(&softnet_data);
+		skb_queue_tail(&sd->xfrm_backlog, skb);
+		raise_softirq_irqoff(NET_TX_SOFTIRQ);
+		local_irq_restore(flags);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(xfrm_dev_resume);
+
+void xfrm_dev_backlog(struct softnet_data *sd)
+{
+	struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog;
+	struct sk_buff_head list;
+	struct sk_buff *skb;
+
+	if (skb_queue_empty(xfrm_backlog))
+		return;
+
+	__skb_queue_head_init(&list);
+
+	spin_lock(&xfrm_backlog->lock);
+	skb_queue_splice_init(xfrm_backlog, &list);
+	spin_unlock(&xfrm_backlog->lock);
+
+	while (!skb_queue_empty(&list)) {
+		skb = __skb_dequeue(&list);
+		xfrm_dev_resume(skb);
+	}
+
+}
 #endif
 
 static int xfrm_dev_register(struct net_device *dev)
-- 
cgit v1.2.3


From 95bff4b580e7e6c895c5ecbc0c9f703635c2972d Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:42 +0100
Subject: xfrm: Allow to use the layer2 IPsec GSO codepath for software crypto.

We now have support for asynchronous crypto operations in the layer 2 TX
path. This was the missing part to allow the GSO codepath for software
crypto, so allow this codepath now.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index fc8ab9f71127..20a96181867a 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -202,8 +202,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 	if (!x->type_offload || x->encap)
 		return false;
 
-	if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) &&
-	     !xdst->child->xfrm && x->type->get_mtu) {
+	if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) &&
+	     (!xdst->child->xfrm && x->type->get_mtu)) {
 		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
 		if (skb->len <= mtu)
-- 
cgit v1.2.3


From f58869c44fb3f0835dd2dabce06e5919a18655c6 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:53 +0100
Subject: esp: Don't require synchronous crypto fallback on offloading anymore.

We support asynchronous crypto on layer 2 ESP now.
So no need to force synchronous crypto fallback on
offloading anymore.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 12 ++----------
 net/ipv6/esp6.c | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7948833dc204..6f00e43120a8 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -843,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x)
 	char aead_name[CRYPTO_MAX_ALG_NAME];
 	struct crypto_aead *aead;
 	int err;
-	u32 mask = 0;
 
 	err = -ENAMETOOLONG;
 	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
 		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
 		goto error;
 
-	if (x->xso.offload_handle)
-		mask |= CRYPTO_ALG_ASYNC;
-
-	aead = crypto_alloc_aead(aead_name, 0, mask);
+	aead = crypto_alloc_aead(aead_name, 0, 0);
 	err = PTR_ERR(aead);
 	if (IS_ERR(aead))
 		goto error;
@@ -883,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x)
 	char authenc_name[CRYPTO_MAX_ALG_NAME];
 	unsigned int keylen;
 	int err;
-	u32 mask = 0;
 
 	err = -EINVAL;
 	if (!x->ealg)
@@ -909,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 			goto error;
 	}
 
-	if (x->xso.offload_handle)
-		mask |= CRYPTO_ALG_ASYNC;
-
-	aead = crypto_alloc_aead(authenc_name, 0, mask);
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
 	if (IS_ERR(aead))
 		goto error;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 08a424fa8009..7c888c6e53a9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -752,17 +752,13 @@ static int esp_init_aead(struct xfrm_state *x)
 	char aead_name[CRYPTO_MAX_ALG_NAME];
 	struct crypto_aead *aead;
 	int err;
-	u32 mask = 0;
 
 	err = -ENAMETOOLONG;
 	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
 		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
 		goto error;
 
-	if (x->xso.offload_handle)
-		mask |= CRYPTO_ALG_ASYNC;
-
-	aead = crypto_alloc_aead(aead_name, 0, mask);
+	aead = crypto_alloc_aead(aead_name, 0, 0);
 	err = PTR_ERR(aead);
 	if (IS_ERR(aead))
 		goto error;
@@ -792,7 +788,6 @@ static int esp_init_authenc(struct xfrm_state *x)
 	char authenc_name[CRYPTO_MAX_ALG_NAME];
 	unsigned int keylen;
 	int err;
-	u32 mask = 0;
 
 	err = -EINVAL;
 	if (!x->ealg)
@@ -818,10 +813,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 			goto error;
 	}
 
-	if (x->xso.offload_handle)
-		mask |= CRYPTO_ALG_ASYNC;
-
-	aead = crypto_alloc_aead(authenc_name, 0, mask);
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
 	if (IS_ERR(aead))
 		goto error;
-- 
cgit v1.2.3


From 53c81e95df1793933f87748d36070a721f6cb287 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Tue, 19 Dec 2017 16:59:21 +0300
Subject: ip6_vti: adjust vti mtu according to mtu of lower device

LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over
ip6_vti that require fragmentation and the underlying device has an
MTU smaller than 1500 plus some extra space for headers. This happens
because ip6_vti, by default, sets the MTU to ETH_DATA_LEN and does not
update it depending on the destination address or link parameter.
Further attempts to send UDP packets may succeed because the PMTU gets
updated on ICMPV6_PKT_TOOBIG in vti6_err().

If the lower device has a larger MTU, e.g. 9000, ip6_vti works but does
not use the maximum possible size: output packets are limited to 1500.

The above cases require manual MTU setup after ip6_vti creation. However
ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev().

Here is the example when the lower device MTU is set to 9000:

  # ip a sh ltp_ns_veth2
      ltp_ns_veth2@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 ...
        inet 10.0.0.2/24 scope global ltp_ns_veth2
        inet6 fd00::2/64 scope global

  # ip li add vti6 type vti6 local fd00::2 remote fd00::1
  # ip li show vti6
      vti6@NONE: <POINTOPOINT,NOARP> mtu 1500 ...
        link/tunnel6 fd00::2 peer fd00::1

After the patch:
  # ip li add vti6 type vti6 local fd00::2 remote fd00::1
  # ip li show vti6
      vti6@NONE: <POINTOPOINT,NOARP> mtu 8832 ...
        link/tunnel6 fd00::2 peer fd00::1

Reported-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_vti.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index dbb74f3c57a7..18caa9539e6d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -626,6 +626,7 @@ static void vti6_link_config(struct ip6_tnl *t)
 {
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
+	struct net_device *tdev = NULL;
 
 	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
 	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -638,6 +639,25 @@ static void vti6_link_config(struct ip6_tnl *t)
 		dev->flags |= IFF_POINTOPOINT;
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
+
+	if (p->flags & IP6_TNL_F_CAP_XMIT) {
+		int strict = (ipv6_addr_type(&p->raddr) &
+			      (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
+		struct rt6_info *rt = rt6_lookup(t->net,
+						 &p->raddr, &p->laddr,
+						 p->link, strict);
+
+		if (rt)
+			tdev = rt->dst.dev;
+		ip6_rt_put(rt);
+	}
+
+	if (!tdev && p->link)
+		tdev = __dev_get_by_index(t->net, p->link);
+
+	if (tdev)
+		dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len,
+				 IPV6_MIN_MTU);
 }
 
 /**
-- 
cgit v1.2.3


From 223b229b63f461cd3c14583a0e517fde19e7a081 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 19 Dec 2017 20:10:53 +0200
Subject: bridge: Use helpers to handle MAC address

Use
	%pM to print MAC
	mac_pton() to convert it from ASCII to binary format, and
	ether_addr_copy() to copy.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_sysfs_br.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 723f25eed8ea..b1be0dcfba6b 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -272,10 +272,7 @@ static ssize_t group_addr_show(struct device *d,
 			       struct device_attribute *attr, char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%x:%x:%x:%x:%x:%x\n",
-		       br->group_addr[0], br->group_addr[1],
-		       br->group_addr[2], br->group_addr[3],
-		       br->group_addr[4], br->group_addr[5]);
+	return sprintf(buf, "%pM\n", br->group_addr);
 }
 
 static ssize_t group_addr_store(struct device *d,
@@ -284,14 +281,11 @@ static ssize_t group_addr_store(struct device *d,
 {
 	struct net_bridge *br = to_bridge(d);
 	u8 new_addr[6];
-	int i;
 
 	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
-		   &new_addr[0], &new_addr[1], &new_addr[2],
-		   &new_addr[3], &new_addr[4], &new_addr[5]) != 6)
+	if (!mac_pton(buf, new_addr))
 		return -EINVAL;
 
 	if (!is_link_local_ether_addr(new_addr))
@@ -306,8 +300,7 @@ static ssize_t group_addr_store(struct device *d,
 		return restart_syscall();
 
 	spin_lock_bh(&br->lock);
-	for (i = 0; i < 6; i++)
-		br->group_addr[i] = new_addr[i];
+	ether_addr_copy(br->group_addr, new_addr);
 	spin_unlock_bh(&br->lock);
 
 	br->group_addr_set = true;
-- 
cgit v1.2.3


From 293a1991cf0d62d0a60b41af42f1dd601d5029fc Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Wed, 20 Dec 2017 09:53:19 +0800
Subject: ip6_gre: fix a potential issue in ip6erspan_rcv

pskb_may_pull() can change skb->data, so we need to load ipv6h/ershdr at
the right place.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Cc: William Tu <u9012063@gmail.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 87b9892dfa23..9bd110371fe3 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -507,12 +507,11 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 	struct ip6_tnl *tunnel;
 	u8 ver;
 
-	ipv6h = ipv6_hdr(skb);
-	ershdr = (struct erspan_base_hdr *)skb->data;
-
 	if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
 		return PACKET_REJECT;
 
+	ipv6h = ipv6_hdr(skb);
+	ershdr = (struct erspan_base_hdr *)skb->data;
 	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
 
-- 
cgit v1.2.3


From dd8d5b8c5b22e31079b259b8bfb686f1fac1080a Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Wed, 20 Dec 2017 10:21:46 +0800
Subject: ip_gre: fix error path when erspan_rcv failed

When erspan_rcv() returns PACKET_REJECT, we shouldn't call ipgre_rcv()
to process the packet again; instead, send an ICMP unreachable message
in the error path.

Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
Acked-by: William Tu <u9012063@gmail.com>
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fd4d6e96da7e..5c36b6745dbc 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -434,11 +434,13 @@ static int gre_rcv(struct sk_buff *skb)
 		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 			return 0;
+		goto out;
 	}
 
 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 		return 0;
 
+out:
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 drop:
 	kfree_skb(skb);
-- 
cgit v1.2.3


From a7343211f007fdd9e4ebeb80b40d5054798eb890 Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Wed, 20 Dec 2017 10:21:47 +0800
Subject: ip6_gre: fix error path when ip6erspan_rcv failed

Same as the IPv4 code: when ip6erspan_rcv() returns PACKET_REJECT, we
should call icmpv6_send() to send an ICMP unreachable message in the
error path.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Acked-by: William Tu <u9012063@gmail.com>
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 9bd110371fe3..b7a79ddcf851 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -602,12 +602,13 @@ static int gre_rcv(struct sk_buff *skb)
 		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
 			return 0;
-		goto drop;
+		goto out;
 	}
 
 	if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
 		return 0;
 
+out:
 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 drop:
 	kfree_skb(skb);
-- 
cgit v1.2.3


From 50670b6ee9bc4ae8f9ce3112b437987adf273245 Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Wed, 20 Dec 2017 10:07:00 +0800
Subject: ip_gre: fix potential memory leak in erspan_rcv

If md is NULL, tun_dst must be freed, otherwise it will cause a memory
leak.

Fixes: 1a66a836da6 ("gre: add collect_md mode to ERSPAN tunnel")
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5c36b6745dbc..90c912307814 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,8 +313,10 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return PACKET_REJECT;
 
 			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
-			if (!md)
+			if (!md) {
+				dst_release((struct dst_entry *)tun_dst);
 				return PACKET_REJECT;
+			}
 
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
-- 
cgit v1.2.3


From afb4c97d90e62f33b9f389aa0023f8478bb89db2 Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Wed, 20 Dec 2017 10:07:01 +0800
Subject: ip6_gre: fix potential memory leak in ip6erspan_rcv

If md is NULL, tun_dst must be freed, otherwise it will cause a memory
leak.

Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b7a79ddcf851..8451d00b210b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -550,8 +550,10 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 
 			info = &tun_dst->u.tun_info;
 			md = ip_tunnel_info_opts(info);
-			if (!md)
+			if (!md) {
+				dst_release((struct dst_entry *)tun_dst);
 				return PACKET_REJECT;
+			}
 
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
-- 
cgit v1.2.3


From 563e0bb0dc74b3ca888e24f8c08f0239fe4016b0 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 20 Dec 2017 11:12:51 +0800
Subject: net: tracepoint: replace tcp_set_state tracepoint with
 inet_sock_set_state tracepoint

Since sk_state is a common field of struct sock, the state transition
tracepoint should not be a TCP-specific feature.
Currently it traces all AF_INET state transitions, so rename this
tracepoint to inet_sock_set_state, with some minor changes, and move it
into trace/events/sock.h.
We don't need to create a file named trace/events/inet_sock.h for this
single tracepoint.

Two helpers are introduced to trace sk_state transition
    - void inet_sk_state_store(struct sock *sk, int newstate);
    - void inet_sk_set_state(struct sock *sk, int state);
Since trace headers should not be included in other header files,
they are defined in net/ipv4/af_inet.c.

Protocols such as SCTP may be compiled as modules, hence export
inet_sk_set_state().

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h         |   2 +
 include/trace/events/sock.h     | 107 ++++++++++++++++++++++++++++++++++++++++
 include/trace/events/tcp.h      |  31 ------------
 net/ipv4/af_inet.c              |  14 ++++++
 net/ipv4/inet_connection_sock.c |   6 +--
 net/ipv4/inet_hashtables.c      |   2 +-
 net/ipv4/tcp.c                  |   6 +--
 7 files changed, 128 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 39efb968b7a4..a3431a4ff9cc 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -290,6 +290,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #endif
 
 int inet_sk_rebuild_header(struct sock *sk);
+void inet_sk_set_state(struct sock *sk, int state);
+void inet_sk_state_store(struct sock *sk, int newstate);
 
 static inline unsigned int __inet_ehashfn(const __be32 laddr,
 					  const __u16 lport,
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index ec4dade24466..3b9094a07b80 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -6,7 +6,50 @@
 #define _TRACE_SOCK_H
 
 #include <net/sock.h>
+#include <net/ipv6.h>
 #include <linux/tracepoint.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+
+/* The protocol traced by sock_set_state */
+#define inet_protocol_names		\
+		EM(IPPROTO_TCP)			\
+		EM(IPPROTO_DCCP)		\
+		EMe(IPPROTO_SCTP)
+
+#define tcp_state_names			\
+		EM(TCP_ESTABLISHED)		\
+		EM(TCP_SYN_SENT)		\
+		EM(TCP_SYN_RECV)		\
+		EM(TCP_FIN_WAIT1)		\
+		EM(TCP_FIN_WAIT2)		\
+		EM(TCP_TIME_WAIT)		\
+		EM(TCP_CLOSE)			\
+		EM(TCP_CLOSE_WAIT)		\
+		EM(TCP_LAST_ACK)		\
+		EM(TCP_LISTEN)			\
+		EM(TCP_CLOSING)			\
+		EMe(TCP_NEW_SYN_RECV)
+
+/* enums need to be exported to user space */
+#undef EM
+#undef EMe
+#define EM(a)       TRACE_DEFINE_ENUM(a);
+#define EMe(a)      TRACE_DEFINE_ENUM(a);
+
+inet_protocol_names
+tcp_state_names
+
+#undef EM
+#undef EMe
+#define EM(a)       { a, #a },
+#define EMe(a)      { a, #a }
+
+#define show_inet_protocol_name(val)    \
+	__print_symbolic(val, inet_protocol_names)
+
+#define show_tcp_state_name(val)        \
+	__print_symbolic(val, tcp_state_names)
 
 TRACE_EVENT(sock_rcvqueue_full,
 
@@ -63,6 +106,70 @@ TRACE_EVENT(sock_exceed_buf_limit,
 		__entry->rmem_alloc)
 );
 
+TRACE_EVENT(inet_sock_set_state,
+
+	TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),
+
+	TP_ARGS(sk, oldstate, newstate),
+
+	TP_STRUCT__entry(
+		__field(const void *, skaddr)
+		__field(int, oldstate)
+		__field(int, newstate)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u8, protocol)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct in6_addr *pin6;
+		__be32 *p32;
+
+		__entry->skaddr = sk;
+		__entry->oldstate = oldstate;
+		__entry->newstate = newstate;
+
+		__entry->protocol = sk->sk_protocol;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 =  inet->inet_daddr;
+
+#if IS_ENABLED(CONFIG_IPV6)
+		if (sk->sk_family == AF_INET6) {
+			pin6 = (struct in6_addr *)__entry->saddr_v6;
+			*pin6 = sk->sk_v6_rcv_saddr;
+			pin6 = (struct in6_addr *)__entry->daddr_v6;
+			*pin6 = sk->sk_v6_daddr;
+		} else
+#endif
+		{
+			pin6 = (struct in6_addr *)__entry->saddr_v6;
+			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
+			pin6 = (struct in6_addr *)__entry->daddr_v6;
+			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
+		}
+	),
+
+	TP_printk("protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4"
+			"saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
+			show_inet_protocol_name(__entry->protocol),
+			__entry->sport, __entry->dport,
+			__entry->saddr, __entry->daddr,
+			__entry->saddr_v6, __entry->daddr_v6,
+			show_tcp_state_name(__entry->oldstate),
+			show_tcp_state_name(__entry->newstate))
+);
+
 #endif /* _TRACE_SOCK_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index ec52fb3b4ae0..8e88a1671538 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -9,37 +9,6 @@
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
 
-#define tcp_state_names         \
-		EM(TCP_ESTABLISHED)     \
-		EM(TCP_SYN_SENT)        \
-		EM(TCP_SYN_RECV)        \
-		EM(TCP_FIN_WAIT1)       \
-		EM(TCP_FIN_WAIT2)       \
-		EM(TCP_TIME_WAIT)       \
-		EM(TCP_CLOSE)           \
-		EM(TCP_CLOSE_WAIT)      \
-		EM(TCP_LAST_ACK)        \
-		EM(TCP_LISTEN)          \
-		EM(TCP_CLOSING)         \
-		EMe(TCP_NEW_SYN_RECV)   \
-
-/* enums need to be exported to user space */
-#undef EM
-#undef EMe
-#define EM(a)         TRACE_DEFINE_ENUM(a);
-#define EMe(a)        TRACE_DEFINE_ENUM(a);
-
-tcp_state_names
-
-#undef EM
-#undef EMe
-#define EM(a)         tcp_state_name(a),
-#define EMe(a)        tcp_state_name(a)
-
-#define tcp_state_name(state)	{ state, #state }
-#define show_tcp_state_name(val)			\
-	__print_symbolic(val, tcp_state_names)
-
 /*
  * tcp event with arguments sk and skb
  *
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f00499a46927..bab98a4fedad 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -121,6 +121,7 @@
 #endif
 #include <net/l3mdev.h>
 
+#include <trace/events/sock.h>
 
 /* The inetsw table contains everything that inet_create needs to
  * build a new socket.
@@ -1220,6 +1221,19 @@ int inet_sk_rebuild_header(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_sk_rebuild_header);
 
+void inet_sk_set_state(struct sock *sk, int state)
+{
+	trace_inet_sock_set_state(sk, sk->sk_state, state);
+	sk->sk_state = state;
+}
+EXPORT_SYMBOL(inet_sk_set_state);
+
+void inet_sk_state_store(struct sock *sk, int newstate)
+{
+	trace_inet_sock_set_state(sk, sk->sk_state, newstate);
+	smp_store_release(&sk->sk_state, newstate);
+}
+
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features)
 {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4ca46dc08e63..f460fc04aa66 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -783,7 +783,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
-		newsk->sk_state = TCP_SYN_RECV;
+		inet_sk_set_state(newsk, TCP_SYN_RECV);
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
@@ -877,7 +877,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 	 * It is OK, because this socket enters to hash table only
 	 * after validation is complete.
 	 */
-	sk_state_store(sk, TCP_LISTEN);
+	inet_sk_state_store(sk, TCP_LISTEN);
 	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
 		inet->inet_sport = htons(inet->inet_num);
 
@@ -888,7 +888,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 			return 0;
 	}
 
-	sk->sk_state = TCP_CLOSE;
+	inet_sk_set_state(sk, TCP_CLOSE);
 	return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f6f58108b4c5..37b7da0b975d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -544,7 +544,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	} else {
 		percpu_counter_inc(sk->sk_prot->orphan_count);
-		sk->sk_state = TCP_CLOSE;
+		inet_sk_set_state(sk, TCP_CLOSE);
 		sock_set_flag(sk, SOCK_DEAD);
 		inet_csk_destroy_sock(sk);
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c470fec9062f..d408fb41c804 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 
-#include <trace/events/tcp.h>
-
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -2040,8 +2038,6 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
-	trace_tcp_set_state(sk, oldstate, state);
-
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
@@ -2065,7 +2061,7 @@ void tcp_set_state(struct sock *sk, int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk_state_store(sk, state);
+	inet_sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-- 
cgit v1.2.3


From 986ffdfd08dbaae721e82720e6bfc2c307e732dd Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 20 Dec 2017 11:12:52 +0800
Subject: net: sock: replace sk_state_load with inet_sk_state_load and remove
 sk_state_store

sk_state_load is only used by AF_INET/AF_INET6, so rename it to
inet_sk_state_load and move it into inet_sock.h.

sk_state_store is removed as it is not used any more.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h         | 25 ++++++++++++++++++++++++-
 include/net/sock.h              | 25 -------------------------
 net/ipv4/inet_connection_sock.c |  2 +-
 net/ipv4/tcp.c                  |  4 ++--
 net/ipv4/tcp_diag.c             |  2 +-
 net/ipv4/tcp_ipv4.c             |  2 +-
 net/ipv6/tcp_ipv6.c             |  2 +-
 7 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index a3431a4ff9cc..0a671c32d6b9 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -290,9 +290,32 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #endif
 
 int inet_sk_rebuild_header(struct sock *sk);
-void inet_sk_set_state(struct sock *sk, int state);
+
+/**
+ * inet_sk_state_load - read sk->sk_state for lockless contexts
+ * @sk: socket pointer
+ *
+ * Paired with inet_sk_state_store(). Used in places we don't hold socket lock:
+ * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
+ */
+static inline int inet_sk_state_load(const struct sock *sk)
+{
+	/* state change might impact lockless readers. */
+	return smp_load_acquire(&sk->sk_state);
+}
+
+/**
+ * inet_sk_state_store - update sk->sk_state
+ * @sk: socket pointer
+ * @newstate: new state
+ *
+ * Paired with inet_sk_state_load(). Should be used in contexts where
+ * state change might impact lockless readers.
+ */
 void inet_sk_state_store(struct sock *sk, int newstate);
 
+void inet_sk_set_state(struct sock *sk, int state);
+
 static inline unsigned int __inet_ehashfn(const __be32 laddr,
 					  const __u16 lport,
 					  const __be32 faddr,
diff --git a/include/net/sock.h b/include/net/sock.h
index 0a32f3ce381c..6c1db823f8b9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2333,31 +2333,6 @@ static inline bool sk_listener(const struct sock *sk)
 	return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
 }
 
-/**
- * sk_state_load - read sk->sk_state for lockless contexts
- * @sk: socket pointer
- *
- * Paired with sk_state_store(). Used in places we do not hold socket lock :
- * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
- */
-static inline int sk_state_load(const struct sock *sk)
-{
-	return smp_load_acquire(&sk->sk_state);
-}
-
-/**
- * sk_state_store - update sk->sk_state
- * @sk: socket pointer
- * @newstate: new state
- *
- * Paired with sk_state_load(). Should be used in contexts where
- * state change might impact lockless readers.
- */
-static inline void sk_state_store(struct sock *sk, int newstate)
-{
-	smp_store_release(&sk->sk_state, newstate);
-}
-
 void sock_enable_timestamp(struct sock *sk, int flag);
 int sock_get_timestamp(struct sock *, struct timeval __user *);
 int sock_get_timestampns(struct sock *, struct timespec __user *);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f460fc04aa66..12410ec6f7f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -685,7 +685,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 	int max_retries, thresh;
 	u8 defer_accept;
 
-	if (sk_state_load(sk_listener) != TCP_LISTEN)
+	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
 		goto drop;
 
 	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d408fb41c804..67d39b79c801 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -502,7 +502,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
-	state = sk_state_load(sk);
+	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -2916,7 +2916,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (sk->sk_type != SOCK_STREAM)
 		return;
 
-	info->tcpi_state = sk_state_load(sk);
+	info->tcpi_state = inet_sk_state_load(sk);
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index abbf0edcf6c2..81148f7a2323 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -24,7 +24,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 {
 	struct tcp_info *info = _info;
 
-	if (sk_state_load(sk) == TCP_LISTEN) {
+	if (inet_sk_state_load(sk) == TCP_LISTEN) {
 		r->idiag_rqueue = sk->sk_ack_backlog;
 		r->idiag_wqueue = sk->sk_max_ack_backlog;
 	} else if (sk->sk_type == SOCK_STREAM) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 94e28350f420..dd945b114215 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2281,7 +2281,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 		timer_expires = jiffies;
 	}
 
-	state = sk_state_load(sk);
+	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		rx_queue = sk->sk_ack_backlog;
 	else
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7178476b3d2f..aa12a26a96c6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1795,7 +1795,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 		timer_expires = jiffies;
 	}
 
-	state = sk_state_load(sp);
+	state = inet_sk_state_load(sp);
 	if (state == TCP_LISTEN)
 		rx_queue = sp->sk_ack_backlog;
 	else
-- 
cgit v1.2.3


From b0832e30058405405cfec73e7f545b184c198905 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 20 Dec 2017 11:12:53 +0800
Subject: net: tracepoint: using sock_set_state tracepoint to trace DCCP state
 transition

With changes in inet_ files, DCCP state transitions are traced with
inet_sock_set_state tracepoint.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9d43c1f40274..7a75a1d3568b 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -110,7 +110,7 @@ void dccp_set_state(struct sock *sk, const int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk->sk_state = state;
+	inet_sk_set_state(sk, state);
 }
 
 EXPORT_SYMBOL_GPL(dccp_set_state);
-- 
cgit v1.2.3


From cbabf46364b27d08335fef37ecd7a8b89a1c8e07 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 20 Dec 2017 11:12:54 +0800
Subject: net: tracepoint: using sock_set_state tracepoint to trace SCTP state
 transition

With changes in inet_ files, SCTP state transitions are traced with
inet_sock_set_state tracepoint.
The SCTP state names, e.g. SCTP_SS_CLOSED and SCTP_SS_ESTABLISHED,
have the same values as the TCP state names, so the output still prints
the TCP state names, which keeps the code simple.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/endpointola.c   |  2 +-
 net/sctp/sm_sideeffect.c |  4 ++--
 net/sctp/socket.c        | 12 ++++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index ee1e601a0b11..8b3146816519 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -232,7 +232,7 @@ void sctp_endpoint_free(struct sctp_endpoint *ep)
 {
 	ep->base.dead = true;
 
-	ep->base.sk->sk_state = SCTP_SS_CLOSED;
+	inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED);
 
 	/* Unlink this endpoint, so we can't find it again! */
 	sctp_unhash_endpoint(ep);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 16ddf2ca1438..b71e7fb0a20a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -878,12 +878,12 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
 		 * successfully completed a connect() call.
 		 */
 		if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
-			sk->sk_state = SCTP_SS_ESTABLISHED;
+			inet_sk_set_state(sk, SCTP_SS_ESTABLISHED);
 
 		/* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */
 		if (sctp_state(asoc, SHUTDOWN_RECEIVED) &&
 		    sctp_sstate(sk, ESTABLISHED)) {
-			sk->sk_state = SCTP_SS_CLOSING;
+			inet_sk_set_state(sk, SCTP_SS_CLOSING);
 			sk->sk_shutdown |= RCV_SHUTDOWN;
 		}
 	}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5e4100df7bae..aadcd4244d9b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1544,7 +1544,7 @@ static void sctp_close(struct sock *sk, long timeout)
 
 	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 	sk->sk_shutdown = SHUTDOWN_MASK;
-	sk->sk_state = SCTP_SS_CLOSING;
+	inet_sk_set_state(sk, SCTP_SS_CLOSING);
 
 	ep = sctp_sk(sk)->ep;
 
@@ -4657,7 +4657,7 @@ static void sctp_shutdown(struct sock *sk, int how)
 	if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) {
 		struct sctp_association *asoc;
 
-		sk->sk_state = SCTP_SS_CLOSING;
+		inet_sk_set_state(sk, SCTP_SS_CLOSING);
 		asoc = list_entry(ep->asocs.next,
 				  struct sctp_association, asocs);
 		sctp_primitive_SHUTDOWN(net, asoc, NULL);
@@ -7513,13 +7513,13 @@ static int sctp_listen_start(struct sock *sk, int backlog)
 	 * sockets.
 	 *
 	 */
-	sk->sk_state = SCTP_SS_LISTENING;
+	inet_sk_set_state(sk, SCTP_SS_LISTENING);
 	if (!ep->base.bind_addr.port) {
 		if (sctp_autobind(sk))
 			return -EAGAIN;
 	} else {
 		if (sctp_get_port(sk, inet_sk(sk)->inet_num)) {
-			sk->sk_state = SCTP_SS_CLOSED;
+			inet_sk_set_state(sk, SCTP_SS_CLOSED);
 			return -EADDRINUSE;
 		}
 	}
@@ -8542,10 +8542,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	 * is called, set RCV_SHUTDOWN flag.
 	 */
 	if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) {
-		newsk->sk_state = SCTP_SS_CLOSED;
+		inet_sk_set_state(newsk, SCTP_SS_CLOSED);
 		newsk->sk_shutdown |= RCV_SHUTDOWN;
 	} else {
-		newsk->sk_state = SCTP_SS_ESTABLISHED;
+		inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED);
 	}
 
 	release_sock(newsk);
-- 
cgit v1.2.3


From 92a2320697f7f07f4ee988077820965a0fbd11d0 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Tue, 19 Dec 2017 15:35:48 -0800
Subject: xfrm: check for xdo_dev_ops add and delete

This adds a check for the required add and delete functions up front
at registration time to be sure both are defined.

Since both the features check and the registration check are looking
at the same things, break out the check for both to call.

Lastly, for some reason the feature check was setting xfrmdev_ops to
NULL if the NETIF_F_HW_ESP bit was missing, which would probably
surprise the driver later if the driver turned its NETIF_F_HW_ESP bit
back on.  We shouldn't be messing with the driver's callback list, so
we stop doing that with this patch.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_device.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 20a96181867a..75982506617b 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -273,17 +273,31 @@ void xfrm_dev_backlog(struct softnet_data *sd)
 }
 #endif
 
-static int xfrm_dev_register(struct net_device *dev)
+static int xfrm_api_check(struct net_device *dev)
 {
-	if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops)
-		return NOTIFY_BAD;
+#ifdef CONFIG_XFRM_OFFLOAD
 	if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) &&
 	    !(dev->features & NETIF_F_HW_ESP))
 		return NOTIFY_BAD;
 
+	if ((dev->features & NETIF_F_HW_ESP) &&
+	    (!(dev->xfrmdev_ops &&
+	       dev->xfrmdev_ops->xdo_dev_state_add &&
+	       dev->xfrmdev_ops->xdo_dev_state_delete)))
+		return NOTIFY_BAD;
+#else
+	if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM))
+		return NOTIFY_BAD;
+#endif
+
 	return NOTIFY_DONE;
 }
 
+static int xfrm_dev_register(struct net_device *dev)
+{
+	return xfrm_api_check(dev);
+}
+
 static int xfrm_dev_unregister(struct net_device *dev)
 {
 	xfrm_policy_cache_flush();
@@ -292,16 +306,7 @@ static int xfrm_dev_unregister(struct net_device *dev)
 
 static int xfrm_dev_feat_change(struct net_device *dev)
 {
-	if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops)
-		return NOTIFY_BAD;
-	else if (!(dev->features & NETIF_F_HW_ESP))
-		dev->xfrmdev_ops = NULL;
-
-	if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) &&
-	    !(dev->features & NETIF_F_HW_ESP))
-		return NOTIFY_BAD;
-
-	return NOTIFY_DONE;
+	return xfrm_api_check(dev);
 }
 
 static int xfrm_dev_down(struct net_device *dev)
-- 
cgit v1.2.3


From ac8ef4ab731fae8617a41960917ceda475d5a2df Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:11 -0500
Subject: net: sched: fix coding style issues

This patch fixes checkpatch issues for upcoming patches according to the
sched api file. It mostly changes how null pointers are checked.

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c    |  2 +-
 net/sched/sch_cbq.c    | 12 ++++++------
 net/sched/sch_gred.c   |  7 ++++---
 net/sched/sch_hfsc.c   |  2 +-
 net/sched/sch_multiq.c |  2 +-
 net/sched/sch_tbf.c    |  2 +-
 6 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 74c22b4e365e..96a5e5d9378e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -669,7 +669,7 @@ int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 	unsigned int size = 4;
 
 	clhash->hash = qdisc_class_hash_alloc(size);
-	if (clhash->hash == NULL)
+	if (!clhash->hash)
 		return -ENOMEM;
 	clhash->hashsize  = size;
 	clhash->hashmask  = size - 1;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 525eb3a6d625..0692fe35f4ec 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1150,12 +1150,13 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
+	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE])
 		return -EINVAL;
 
 	r = nla_data(tb[TCA_CBQ_RATE]);
 
-	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
+	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB]);
+	if (!q->link.R_tab)
 		return -EINVAL;
 
 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch);
@@ -1460,7 +1461,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	struct cbq_class *parent;
 	struct qdisc_rate_table *rtab = NULL;
 
-	if (opt == NULL)
+	if (!opt)
 		return -EINVAL;
 
 	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
@@ -1532,8 +1533,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (parentid == TC_H_ROOT)
 		return -EINVAL;
 
-	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
-	    tb[TCA_CBQ_LSSOPT] == NULL)
+	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT])
 		return -EINVAL;
 
 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
@@ -1565,7 +1565,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (parentid) {
 		parent = cbq_class_lookup(q, parentid);
 		err = -EINVAL;
-		if (parent == NULL)
+		if (!parent)
 			goto failure;
 	}
 
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index bc30f9186ac6..ccd1a00e2a9a 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -306,12 +306,13 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
 	struct tc_gred_sopt *sopt;
 	int i;
 
-	if (dps == NULL)
+	if (!dps)
 		return -EINVAL;
 
 	sopt = nla_data(dps);
 
-	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
+	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 ||
+	    sopt->def_DP >= sopt->DPs)
 		return -EINVAL;
 
 	sch_tree_lock(sch);
@@ -470,7 +471,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt)
 	struct nlattr *tb[TCA_GRED_MAX + 1];
 	int err;
 
-	if (opt == NULL)
+	if (!opt)
 		return -EINVAL;
 
 	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d04068a97d81..94db20352f37 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1396,7 +1396,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 
 	qdisc_watchdog_init(&q->watchdog, sch);
 
-	if (opt == NULL || nla_len(opt) < sizeof(*qopt))
+	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 	qopt = nla_data(opt);
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 012216386c0b..37195e0c64ba 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -243,7 +243,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	q->queues = NULL;
 
-	if (opt == NULL)
+	if (!opt)
 		return -EINVAL;
 
 	err = tcf_block_get(&q->block, &q->filter_list, sch);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 120f4f365967..e8f3345674c5 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -428,7 +428,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
 	qdisc_watchdog_init(&q->watchdog, sch);
 	q->qdisc = &noop_qdisc;
 
-	if (opt == NULL)
+	if (!opt)
 		return -EINVAL;
 
 	q->t_c = ktime_get_ns();
-- 
cgit v1.2.3


From 09215598119ebf89bd204ca4ad8b7059266053d9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:12 -0500
Subject: net: sched: sch_api: handle generic qdisc errors

This patch adds extack support for generic qdisc handling. The extack
is passed down to each called function that is not part of the netdev
core API.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 148 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 96a5e5d9378e..954c0fc45473 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -449,7 +449,8 @@ static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
 };
 
-static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
+static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
+					       struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_STAB_MAX + 1];
 	struct qdisc_size_table *stab;
@@ -458,23 +459,29 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	u16 *tab = NULL;
 	int err;
 
-	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, NULL);
+	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
 	if (err < 0)
 		return ERR_PTR(err);
-	if (!tb[TCA_STAB_BASE])
+	if (!tb[TCA_STAB_BASE]) {
+		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
 		return ERR_PTR(-EINVAL);
+	}
 
 	s = nla_data(tb[TCA_STAB_BASE]);
 
 	if (s->tsize > 0) {
-		if (!tb[TCA_STAB_DATA])
+		if (!tb[TCA_STAB_DATA]) {
+			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
 			return ERR_PTR(-EINVAL);
+		}
 		tab = nla_data(tb[TCA_STAB_DATA]);
 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 	}
 
-	if (tsize != s->tsize || (!tab && tsize > 0))
+	if (tsize != s->tsize || (!tab && tsize > 0)) {
+		NL_SET_ERR_MSG(extack, "Invalid size of size table");
 		return ERR_PTR(-EINVAL);
+	}
 
 	list_for_each_entry(stab, &qdisc_stab_list, list) {
 		if (memcmp(&stab->szopts, s, sizeof(*s)))
@@ -899,7 +906,8 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 
 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
-		       struct Qdisc *new, struct Qdisc *old)
+		       struct Qdisc *new, struct Qdisc *old,
+		       struct netlink_ext_ack *extack)
 {
 	struct Qdisc *q = old;
 	struct net *net = dev_net(dev);
@@ -914,8 +922,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
-			if (!dev_ingress_queue(dev))
+			if (!dev_ingress_queue(dev)) {
+				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
 				return -ENOENT;
+			}
 		}
 
 		if (dev->flags & IFF_UP)
@@ -966,10 +976,12 @@ skip:
 		if (cops && cops->graft) {
 			unsigned long cl = cops->find(parent, classid);
 
-			if (cl)
+			if (cl) {
 				err = cops->graft(parent, cl, new, &old);
-			else
+			} else {
+				NL_SET_ERR_MSG(extack, "Specified class not found");
 				err = -ENOENT;
+			}
 		}
 		if (!err)
 			notify_and_destroy(net, skb, n, classid, old, new);
@@ -990,7 +1002,8 @@ static struct lock_class_key qdisc_rx_lock;
 static struct Qdisc *qdisc_create(struct net_device *dev,
 				  struct netdev_queue *dev_queue,
 				  struct Qdisc *p, u32 parent, u32 handle,
-				  struct nlattr **tca, int *errp)
+				  struct nlattr **tca, int *errp,
+				  struct netlink_ext_ack *extack)
 {
 	int err;
 	struct nlattr *kind = tca[TCA_KIND];
@@ -1028,8 +1041,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 #endif
 
 	err = -ENOENT;
-	if (!ops)
+	if (!ops) {
+		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
 		goto err_out;
+	}
 
 	sch = qdisc_alloc(dev_queue, ops);
 	if (IS_ERR(sch)) {
@@ -1086,7 +1101,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 	}
 
 	if (tca[TCA_STAB]) {
-		stab = qdisc_get_stab(tca[TCA_STAB]);
+		stab = qdisc_get_stab(tca[TCA_STAB], extack);
 		if (IS_ERR(stab)) {
 			err = PTR_ERR(stab);
 			goto err_out4;
@@ -1097,8 +1112,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		seqcount_t *running;
 
 		err = -EOPNOTSUPP;
-		if (sch->flags & TCQ_F_MQROOT)
+		if (sch->flags & TCQ_F_MQROOT) {
+			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
 			goto err_out4;
+		}
 
 		if (sch->parent != TC_H_ROOT &&
 		    !(sch->flags & TCQ_F_INGRESS) &&
@@ -1113,8 +1130,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 					NULL,
 					running,
 					tca[TCA_RATE]);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
 			goto err_out4;
+		}
 	}
 
 	qdisc_hash_add(sch, false);
@@ -1147,21 +1166,24 @@ err_out4:
 	goto err_out3;
 }
 
-static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
+static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
+			struct netlink_ext_ack *extack)
 {
 	struct qdisc_size_table *ostab, *stab = NULL;
 	int err = 0;
 
 	if (tca[TCA_OPTIONS]) {
-		if (!sch->ops->change)
+		if (!sch->ops->change) {
+			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
 			return -EINVAL;
+		}
 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 		if (err)
 			return err;
 	}
 
 	if (tca[TCA_STAB]) {
-		stab = qdisc_get_stab(tca[TCA_STAB]);
+		stab = qdisc_get_stab(tca[TCA_STAB], extack);
 		if (IS_ERR(stab))
 			return PTR_ERR(stab);
 	}
@@ -1259,8 +1281,10 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
 		if (clid != TC_H_ROOT) {
 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
-				if (!p)
+				if (!p) {
+					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
 					return -ENOENT;
+				}
 				q = qdisc_leaf(p, clid);
 			} else if (dev_ingress_queue(dev)) {
 				q = dev_ingress_queue(dev)->qdisc_sleeping;
@@ -1268,26 +1292,38 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
 		} else {
 			q = dev->qdisc;
 		}
-		if (!q)
+		if (!q) {
+			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
 			return -ENOENT;
+		}
 
-		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
+		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
+			NL_SET_ERR_MSG(extack, "Invalid handle");
 			return -EINVAL;
+		}
 	} else {
 		q = qdisc_lookup(dev, tcm->tcm_handle);
-		if (!q)
+		if (!q) {
+			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
 			return -ENOENT;
+		}
 	}
 
-	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
 		return -EINVAL;
+	}
 
 	if (n->nlmsg_type == RTM_DELQDISC) {
-		if (!clid)
+		if (!clid) {
+			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
 			return -EINVAL;
-		if (q->handle == 0)
+		}
+		if (q->handle == 0) {
+			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
 			return -ENOENT;
-		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
+		}
+		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
 		if (err != 0)
 			return err;
 	} else {
@@ -1333,8 +1369,10 @@ replay:
 		if (clid != TC_H_ROOT) {
 			if (clid != TC_H_INGRESS) {
 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
-				if (!p)
+				if (!p) {
+					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
 					return -ENOENT;
+				}
 				q = qdisc_leaf(p, clid);
 			} else if (dev_ingress_queue_create(dev)) {
 				q = dev_ingress_queue(dev)->qdisc_sleeping;
@@ -1349,21 +1387,33 @@ replay:
 
 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
 			if (tcm->tcm_handle) {
-				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
+				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
+					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
 					return -EEXIST;
-				if (TC_H_MIN(tcm->tcm_handle))
+				}
+				if (TC_H_MIN(tcm->tcm_handle)) {
+					NL_SET_ERR_MSG(extack, "Invalid minor handle");
 					return -EINVAL;
+				}
 				q = qdisc_lookup(dev, tcm->tcm_handle);
-				if (!q)
+				if (!q) {
+					NL_SET_ERR_MSG(extack, "No qdisc found for specified handle");
 					goto create_n_graft;
-				if (n->nlmsg_flags & NLM_F_EXCL)
+				}
+				if (n->nlmsg_flags & NLM_F_EXCL) {
+					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
 					return -EEXIST;
+				}
 				if (tca[TCA_KIND] &&
-				    nla_strcmp(tca[TCA_KIND], q->ops->id))
+				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
 					return -EINVAL;
+				}
 				if (q == p ||
-				    (p && check_loop(q, p, 0)))
+				    (p && check_loop(q, p, 0))) {
+					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
 					return -ELOOP;
+				}
 				qdisc_refcount_inc(q);
 				goto graft;
 			} else {
@@ -1398,33 +1448,45 @@ replay:
 			}
 		}
 	} else {
-		if (!tcm->tcm_handle)
+		if (!tcm->tcm_handle) {
+			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
 			return -EINVAL;
+		}
 		q = qdisc_lookup(dev, tcm->tcm_handle);
 	}
 
 	/* Change qdisc parameters */
-	if (!q)
+	if (!q) {
+		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
 		return -ENOENT;
-	if (n->nlmsg_flags & NLM_F_EXCL)
+	}
+	if (n->nlmsg_flags & NLM_F_EXCL) {
+		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
 		return -EEXIST;
-	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+	}
+	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
 		return -EINVAL;
-	err = qdisc_change(q, tca);
+	}
+	err = qdisc_change(q, tca, extack);
 	if (err == 0)
 		qdisc_notify(net, skb, n, clid, NULL, q);
 	return err;
 
 create_n_graft:
-	if (!(n->nlmsg_flags & NLM_F_CREATE))
+	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
 		return -ENOENT;
+	}
 	if (clid == TC_H_INGRESS) {
-		if (dev_ingress_queue(dev))
+		if (dev_ingress_queue(dev)) {
 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
 					 tcm->tcm_parent, tcm->tcm_parent,
-					 tca, &err);
-		else
+					 tca, &err, extack);
+		} else {
+			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
 			err = -ENOENT;
+		}
 	} else {
 		struct netdev_queue *dev_queue;
 
@@ -1437,7 +1499,7 @@ create_n_graft:
 
 		q = qdisc_create(dev, dev_queue, p,
 				 tcm->tcm_parent, tcm->tcm_handle,
-				 tca, &err);
+				 tca, &err, extack);
 	}
 	if (q == NULL) {
 		if (err == -EAGAIN)
@@ -1446,7 +1508,7 @@ create_n_graft:
 	}
 
 graft:
-	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
+	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
 	if (err) {
 		if (q)
 			qdisc_destroy(q);
-- 
cgit v1.2.3


From e63d7dfd2df7aa204849599c6f378e627e926657 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:13 -0500
Subject: net: sched: sch: add extack for init callback

This patch adds extack support for init callback to prepare per-qdisc
specific changes for extack.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  3 ++-
 net/sched/sch_api.c       |  2 +-
 net/sched/sch_atm.c       |  3 ++-
 net/sched/sch_cbq.c       |  3 ++-
 net/sched/sch_cbs.c       |  3 ++-
 net/sched/sch_choke.c     |  3 ++-
 net/sched/sch_codel.c     |  3 ++-
 net/sched/sch_drr.c       |  3 ++-
 net/sched/sch_dsmark.c    |  3 ++-
 net/sched/sch_fifo.c      | 14 ++++++++++----
 net/sched/sch_fq.c        |  3 ++-
 net/sched/sch_fq_codel.c  |  3 ++-
 net/sched/sch_generic.c   |  8 +++++---
 net/sched/sch_gred.c      |  3 ++-
 net/sched/sch_hfsc.c      |  3 ++-
 net/sched/sch_hhf.c       |  3 ++-
 net/sched/sch_htb.c       |  3 ++-
 net/sched/sch_ingress.c   |  6 ++++--
 net/sched/sch_mq.c        |  3 ++-
 net/sched/sch_mqprio.c    |  3 ++-
 net/sched/sch_multiq.c    |  3 ++-
 net/sched/sch_netem.c     |  3 ++-
 net/sched/sch_pie.c       |  3 ++-
 net/sched/sch_plug.c      |  3 ++-
 net/sched/sch_prio.c      |  3 ++-
 net/sched/sch_qfq.c       |  3 ++-
 net/sched/sch_red.c       |  3 ++-
 net/sched/sch_sfb.c       |  3 ++-
 net/sched/sch_sfq.c       |  3 ++-
 net/sched/sch_tbf.c       |  3 ++-
 net/sched/sch_teql.c      |  3 ++-
 31 files changed, 74 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bc6b25faba99..4c5faa0ff47d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -189,7 +189,8 @@ struct Qdisc_ops {
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
 
-	int			(*init)(struct Qdisc *sch, struct nlattr *arg);
+	int			(*init)(struct Qdisc *sch, struct nlattr *arg,
+					struct netlink_ext_ack *extack);
 	void			(*reset)(struct Qdisc *);
 	void			(*destroy)(struct Qdisc *);
 	int			(*change)(struct Qdisc *sch,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 954c0fc45473..49ee016347d2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1084,7 +1084,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 	}
 
 	if (ops->init) {
-		err = ops->init(sch, tca[TCA_OPTIONS]);
+		err = ops->init(sch, tca[TCA_OPTIONS], extack);
 		if (err != 0)
 			goto err_out5;
 	}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 2dbd249c0b2f..53a07687c0fb 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -531,7 +531,8 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
 	return p->link.q->ops->peek(p->link.q);
 }
 
-static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
+static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	int err;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 0692fe35f4ec..86eba01457f3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1132,7 +1132,8 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
 };
 
-static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
+static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_CBQ_MAX + 1];
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 7a72980c1509..d77c632a276c 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -291,7 +291,8 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct cbs_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 531250fceb9e..49dda301e3bb 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -431,7 +431,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+static int choke_init(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	return choke_change(sch, opt);
 }
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index c518a1efcb9d..7221244e7f3b 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -184,7 +184,8 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+static int codel_init(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 5bbcef3dcd8c..1a88473cd768 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -408,7 +408,8 @@ out:
 	return NULL;
 }
 
-static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+			  struct netlink_ext_ack *extack)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	int err;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index fb4fb71c68cf..16dd480b5583 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -330,7 +330,8 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)
 	return p->q->ops->peek(p->q);
 }
 
-static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
+static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	struct nlattr *tb[TCA_DSMARK_MAX + 1];
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 1e37247656f8..a2d1c9f9b798 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -55,7 +55,8 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_CN;
 }
 
-static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	bool bypass;
 	bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
@@ -88,6 +89,11 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
+static int fifo_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	return fifo_init(sch, opt, NULL);
+}
+
 static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct tc_fifo_qopt opt = { .limit = sch->limit };
@@ -108,7 +114,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_init,
+	.change		=	fifo_change,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
@@ -122,7 +128,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_init,
+	.change		=	fifo_change,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
@@ -136,7 +142,7 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_init,
+	.change		=	fifo_change,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 263d16e3219e..c9f61ffe220e 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -788,7 +788,8 @@ static void fq_destroy(struct Qdisc *sch)
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
-static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+static int fq_init(struct Qdisc *sch, struct nlattr *opt,
+		   struct netlink_ext_ack *extack)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	int err;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 0305d791ea94..5d0b20898ffa 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -458,7 +458,8 @@ static void fq_codel_destroy(struct Qdisc *sch)
 	kvfree(q->flows);
 }
 
-static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
+			 struct netlink_ext_ack *extack)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	int i;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 876fab2604b8..30bc38c5d7ae 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -551,7 +551,8 @@ struct Qdisc noop_qdisc = {
 };
 EXPORT_SYMBOL(noop_qdisc);
 
-static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
+static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
 {
 	/* register_qdisc() assigns a default of noop_enqueue if unset,
 	 * but __dev_queue_xmit() treats noqueue only as such
@@ -690,7 +691,8 @@ nla_put_failure:
 	return -1;
 }
 
-static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
+static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
+			   struct netlink_ext_ack *extack)
 {
 	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -840,7 +842,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 	}
 	sch->parent = parentid;
 
-	if (!ops->init || ops->init(sch, NULL) == 0)
+	if (!ops->init || ops->init(sch, NULL, NULL) == 0)
 		return sch;
 
 	qdisc_destroy(sch);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index ccd1a00e2a9a..4cab6ccad643 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -466,7 +466,8 @@ errout:
 	return err;
 }
 
-static int gred_init(struct Qdisc *sch, struct nlattr *opt)
+static int gred_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_GRED_MAX + 1];
 	int err;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 94db20352f37..1102943c46c9 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1388,7 +1388,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
 }
 
 static int
-hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+		struct netlink_ext_ack *extack)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct tc_hfsc_qopt *qopt;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 73a53c08091b..b3a80f0ed4b0 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -571,7 +571,8 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
+static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	int i;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index fa0380730ff0..41d9b7da9273 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1017,7 +1017,8 @@ static void htb_work_func(struct work_struct *work)
 	rcu_read_unlock();
 }
 
-static int htb_init(struct Qdisc *sch, struct nlattr *opt)
+static int htb_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_HTB_MAX + 1];
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index fc1286f499c1..a6f175e64016 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -62,7 +62,8 @@ static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
 	mini_qdisc_pair_swap(miniqp, tp_head);
 }
 
-static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
+static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
 {
 	struct ingress_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
@@ -167,7 +168,8 @@ static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl)
 	}
 }
 
-static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
+static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct clsact_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 8cbb5c829d59..b91f7d8cb184 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -36,7 +36,8 @@ static void mq_destroy(struct Qdisc *sch)
 	kfree(priv->qdiscs);
 }
 
-static int mq_init(struct Qdisc *sch, struct nlattr *opt)
+static int mq_init(struct Qdisc *sch, struct nlattr *opt,
+		   struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mq_sched *priv = qdisc_priv(sch);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 8622745f3cd9..0379fc4ee7bb 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -132,7 +132,8 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 	return 0;
 }
 
-static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 37195e0c64ba..54132dde6d42 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -236,7 +236,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	int i, err;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index dd70924cbcdf..6490ce08d29e 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -984,7 +984,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 	return ret;
 }
 
-static int netem_init(struct Qdisc *sch, struct nlattr *opt)
+static int netem_init(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	int ret;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 776c694c77c7..c4c87ed3971f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -439,7 +439,8 @@ static void pie_timer(struct timer_list *t)
 
 }
 
-static int pie_init(struct Qdisc *sch, struct nlattr *opt)
+static int pie_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 1c6cbab3e7b9..d9c6fbe55ae5 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -123,7 +123,8 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch)
 	return qdisc_dequeue_head(sch);
 }
 
-static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+static int plug_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	struct plug_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 2c79559a0d31..8632d795e6ee 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -205,7 +205,8 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int prio_init(struct Qdisc *sch, struct nlattr *opt)
+static int prio_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	int err;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 6962b37a3ad3..7c1b976314bd 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1413,7 +1413,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
 	qfq_deactivate_class(q, cl);
 }
 
-static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+			  struct netlink_ext_ack *extack)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_group *grp;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index f0747eb87dc4..46d12206c7af 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -272,7 +272,8 @@ static inline void red_adaptative_timer(struct timer_list *t)
 	spin_unlock(root_lock);
 }
 
-static int red_init(struct Qdisc *sch, struct nlattr *opt)
+static int red_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 0678debdd856..b2205eaa0f51 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -549,7 +549,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
 	int err;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 930e5bd26d3d..3b5869c7b3f3 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -721,7 +721,8 @@ static void sfq_destroy(struct Qdisc *sch)
 	kfree(q->red_parms);
 }
 
-static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
+static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	int i;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index e8f3345674c5..9abff1271ec0 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -421,7 +421,8 @@ done:
 	return err;
 }
 
-static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 9fe6b427afed..93f04cf5cac1 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -167,7 +167,8 @@ teql_destroy(struct Qdisc *sch)
 	}
 }
 
-static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
+static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+			   struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct teql_master *m = (struct teql_master *)sch->ops;
-- 
cgit v1.2.3


From 2030721cc0c39ff19df94a0df77b0401fdb71c1a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:14 -0500
Subject: net: sched: sch: add extack for change qdisc ops

This patch adds extack support for change callback for qdisc ops
structure to prepare per-qdisc specific changes for extack.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  3 ++-
 net/sched/sch_api.c       |  2 +-
 net/sched/sch_cbs.c       |  5 +++--
 net/sched/sch_choke.c     |  5 +++--
 net/sched/sch_codel.c     |  5 +++--
 net/sched/sch_fifo.c      | 13 ++++---------
 net/sched/sch_fq.c        |  5 +++--
 net/sched/sch_fq_codel.c  |  5 +++--
 net/sched/sch_gred.c      |  3 ++-
 net/sched/sch_hfsc.c      |  3 ++-
 net/sched/sch_hhf.c       |  5 +++--
 net/sched/sch_multiq.c    |  5 +++--
 net/sched/sch_netem.c     |  5 +++--
 net/sched/sch_pie.c       |  5 +++--
 net/sched/sch_plug.c      |  3 ++-
 net/sched/sch_prio.c      |  5 +++--
 net/sched/sch_red.c       |  5 +++--
 net/sched/sch_sfb.c       |  5 +++--
 net/sched/sch_tbf.c       |  5 +++--
 19 files changed, 52 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4c5faa0ff47d..e7a3e206b904 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -194,7 +194,8 @@ struct Qdisc_ops {
 	void			(*reset)(struct Qdisc *);
 	void			(*destroy)(struct Qdisc *);
 	int			(*change)(struct Qdisc *sch,
-					  struct nlattr *arg);
+					  struct nlattr *arg,
+					  struct netlink_ext_ack *extack);
 	void			(*attach)(struct Qdisc *sch);
 
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 49ee016347d2..fcc70415fd26 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1177,7 +1177,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
 			return -EINVAL;
 		}
-		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
+		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
 		if (err)
 			return err;
 	}
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index d77c632a276c..8bf6e163d29c 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -246,7 +246,8 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
 	return 0;
 }
 
-static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct cbs_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
@@ -307,7 +308,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
 
 	qdisc_watchdog_init(&q->watchdog, sch);
 
-	return cbs_change(sch, opt);
+	return cbs_change(sch, opt, extack);
 }
 
 static void cbs_destroy(struct Qdisc *sch)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 49dda301e3bb..eafc0d17d174 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -344,7 +344,8 @@ static void choke_free(void *addr)
 	kvfree(addr);
 }
 
-static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+static int choke_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
@@ -434,7 +435,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
 		      struct netlink_ext_ack *extack)
 {
-	return choke_change(sch, opt);
+	return choke_change(sch, opt, extack);
 }
 
 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 7221244e7f3b..17cd81f84b5d 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -130,7 +130,8 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
 	[TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 },
 };
 
-static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+static int codel_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_CODEL_MAX + 1];
@@ -197,7 +198,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt,
 	q->params.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
-		int err = codel_change(sch, opt);
+		int err = codel_change(sch, opt, extack);
 
 		if (err)
 			return err;
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index a2d1c9f9b798..c65f23c70f40 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -89,11 +89,6 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
 	return 0;
 }
 
-static int fifo_change(struct Qdisc *sch, struct nlattr *opt)
-{
-	return fifo_init(sch, opt, NULL);
-}
-
 static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct tc_fifo_qopt opt = { .limit = sch->limit };
@@ -114,7 +109,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_change,
+	.change		=	fifo_init,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
@@ -128,7 +123,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_change,
+	.change		=	fifo_init,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
@@ -142,7 +137,7 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
 	.peek		=	qdisc_peek_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.change		=	fifo_change,
+	.change		=	fifo_init,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
@@ -163,7 +158,7 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit)
 		nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
 		((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
 
-		ret = q->ops->change(q, nla);
+		ret = q->ops->change(q, nla, NULL);
 		kfree(nla);
 	}
 	return ret;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index c9f61ffe220e..a366e4c9413a 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -685,7 +685,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
 };
 
-static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+static int fq_change(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_FQ_MAX + 1];
@@ -812,7 +813,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
-		err = fq_change(sch, opt);
+		err = fq_change(sch, opt, extack);
 	else
 		err = fq_resize(sch, q->fq_trees_log);
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 5d0b20898ffa..d798c93f7c96 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -377,7 +377,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
 	[TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 },
 };
 
-static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
+			   struct netlink_ext_ack *extack)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
@@ -478,7 +479,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
 	q->cparams.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
-		int err = fq_codel_change(sch, opt);
+		int err = fq_codel_change(sch, opt, extack);
 		if (err)
 			return err;
 	}
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 4cab6ccad643..cbe4831f46f4 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -392,7 +392,8 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
 	[TCA_GRED_LIMIT]	= { .type = NLA_U32 },
 };
 
-static int gred_change(struct Qdisc *sch, struct nlattr *opt)
+static int gred_change(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct gred_sched *table = qdisc_priv(sch);
 	struct tc_gred_qopt *ctl;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 1102943c46c9..f49a4a4fe095 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1430,7 +1430,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
 }
 
 static int
-hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt)
+hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt,
+		  struct netlink_ext_ack *extack)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct tc_hfsc_qopt *qopt;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index b3a80f0ed4b0..bce2632212d3 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -504,7 +504,8 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
 	[TCA_HHF_NON_HH_WEIGHT]	 = { .type = NLA_U32 },
 };
 
-static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
+static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_HHF_MAX + 1];
@@ -590,7 +591,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
 	q->hhf_non_hh_weight = 2;
 
 	if (opt) {
-		int err = hhf_change(sch, opt);
+		int err = hhf_change(sch, opt, extack);
 
 		if (err)
 			return err;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 54132dde6d42..a8db1dbeb04f 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -180,7 +180,8 @@ multiq_destroy(struct Qdisc *sch)
 	kfree(q->queues);
 }
 
-static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	struct tc_multiq_qopt *qopt;
@@ -259,7 +260,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt,
 	for (i = 0; i < q->max_bands; i++)
 		q->queues[i] = &noop_qdisc;
 
-	return multiq_tune(sch, opt);
+	return multiq_tune(sch, opt, extack);
 }
 
 static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 6490ce08d29e..f45040b55531 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -893,7 +893,8 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 }
 
 /* Parse netlink message to set options */
-static int netem_change(struct Qdisc *sch, struct nlattr *opt)
+static int netem_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_NETEM_MAX + 1];
@@ -996,7 +997,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 
 	q->loss_model = CLG_RANDOM;
-	ret = netem_change(sch, opt);
+	ret = netem_change(sch, opt, extack);
 	if (ret)
 		pr_info("netem: change failed\n");
 	return ret;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index c4c87ed3971f..18d30bb86881 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -181,7 +181,8 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
 	[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
 };
 
-static int pie_change(struct Qdisc *sch, struct nlattr *opt)
+static int pie_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_PIE_MAX + 1];
@@ -452,7 +453,7 @@ static int pie_init(struct Qdisc *sch, struct nlattr *opt,
 	timer_setup(&q->adapt_timer, pie_timer, 0);
 
 	if (opt) {
-		int err = pie_change(sch, opt);
+		int err = pie_change(sch, opt, extack);
 
 		if (err)
 			return err;
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index d9c6fbe55ae5..5619d2eb17b6 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -159,7 +159,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt,
  *   command is received (just act as a pass-thru queue).
  * TCQ_PLUG_LIMIT: Increase/decrease queue size
  */
-static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+static int plug_change(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
 {
 	struct plug_sched_data *q = qdisc_priv(sch);
 	struct tc_plug_qopt *msg;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8632d795e6ee..5f8ecbaa2610 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -153,7 +153,8 @@ prio_destroy(struct Qdisc *sch)
 		qdisc_destroy(q->queues[prio]);
 }
 
-static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
+static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	struct Qdisc *queues[TCQ_PRIO_BANDS];
@@ -218,7 +219,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
 	if (err)
 		return err;
 
-	return prio_tune(sch, opt);
+	return prio_tune(sch, opt, extack);
 }
 
 static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 46d12206c7af..6b85f8334b74 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -197,7 +197,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
 	[TCA_RED_MAX_P] = { .type = NLA_U32 },
 };
 
-static int red_change(struct Qdisc *sch, struct nlattr *opt)
+static int red_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_RED_MAX + 1];
@@ -280,7 +281,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 	q->qdisc = &noop_qdisc;
 	q->sch = sch;
 	timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
-	return red_change(sch, opt);
+	return red_change(sch, opt, extack);
 }
 
 static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index b2205eaa0f51..1b9d69bd6ed6 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -488,7 +488,8 @@ static const struct tc_sfb_qopt sfb_default_ops = {
 	.penalty_burst = 20,
 };
 
-static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
 	struct Qdisc *child;
@@ -560,7 +561,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt,
 		return err;
 
 	q->qdisc = &noop_qdisc;
-	return sfb_change(sch, opt);
+	return sfb_change(sch, opt, extack);
 }
 
 static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 9abff1271ec0..273228eb5ce0 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -302,7 +302,8 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 	[TCA_TBF_PBURST] = { .type = NLA_U32 },
 };
 
-static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
+		      struct netlink_ext_ack *extack)
 {
 	int err;
 	struct tbf_sched_data *q = qdisc_priv(sch);
@@ -434,7 +435,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
 
 	q->t_c = ktime_get_ns();
 
-	return tbf_change(sch, opt);
+	return tbf_change(sch, opt, extack);
 }
 
 static void tbf_destroy(struct Qdisc *sch)
-- 
cgit v1.2.3


From 793d81d6a1965f1e1806ebc9aacc84a639b90282 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:15 -0500
Subject: net: sched: sch: add extack to change class

This patch adds extack support to the class change callback API. This prepares
for handling extack inside each specific class implementation.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/sch_api.c       | 2 +-
 net/sched/sch_atm.c       | 3 ++-
 net/sched/sch_cbq.c       | 2 +-
 net/sched/sch_drr.c       | 3 ++-
 net/sched/sch_dsmark.c    | 3 ++-
 net/sched/sch_fq_codel.c  | 2 +-
 net/sched/sch_hfsc.c      | 3 ++-
 net/sched/sch_htb.c       | 2 +-
 net/sched/sch_qfq.c       | 3 ++-
 net/sched/sch_sfb.c       | 3 ++-
 11 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e7a3e206b904..b4660a3ea99c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -158,7 +158,8 @@ struct Qdisc_class_ops {
 	/* Class manipulation routines */
 	unsigned long		(*find)(struct Qdisc *, u32 classid);
 	int			(*change)(struct Qdisc *, u32, u32,
-					struct nlattr **, unsigned long *);
+					struct nlattr **, unsigned long *,
+					struct netlink_ext_ack *);
 	int			(*delete)(struct Qdisc *, unsigned long);
 	void			(*walk)(struct Qdisc *, struct qdisc_walker * arg);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index fcc70415fd26..6cf2f7dadbdb 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1907,7 +1907,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
 	new_cl = cl;
 	err = -EOPNOTSUPP;
 	if (cops->change)
-		err = cops->change(q, clid, portid, tca, &new_cl);
+		err = cops->change(q, clid, portid, tca, &new_cl, extack);
 	if (err == 0) {
 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
 		/* We just create a new class, need to do reverse binding. */
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 53a07687c0fb..80ada9affe81 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -191,7 +191,8 @@ static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
 };
 
 static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct nlattr **tca, unsigned long *arg)
+			 struct nlattr **tca, unsigned long *arg,
+			 struct netlink_ext_ack *extack)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 86eba01457f3..8f1832df8b4f 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1452,7 +1452,7 @@ static void cbq_destroy(struct Qdisc *sch)
 
 static int
 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
-		 unsigned long *arg)
+		 unsigned long *arg, struct netlink_ext_ack *extack)
 {
 	int err;
 	struct cbq_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 1a88473cd768..73b914bc47a4 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -64,7 +64,8 @@ static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
 };
 
 static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
+			    struct nlattr **tca, unsigned long *arg,
+			    struct netlink_ext_ack *extack)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl = (struct drr_class *)*arg;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 16dd480b5583..89e433bbd590 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -112,7 +112,8 @@ static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
 };
 
 static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct nlattr **tca, unsigned long *arg)
+			 struct nlattr **tca, unsigned long *arg,
+			 struct netlink_ext_ack *extack)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	struct nlattr *opt = tca[TCA_OPTIONS];
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d798c93f7c96..b4ca46aafb5a 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -479,7 +479,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
 	q->cparams.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
-		int err = fq_codel_change(sch, opt, NULL);
+		int err = fq_codel_change(sch, opt, extack);
 		if (err)
 			return err;
 	}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index f49a4a4fe095..11410b0e4068 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -921,7 +921,8 @@ static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
 
 static int
 hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-		  struct nlattr **tca, unsigned long *arg)
+		  struct nlattr **tca, unsigned long *arg,
+		  struct netlink_ext_ack *extack)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hfsc_class *cl = (struct hfsc_class *)*arg;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 41d9b7da9273..eb535a23a69b 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1327,7 +1327,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 
 static int htb_change_class(struct Qdisc *sch, u32 classid,
 			    u32 parentid, struct nlattr **tca,
-			    unsigned long *arg)
+			    unsigned long *arg, struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 	struct htb_sched *q = qdisc_priv(sch);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 7c1b976314bd..1f4a84b687d2 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -402,7 +402,8 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
 }
 
 static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
+			    struct nlattr **tca, unsigned long *arg,
+			    struct netlink_ext_ack *extack)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl = (struct qfq_class *)*arg;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1b9d69bd6ed6..d70d470361be 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -645,7 +645,8 @@ static void sfb_unbind(struct Qdisc *sch, unsigned long arg)
 }
 
 static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
+			    struct nlattr **tca, unsigned long *arg,
+			    struct netlink_ext_ack *extack)
 {
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From cbaacc4e8a394d63bcd707775ca5bb7a51aaabee Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:16 -0500
Subject: net: sched: sch: add extack for block callback

This patch adds extack support to the block callback, in preparation for
per-qdisc specific extack changes.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/cls_api.c       | 4 ++--
 net/sched/sch_api.c       | 2 +-
 net/sched/sch_atm.c       | 3 ++-
 net/sched/sch_cbq.c       | 3 ++-
 net/sched/sch_drr.c       | 3 ++-
 net/sched/sch_dsmark.c    | 3 ++-
 net/sched/sch_fq_codel.c  | 3 ++-
 net/sched/sch_hfsc.c      | 3 ++-
 net/sched/sch_htb.c       | 3 ++-
 net/sched/sch_ingress.c   | 6 ++++--
 net/sched/sch_multiq.c    | 3 ++-
 net/sched/sch_prio.c      | 3 ++-
 net/sched/sch_qfq.c       | 3 ++-
 net/sched/sch_sfb.c       | 3 ++-
 net/sched/sch_sfq.c       | 3 ++-
 16 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index b4660a3ea99c..f65dd2837142 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -165,7 +165,8 @@ struct Qdisc_class_ops {
 
 	/* Filter manipulation */
 	struct tcf_block *	(*tcf_block)(struct Qdisc *sch,
-					     unsigned long arg);
+					     unsigned long arg,
+					     struct netlink_ext_ack *extack);
 	unsigned long		(*bind_tcf)(struct Qdisc *, unsigned long,
 					u32 classid);
 	void			(*unbind_tcf)(struct Qdisc *, unsigned long);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 32b1ea7cf863..22b977d40e1d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -793,7 +793,7 @@ replay:
 	}
 
 	/* And the last stroke */
-	block = cops->tcf_block(q, cl);
+	block = cops->tcf_block(q, cl, extack);
 	if (!block) {
 		err = -EINVAL;
 		goto errout;
@@ -1040,7 +1040,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 		if (cl == 0)
 			goto out;
 	}
-	block = cops->tcf_block(q, cl);
+	block = cops->tcf_block(q, cl, NULL);
 	if (!block)
 		goto out;
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6cf2f7dadbdb..8c8c15b4da3b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1760,7 +1760,7 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
 	cl = cops->find(q, portid);
 	if (!cl)
 		return;
-	block = cops->tcf_block(q, cl);
+	block = cops->tcf_block(q, cl, NULL);
 	if (!block)
 		return;
 	list_for_each_entry(chain, &block->chain_list, list) {
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 80ada9affe81..b606a75af333 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -357,7 +357,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 8f1832df8b4f..d46048a439a6 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1679,7 +1679,8 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 	return 0;
 }
 
-static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
+static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
+				       struct netlink_ext_ack *extack)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl = (struct cbq_class *)arg;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 73b914bc47a4..44a2870f6f10 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -173,7 +173,8 @@ static unsigned long drr_search_class(struct Qdisc *sch, u32 classid)
 	return (unsigned long)drr_find_class(sch, classid);
 }
 
-static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl,
+				       struct netlink_ext_ack *extack)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 89e433bbd590..5dc5d5216fbb 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -185,7 +185,8 @@ ignore:
 	}
 }
 
-static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index b4ca46aafb5a..06e5360c54d8 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -597,7 +597,8 @@ static void fq_codel_unbind(struct Qdisc *q, unsigned long cl)
 {
 }
 
-static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl,
+					    struct netlink_ext_ack *extack)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 11410b0e4068..961668d657a0 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1247,7 +1247,8 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 	cl->filter_cnt--;
 }
 
-static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg)
+static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg,
+					struct netlink_ext_ack *extack)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hfsc_class *cl = (struct hfsc_class *)arg;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index eb535a23a69b..79cf24468a38 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1525,7 +1525,8 @@ failure:
 	return err;
 }
 
-static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
+static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
+				       struct netlink_ext_ack *extack)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)arg;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index a6f175e64016..b9de7be531dd 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -48,7 +48,8 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 {
 }
 
-static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl,
+					   struct netlink_ext_ack *extack)
 {
 	struct ingress_sched_data *q = qdisc_priv(sch);
 
@@ -154,7 +155,8 @@ static unsigned long clsact_bind_filter(struct Qdisc *sch,
 	return clsact_find(sch, classid);
 }
 
-static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
 {
 	struct clsact_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index a8db1dbeb04f..4bcbd3636606 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -371,7 +371,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	}
 }
 
-static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 5f8ecbaa2610..077af4730749 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -329,7 +329,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	}
 }
 
-static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl,
+					struct netlink_ext_ack *extack)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 1f4a84b687d2..e77e7131e620 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -565,7 +565,8 @@ static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
 	return (unsigned long)qfq_find_class(sch, classid);
 }
 
-static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
+				       struct netlink_ext_ack *extack)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index d70d470361be..9e01b80edfe7 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -668,7 +668,8 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl,
+				       struct netlink_ext_ack *extack)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3b5869c7b3f3..7a217be39f2a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -837,7 +837,8 @@ static void sfq_unbind(struct Qdisc *q, unsigned long cl)
 {
 }
 
-static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
+static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
+				       struct netlink_ext_ack *extack)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 
-- 
cgit v1.2.3


From 653d6fd68d8e5b43d496ca8a1d38331d515a226b Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:17 -0500
Subject: net: sched: sch: add extack for graft callback

This patch adds extack support to the graft callback, in preparation for
per-qdisc specific extack changes.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/sch_api.c       | 3 ++-
 net/sched/sch_atm.c       | 3 ++-
 net/sched/sch_cbq.c       | 2 +-
 net/sched/sch_drr.c       | 3 ++-
 net/sched/sch_dsmark.c    | 3 ++-
 net/sched/sch_hfsc.c      | 2 +-
 net/sched/sch_htb.c       | 2 +-
 net/sched/sch_mq.c        | 2 +-
 net/sched/sch_mqprio.c    | 2 +-
 net/sched/sch_multiq.c    | 2 +-
 net/sched/sch_netem.c     | 2 +-
 net/sched/sch_prio.c      | 2 +-
 net/sched/sch_qfq.c       | 3 ++-
 net/sched/sch_red.c       | 2 +-
 net/sched/sch_sfb.c       | 2 +-
 net/sched/sch_tbf.c       | 2 +-
 17 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f65dd2837142..3baadac9e7a5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -151,7 +151,8 @@ struct Qdisc_class_ops {
 	/* Child qdisc manipulation */
 	struct netdev_queue *	(*select_queue)(struct Qdisc *, struct tcmsg *);
 	int			(*graft)(struct Qdisc *, unsigned long cl,
-					struct Qdisc *, struct Qdisc **);
+					struct Qdisc *, struct Qdisc **,
+					struct netlink_ext_ack *extack);
 	struct Qdisc *		(*leaf)(struct Qdisc *, unsigned long cl);
 	void			(*qlen_notify)(struct Qdisc *, unsigned long);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 8c8c15b4da3b..4b950d72d13b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -977,7 +977,8 @@ skip:
 			unsigned long cl = cops->find(parent, classid);
 
 			if (cl) {
-				err = cops->graft(parent, cl, new, &old);
+				err = cops->graft(parent, cl, new, &old,
+						  extack);
 			} else {
 				NL_SET_ERR_MSG(extack, "Specified class not found");
 				err = -ENOENT;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index b606a75af333..8972ab72cda5 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -82,7 +82,8 @@ static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
 }
 
 static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
-			struct Qdisc *new, struct Qdisc **old)
+			struct Qdisc *new, struct Qdisc **old,
+			struct netlink_ext_ack *extack)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d46048a439a6..bb7e4ccd7caf 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1371,7 +1371,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 }
 
 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct cbq_class *cl = (struct cbq_class *)arg;
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 44a2870f6f10..30e9cba54ddb 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -203,7 +203,8 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 }
 
 static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
-			   struct Qdisc *new, struct Qdisc **old)
+			   struct Qdisc *new, struct Qdisc **old,
+			   struct netlink_ext_ack *extack)
 {
 	struct drr_class *cl = (struct drr_class *)arg;
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 5dc5d5216fbb..92a36aa4c713 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -61,7 +61,8 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
 /* ------------------------- Class/flow operations ------------------------- */
 
 static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
-			struct Qdisc *new, struct Qdisc **old)
+			struct Qdisc *new, struct Qdisc **old,
+			struct netlink_ext_ack *extack)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 961668d657a0..7f6a06ac4b9f 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1177,7 +1177,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 
 static int
 hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		 struct Qdisc **old)
+		 struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct hfsc_class *cl = (struct hfsc_class *)arg;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 79cf24468a38..65762d57a70d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1172,7 +1172,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 }
 
 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct htb_class *cl = (struct htb_class *)arg;
 
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index b91f7d8cb184..50292e470432 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -155,7 +155,7 @@ static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
 }
 
 static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
-		    struct Qdisc **old)
+		    struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
 	struct net_device *dev = qdisc_dev(sch);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0379fc4ee7bb..29071cf329f3 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -320,7 +320,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
 }
 
 static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
-		    struct Qdisc **old)
+			struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 4bcbd3636606..177d86de4b32 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -283,7 +283,7 @@ nla_put_failure:
 }
 
 static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		      struct Qdisc **old)
+			struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index f45040b55531..7bbc13b8ca47 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1159,7 +1159,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 }
 
 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 077af4730749..8fbd65661d77 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -242,7 +242,7 @@ nla_put_failure:
 }
 
 static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		      struct Qdisc **old)
+		      struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index e77e7131e620..7ec893f770d2 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -595,7 +595,8 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 }
 
 static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
-			   struct Qdisc *new, struct Qdisc **old)
+			   struct Qdisc *new, struct Qdisc **old,
+			   struct netlink_ext_ack *extack)
 {
 	struct qfq_class *cl = (struct qfq_class *)arg;
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 6b85f8334b74..ea7d400b9eb2 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -382,7 +382,7 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl,
 }
 
 static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 9e01b80edfe7..1a33d6c3ac42 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -617,7 +617,7 @@ static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
 }
 
 static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 273228eb5ce0..db6bd23530d4 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -496,7 +496,7 @@ static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 }
 
 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old)
+		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
-- 
cgit v1.2.3


From e9bc3fa28bae7612f41e3538f241a2f87f629c94 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:18 -0500
Subject: net: sch: api: add extack support in qdisc_get_rtab

This patch adds extack support to the function qdisc_get_rtab, which is
a commonly used function in the tc subsystem. Callers that are interested
in the resulting error can pass extack to get more detailed
information about why qdisc_get_rtab failed.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 3 ++-
 net/sched/act_police.c  | 4 ++--
 net/sched/sch_api.c     | 9 +++++++--
 net/sched/sch_cbq.c     | 7 ++++---
 net/sched/sch_htb.c     | 6 ++++--
 net/sched/sch_tbf.c     | 6 ++++--
 6 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 240469228851..a4f21c0b4a43 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -101,7 +101,8 @@ void qdisc_hash_del(struct Qdisc *q);
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
 struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
-					struct nlattr *tab);
+					struct nlattr *tab,
+					struct netlink_ext_ack *extack);
 void qdisc_put_rtab(struct qdisc_rate_table *tab);
 void qdisc_put_stab(struct qdisc_size_table *tab);
 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index bf483db993a1..95d3c9097b25 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -118,13 +118,13 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 	police = to_police(*a);
 	if (parm->rate.rate) {
 		err = -ENOMEM;
-		R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
+		R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE], NULL);
 		if (R_tab == NULL)
 			goto failure;
 
 		if (parm->peakrate.rate) {
 			P_tab = qdisc_get_rtab(&parm->peakrate,
-					       tb[TCA_POLICE_PEAKRATE]);
+					       tb[TCA_POLICE_PEAKRATE], NULL);
 			if (P_tab == NULL)
 				goto failure;
 		}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 4b950d72d13b..79a9fdf9471d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -393,13 +393,16 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 static struct qdisc_rate_table *qdisc_rtab_list;
 
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
-					struct nlattr *tab)
+					struct nlattr *tab,
+					struct netlink_ext_ack *extack)
 {
 	struct qdisc_rate_table *rtab;
 
 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
-	    nla_len(tab) != TC_RTAB_SIZE)
+	    nla_len(tab) != TC_RTAB_SIZE) {
+		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
 		return NULL;
+	}
 
 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
@@ -418,6 +421,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 			r->linklayer = __detect_linklayer(r, rtab->data);
 		rtab->next = qdisc_rtab_list;
 		qdisc_rtab_list = rtab;
+	} else {
+		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
 	}
 	return rtab;
 }
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index bb7e4ccd7caf..79f081eb6bb0 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1156,7 +1156,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
 
 	r = nla_data(tb[TCA_CBQ_RATE]);
 
-	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB]);
+	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
 	if (!q->link.R_tab)
 		return -EINVAL;
 
@@ -1484,7 +1484,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 
 		if (tb[TCA_CBQ_RATE]) {
 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
-					      tb[TCA_CBQ_RTAB]);
+					      tb[TCA_CBQ_RTAB], extack);
 			if (rtab == NULL)
 				return -EINVAL;
 		}
@@ -1537,7 +1537,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT])
 		return -EINVAL;
 
-	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
+	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
+			      extack);
 	if (rtab == NULL)
 		return -EINVAL;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 65762d57a70d..51be1b756e4e 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1357,10 +1357,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	/* Keeping backward compatible with rate_table based iproute2 tc */
 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
-		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
+		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
+					      NULL));
 
 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
-		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
+		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
+					      NULL));
 
 	if (!cl) {		/* new class */
 		struct Qdisc *new_q;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index db6bd23530d4..1ab53ff80f46 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -327,11 +327,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 	qopt = nla_data(tb[TCA_TBF_PARMS]);
 	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
 		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
-					      tb[TCA_TBF_RTAB]));
+					      tb[TCA_TBF_RTAB],
+					      NULL));
 
 	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
 			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
-						      tb[TCA_TBF_PTAB]));
+						      tb[TCA_TBF_PTAB],
+						      NULL));
 
 	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
 	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
-- 
cgit v1.2.3


From 8d1a77f974ca61d39afa5bf0aeab210525d31475 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:19 -0500
Subject: net: sch: api: add extack support in tcf_block_get

This patch adds extack support for the function tcf_block_get, which is
a commonly used function in the tc subsystem. Callers which are
interested in receiving the error can pass an extack to get more
detailed information about why tcf_block_get failed.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    |  6 ++++--
 net/sched/cls_api.c      | 13 +++++++++----
 net/sched/sch_atm.c      |  6 ++++--
 net/sched/sch_cbq.c      |  4 ++--
 net/sched/sch_drr.c      |  2 +-
 net/sched/sch_dsmark.c   |  2 +-
 net/sched/sch_fq_codel.c |  2 +-
 net/sched/sch_hfsc.c     |  4 ++--
 net/sched/sch_htb.c      |  4 ++--
 net/sched/sch_ingress.c  |  8 +++++---
 net/sched/sch_multiq.c   |  2 +-
 net/sched/sch_prio.c     |  2 +-
 net/sched/sch_qfq.c      |  2 +-
 net/sched/sch_sfb.c      |  2 +-
 net/sched/sch_sfq.c      |  2 +-
 15 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..58bba9c769ea 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -39,9 +39,11 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 				bool create);
 void tcf_chain_put(struct tcf_chain *chain);
 int tcf_block_get(struct tcf_block **p_block,
-		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q);
+		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
+		  struct netlink_ext_ack *extack);
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
-		      struct tcf_block_ext_info *ei);
+		      struct tcf_block_ext_info *ei,
+		      struct netlink_ext_ack *extack);
 void tcf_block_put(struct tcf_block *block);
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 22b977d40e1d..4591b87eaab5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -281,20 +281,24 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 }
 
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
-		      struct tcf_block_ext_info *ei)
+		      struct tcf_block_ext_info *ei,
+		      struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL);
 	struct tcf_chain *chain;
 	int err;
 
-	if (!block)
+	if (!block) {
+		NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
 		return -ENOMEM;
+	}
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
 
 	/* Create chain 0 by default, it has to be always present. */
 	chain = tcf_chain_create(block, 0);
 	if (!chain) {
+		NL_SET_ERR_MSG(extack, "Failed to create new tcf chain");
 		err = -ENOMEM;
 		goto err_chain_create;
 	}
@@ -321,7 +325,8 @@ static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv)
 }
 
 int tcf_block_get(struct tcf_block **p_block,
-		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q)
+		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
+		  struct netlink_ext_ack *extack)
 {
 	struct tcf_block_ext_info ei = {
 		.chain_head_change = tcf_chain_head_change_dflt,
@@ -329,7 +334,7 @@ int tcf_block_get(struct tcf_block **p_block,
 	};
 
 	WARN_ON(!p_filter_chain);
-	return tcf_block_get_ext(p_block, q, &ei);
+	return tcf_block_get_ext(p_block, q, &ei, extack);
 }
 EXPORT_SYMBOL(tcf_block_get);
 
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 8972ab72cda5..493d5c25d83a 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -283,7 +283,8 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 		goto err_out;
 	}
 
-	error = tcf_block_get(&flow->block, &flow->filter_list, sch);
+	error = tcf_block_get(&flow->block, &flow->filter_list, sch,
+			      extack);
 	if (error) {
 		kfree(flow);
 		goto err_out;
@@ -550,7 +551,8 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
 		p->link.q = &noop_qdisc;
 	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
 
-	err = tcf_block_get(&p->link.block, &p->link.filter_list, sch);
+	err = tcf_block_get(&p->link.block, &p->link.filter_list, sch,
+			    extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 79f081eb6bb0..248ea26997b9 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1160,7 +1160,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!q->link.R_tab)
 		return -EINVAL;
 
-	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch);
+	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
 	if (err)
 		goto put_rtab;
 
@@ -1576,7 +1576,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (cl == NULL)
 		goto failure;
 
-	err = tcf_block_get(&cl->block, &cl->filter_list, sch);
+	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
 	if (err) {
 		kfree(cl);
 		return err;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 30e9cba54ddb..9dfff065e27d 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -417,7 +417,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
 	struct drr_sched *q = qdisc_priv(sch);
 	int err;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 	err = qdisc_class_hash_init(&q->clhash);
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 92a36aa4c713..63f523b5e282 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -348,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt)
 		goto errout;
 
-	err = tcf_block_get(&p->block, &p->filter_list, sch);
+	err = tcf_block_get(&p->block, &p->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 06e5360c54d8..22fa13cf5d8b 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -484,7 +484,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
 			return err;
 	}
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 7f6a06ac4b9f..9ae288fcbed8 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1034,7 +1034,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	if (cl == NULL)
 		return -ENOBUFS;
 
-	err = tcf_block_get(&cl->block, &cl->filter_list, sch);
+	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
 	if (err) {
 		kfree(cl);
 		return err;
@@ -1409,7 +1409,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
 		return err;
 	q->eligible = RB_ROOT;
 
-	err = tcf_block_get(&q->root.block, &q->root.filter_list, sch);
+	err = tcf_block_get(&q->root.block, &q->root.filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 51be1b756e4e..54e1f860f1e5 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1032,7 +1032,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt)
 		return -EINVAL;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
@@ -1397,7 +1397,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if (!cl)
 			goto failure;
 
-		err = tcf_block_get(&cl->block, &cl->filter_list, sch);
+		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
 		if (err) {
 			kfree(cl);
 			goto failure;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index b9de7be531dd..7ca2be20dd6f 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -78,7 +78,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
 	q->block_info.chain_head_change = clsact_chain_head_change;
 	q->block_info.chain_head_change_priv = &q->miniqp;
 
-	err = tcf_block_get_ext(&q->block, sch, &q->block_info);
+	err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
 	if (err)
 		return err;
 
@@ -186,7 +186,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 	q->ingress_block_info.chain_head_change = clsact_chain_head_change;
 	q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
 
-	err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info);
+	err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info,
+				extack);
 	if (err)
 		return err;
 
@@ -196,7 +197,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 	q->egress_block_info.chain_head_change = clsact_chain_head_change;
 	q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
 
-	err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info);
+	err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info,
+				extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 177d86de4b32..35cbaf8bd96a 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -248,7 +248,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt)
 		return -EINVAL;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8fbd65661d77..502352762f03 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -215,7 +215,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt)
 		return -EINVAL;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 7ec893f770d2..6ab58509cf49 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1424,7 +1424,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
 	int i, j, err;
 	u32 max_cl_shift, maxbudg_shift, max_classes;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1a33d6c3ac42..a1a11ded8e4f 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -556,7 +556,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt,
 	struct sfb_sched_data *q = qdisc_priv(sch);
 	int err;
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7a217be39f2a..2f2678197760 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -731,7 +731,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->sch = sch;
 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 
-	err = tcf_block_get(&q->block, &q->filter_list, sch);
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From d0bd684dddab51ed017ece0359f26b038ec31940 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:20 -0500
Subject: net: sch: api: add extack support in qdisc_alloc

This patch adds extack support for the function qdisc_alloc, which is
a commonly used function in the tc subsystem. Callers which are
interested in receiving the error can pass an extack to get more
detailed information about why qdisc_alloc failed.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/sch_api.c       | 2 +-
 net/sched/sch_generic.c   | 6 ++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3baadac9e7a5..faf6b2dbc1b2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -471,7 +471,8 @@ void qdisc_destroy(struct Qdisc *qdisc);
 void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
 			       unsigned int len);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
-			  const struct Qdisc_ops *ops);
+			  const struct Qdisc_ops *ops,
+			  struct netlink_ext_ack *extack);
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				const struct Qdisc_ops *ops, u32 parentid);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 79a9fdf9471d..3a3a1da6b071 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1052,7 +1052,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		goto err_out;
 	}
 
-	sch = qdisc_alloc(dev_queue, ops);
+	sch = qdisc_alloc(dev_queue, ops, extack);
 	if (IS_ERR(sch)) {
 		err = PTR_ERR(sch);
 		goto err_out2;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 30bc38c5d7ae..34ef4366f8e0 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -755,7 +755,8 @@ static struct lock_class_key qdisc_tx_busylock;
 static struct lock_class_key qdisc_running_key;
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
-			  const struct Qdisc_ops *ops)
+			  const struct Qdisc_ops *ops,
+			  struct netlink_ext_ack *extack)
 {
 	void *p;
 	struct Qdisc *sch;
@@ -764,6 +765,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	struct net_device *dev;
 
 	if (!dev_queue) {
+		NL_SET_ERR_MSG(extack, "No device queue given");
 		err = -EINVAL;
 		goto errout;
 	}
@@ -835,7 +837,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 	if (!try_module_get(ops->owner))
 		return NULL;
 
-	sch = qdisc_alloc(dev_queue, ops);
+	sch = qdisc_alloc(dev_queue, ops, NULL);
 	if (IS_ERR(sch)) {
 		module_put(ops->owner);
 		return NULL;
-- 
cgit v1.2.3


From a38a98821c939e67e5906bddbed1d15af5ca860d Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:21 -0500
Subject: net: sch: api: add extack support in qdisc_create_dflt

This patch adds extack support for the function qdisc_create_dflt, which
is a commonly used function in the tc subsystem. Callers which are
interested in receiving the error can pass an extack to get more
detailed information about why qdisc_create_dflt failed. The function
qdisc_create_dflt also calls the qdisc's init callback, which can fail
for per-qdisc specific reasons.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h   |  3 ++-
 include/net/sch_generic.h |  3 ++-
 net/sched/sch_atm.c       |  5 +++--
 net/sched/sch_cbq.c       |  9 +++++----
 net/sched/sch_drr.c       |  7 ++++---
 net/sched/sch_dsmark.c    |  5 +++--
 net/sched/sch_fifo.c      |  6 ++++--
 net/sched/sch_generic.c   | 15 +++++++++------
 net/sched/sch_hfsc.c      |  8 ++++----
 net/sched/sch_htb.c       |  9 +++++----
 net/sched/sch_mq.c        |  3 ++-
 net/sched/sch_mqprio.c    |  2 +-
 net/sched/sch_multiq.c    |  2 +-
 net/sched/sch_prio.c      |  3 ++-
 net/sched/sch_qfq.c       |  8 ++++----
 net/sched/sch_red.c       |  3 ++-
 net/sched/sch_sfb.c       |  2 +-
 net/sched/sch_tbf.c       |  3 ++-
 18 files changed, 56 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index a4f21c0b4a43..e2c75f52557b 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -89,7 +89,8 @@ extern struct Qdisc_ops pfifo_head_drop_qdisc_ops;
 
 int fifo_set_limit(struct Qdisc *q, unsigned int limit);
 struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
-			       unsigned int limit);
+			       unsigned int limit,
+			       struct netlink_ext_ack *extack);
 
 int register_qdisc(struct Qdisc_ops *qops);
 int unregister_qdisc(struct Qdisc_ops *qops);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index faf6b2dbc1b2..ac029d5d88e4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -474,7 +474,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops,
 			  struct netlink_ext_ack *extack);
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
-				const struct Qdisc_ops *ops, u32 parentid);
+				const struct Qdisc_ops *ops, u32 parentid,
+				struct netlink_ext_ack *extack);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
 int skb_do_redirect(struct sk_buff *);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 493d5c25d83a..cd49afca9617 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -290,7 +290,8 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 		goto err_out;
 	}
 
-	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
+				    extack);
 	if (!flow->q)
 		flow->q = &noop_qdisc;
 	pr_debug("atm_tc_change: qdisc %p\n", flow->q);
@@ -546,7 +547,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
 	INIT_LIST_HEAD(&p->link.list);
 	list_add(&p->link.list, &p->flows);
 	p->link.q = qdisc_create_dflt(sch->dev_queue,
-				      &pfifo_qdisc_ops, sch->handle);
+				      &pfifo_qdisc_ops, sch->handle, extack);
 	if (!p->link.q)
 		p->link.q = &noop_qdisc;
 	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 248ea26997b9..efe5bf15b031 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1172,7 +1172,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->link.common.classid = sch->handle;
 	q->link.qdisc = sch;
 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-				      sch->handle);
+				      sch->handle, NULL);
 	if (!q->link.q)
 		q->link.q = &noop_qdisc;
 	else
@@ -1376,8 +1376,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	struct cbq_class *cl = (struct cbq_class *)arg;
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(sch->dev_queue,
-					&pfifo_qdisc_ops, cl->common.classid);
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					cl->common.classid, extack);
 		if (new == NULL)
 			return -ENOBUFS;
 	}
@@ -1596,7 +1596,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 
 	cl->R_tab = rtab;
 	rtab = NULL;
-	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
+				  NULL);
 	if (!cl->q)
 		cl->q = &noop_qdisc;
 	else
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 9dfff065e27d..bf638ce57c50 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -114,7 +114,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->common.classid = classid;
 	cl->quantum	   = quantum;
 	cl->qdisc	   = qdisc_create_dflt(sch->dev_queue,
-					       &pfifo_qdisc_ops, classid);
+					       &pfifo_qdisc_ops, classid,
+					       NULL);
 	if (cl->qdisc == NULL)
 		cl->qdisc = &noop_qdisc;
 	else
@@ -209,8 +210,8 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
 	struct drr_class *cl = (struct drr_class *)arg;
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(sch->dev_queue,
-					&pfifo_qdisc_ops, cl->common.classid);
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					cl->common.classid, NULL);
 		if (new == NULL)
 			new = &noop_qdisc;
 	}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 63f523b5e282..049714c57075 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -71,7 +71,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
 
 	if (new == NULL) {
 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					sch->handle);
+					sch->handle, NULL);
 		if (new == NULL)
 			new = &noop_qdisc;
 	}
@@ -381,7 +381,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
 	p->default_index = default_index;
 	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
 
-	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
+	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle,
+				 NULL);
 	if (p->q == NULL)
 		p->q = &noop_qdisc;
 	else
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index c65f23c70f40..24893d3b5d22 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -166,12 +166,14 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit)
 EXPORT_SYMBOL(fifo_set_limit);
 
 struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
-			       unsigned int limit)
+			       unsigned int limit,
+			       struct netlink_ext_ack *extack)
 {
 	struct Qdisc *q;
 	int err = -ENOMEM;
 
-	q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
+	q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1),
+			      extack);
 	if (q) {
 		err = fifo_set_limit(q, limit);
 		if (err < 0) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34ef4366f8e0..10aaa3b615ce 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -830,21 +830,24 @@ errout:
 
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				const struct Qdisc_ops *ops,
-				unsigned int parentid)
+				unsigned int parentid,
+				struct netlink_ext_ack *extack)
 {
 	struct Qdisc *sch;
 
-	if (!try_module_get(ops->owner))
+	if (!try_module_get(ops->owner)) {
+		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
 		return NULL;
+	}
 
-	sch = qdisc_alloc(dev_queue, ops, NULL);
+	sch = qdisc_alloc(dev_queue, ops, extack);
 	if (IS_ERR(sch)) {
 		module_put(ops->owner);
 		return NULL;
 	}
 	sch->parent = parentid;
 
-	if (!ops->init || ops->init(sch, NULL, NULL) == 0)
+	if (!ops->init || ops->init(sch, NULL, extack) == 0)
 		return sch;
 
 	qdisc_destroy(sch);
@@ -956,7 +959,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
 	if (dev->priv_flags & IFF_NO_QUEUE)
 		ops = &noqueue_qdisc_ops;
 
-	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
+	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
 	if (!qdisc) {
 		netdev_info(dev, "activation failed\n");
 		return;
@@ -979,7 +982,7 @@ static void attach_default_qdiscs(struct net_device *dev)
 		dev->qdisc = txq->qdisc_sleeping;
 		qdisc_refcount_inc(dev->qdisc);
 	} else {
-		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
+		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
 		if (qdisc) {
 			dev->qdisc = qdisc;
 			qdisc->ops->attach(qdisc);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 9ae288fcbed8..3ae9877ea205 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1062,8 +1062,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->cl_common.classid = classid;
 	cl->sched     = q;
 	cl->cl_parent = parent;
-	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
-				      &pfifo_qdisc_ops, classid);
+	cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+				      classid, NULL);
 	if (cl->qdisc == NULL)
 		cl->qdisc = &noop_qdisc;
 	else
@@ -1185,7 +1185,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		return -EINVAL;
 	if (new == NULL) {
 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					cl->cl_common.classid);
+					cl->cl_common.classid, NULL);
 		if (new == NULL)
 			new = &noop_qdisc;
 	}
@@ -1416,7 +1416,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
 	q->root.cl_common.classid = sch->handle;
 	q->root.sched   = q;
 	q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					  sch->handle);
+					  sch->handle, NULL);
 	if (q->root.qdisc == NULL)
 		q->root.qdisc = &noop_qdisc;
 	else
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 54e1f860f1e5..1ea9846cc6ce 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1180,7 +1180,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		return -EINVAL;
 	if (new == NULL &&
 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-				     cl->common.classid)) == NULL)
+				     cl->common.classid, extack)) == NULL)
 		return -ENOBUFS;
 
 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
@@ -1290,7 +1290,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 
 	if (!cl->level && htb_parent_last_child(cl)) {
 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					  cl->parent->common.classid);
+					  cl->parent->common.classid,
+					  NULL);
 		last_child = 1;
 	}
 
@@ -1426,8 +1427,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		 * so that can't be used inside of sch_tree_lock
 		 * -- thanks to Karlis Peisenieks
 		 */
-		new_q = qdisc_create_dflt(sch->dev_queue,
-					  &pfifo_qdisc_ops, classid);
+		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					  classid, NULL);
 		sch_tree_lock(sch);
 		if (parent && !parent->level) {
 			unsigned int qlen = parent->un.leaf.q->q.qlen;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 50292e470432..f062a18e9162 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
 		dev_queue = netdev_get_tx_queue(dev, ntx);
 		qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
-						    TC_H_MIN(ntx + 1)));
+						    TC_H_MIN(ntx + 1)),
+					  extack);
 		if (!qdisc)
 			return -ENOMEM;
 		priv->qdiscs[ntx] = qdisc;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 29071cf329f3..0e9d761cdd80 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -230,7 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
 		qdisc = qdisc_create_dflt(dev_queue,
 					  get_default_qdisc_ops(dev, i),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
-						    TC_H_MIN(i + 1)));
+						    TC_H_MIN(i + 1)), extack);
 		if (!qdisc)
 			return -ENOMEM;
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 35cbaf8bd96a..1da7ea8de0ad 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -216,7 +216,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
 			child = qdisc_create_dflt(sch->dev_queue,
 						  &pfifo_qdisc_ops,
 						  TC_H_MAKE(sch->handle,
-							    i + 1));
+							    i + 1), extack);
 			if (child) {
 				sch_tree_lock(sch);
 				old = q->queues[i];
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 502352762f03..fe1510eb111f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -176,7 +176,8 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 	/* Before commit, make sure we can allocate all new qdiscs */
 	for (i = oldbands; i < qopt->bands; i++) {
 		queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					      TC_H_MAKE(sch->handle, i + 1));
+					      TC_H_MAKE(sch->handle, i + 1),
+					      extack);
 		if (!queues[i]) {
 			while (i > oldbands)
 				qdisc_destroy(queues[--i]);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 6ab58509cf49..bb1a9c11fc54 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -480,8 +480,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->common.classid = classid;
 	cl->deficit = lmax;
 
-	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
-				      &pfifo_qdisc_ops, classid);
+	cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+				      classid, NULL);
 	if (cl->qdisc == NULL)
 		cl->qdisc = &noop_qdisc;
 
@@ -601,8 +601,8 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
 	struct qfq_class *cl = (struct qfq_class *)arg;
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(sch->dev_queue,
-					&pfifo_qdisc_ops, cl->common.classid);
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					cl->common.classid, NULL);
 		if (new == NULL)
 			new = &noop_qdisc;
 	}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index ea7d400b9eb2..ec0bd36e09a9 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -225,7 +225,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 
 	if (ctl->limit > 0) {
-		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
+		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit,
+					 extack);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
 	}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index a1a11ded8e4f..7cbdad8419b7 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -513,7 +513,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
 	if (limit == 0)
 		limit = qdisc_dev(sch)->tx_queue_len;
 
-	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack);
 	if (IS_ERR(child))
 		return PTR_ERR(child);
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 1ab53ff80f46..83e76d046993 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -386,7 +386,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 		if (err)
 			goto done;
 	} else if (qopt->limit > 0) {
-		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
+		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
+					 extack);
 		if (IS_ERR(child)) {
 			err = PTR_ERR(child);
 			goto done;
-- 
cgit v1.2.3


From 62a6de62dc57b1b8791db7504846057128e5ebd9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:22 -0500
Subject: net: sch: sch_cbq: add extack support

This patch adds extack support for the cbq qdisc implementation by
adding NL_SET_ERR_MSG calls to the validation of user input.
It also serves to illustrate how the infrastructure ops API changes
are meant to be used by individual qdiscs.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_cbq.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index efe5bf15b031..f42025d53cfe 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1144,15 +1144,19 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	q->delay_timer.function = cbq_undelay;
 
-	if (!opt)
+	if (!opt) {
+		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
 		return -EINVAL;
+	}
 
-	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
+	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
 	if (err < 0)
 		return err;
 
-	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE])
+	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
+		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
 		return -EINVAL;
+	}
 
 	r = nla_data(tb[TCA_CBQ_RATE]);
 
@@ -1462,24 +1466,32 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	struct cbq_class *parent;
 	struct qdisc_rate_table *rtab = NULL;
 
-	if (!opt)
+	if (!opt) {
+		NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
 		return -EINVAL;
+	}
 
-	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
+	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
+	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
+		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
 		return -EOPNOTSUPP;
+	}
 
 	if (cl) {
 		/* Check parent */
 		if (parentid) {
 			if (cl->tparent &&
-			    cl->tparent->common.classid != parentid)
+			    cl->tparent->common.classid != parentid) {
+				NL_SET_ERR_MSG(extack, "Invalid parent id");
 				return -EINVAL;
-			if (!cl->tparent && parentid != TC_H_ROOT)
+			}
+			if (!cl->tparent && parentid != TC_H_ROOT) {
+				NL_SET_ERR_MSG(extack, "Parent must be root");
 				return -EINVAL;
+			}
 		}
 
 		if (tb[TCA_CBQ_RATE]) {
@@ -1496,6 +1508,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err) {
+				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
 				qdisc_put_rtab(rtab);
 				return err;
 			}
@@ -1534,8 +1547,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (parentid == TC_H_ROOT)
 		return -EINVAL;
 
-	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT])
+	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
+		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
 		return -EINVAL;
+	}
 
 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
 			      extack);
@@ -1545,8 +1560,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (classid) {
 		err = -EINVAL;
 		if (TC_H_MAJ(classid ^ sch->handle) ||
-		    cbq_class_lookup(q, classid))
+		    cbq_class_lookup(q, classid)) {
+			NL_SET_ERR_MSG(extack, "Specified class not found");
 			goto failure;
+		}
 	} else {
 		int i;
 		classid = TC_H_MAKE(sch->handle, 0x8000);
@@ -1558,8 +1575,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 				break;
 		}
 		err = -ENOSR;
-		if (i >= 0x8000)
+		if (i >= 0x8000) {
+			NL_SET_ERR_MSG(extack, "Unable to generate classid");
 			goto failure;
+		}
 		classid = classid|q->hgenerator;
 	}
 
@@ -1567,8 +1586,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (parentid) {
 		parent = cbq_class_lookup(q, parentid);
 		err = -EINVAL;
-		if (!parent)
+		if (!parent) {
+			NL_SET_ERR_MSG(extack, "Failed to find parentid");
 			goto failure;
+		}
 	}
 
 	err = -ENOBUFS;
@@ -1588,6 +1609,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 					qdisc_root_sleeping_running(sch),
 					tca[TCA_RATE]);
 		if (err) {
+			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
 			tcf_block_put(cl->block);
 			kfree(cl);
 			goto failure;
-- 
cgit v1.2.3


From 710fb39689d194aa0acf7928f387487c25fb2b8f Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:23 -0500
Subject: net: sch: sch_cbs: add extack support

This patch adds extack support to the cbs qdisc implementation by
adding NL_SET_ERR_MSG calls where user input is validated.
It also serves to illustrate how the infrastructure ops API changes
are meant to be used by individual qdiscs.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_cbs.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 8bf6e163d29c..cdd96b9a27bc 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -219,14 +219,17 @@ static void cbs_disable_offload(struct net_device *dev,
 }
 
 static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
-			      const struct tc_cbs_qopt *opt)
+			      const struct tc_cbs_qopt *opt,
+			      struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct tc_cbs_qopt_offload cbs = { };
 	int err;
 
-	if (!ops->ndo_setup_tc)
+	if (!ops->ndo_setup_tc) {
+		NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload");
 		return -EOPNOTSUPP;
+	}
 
 	cbs.queue = q->queue;
 
@@ -237,8 +240,10 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
 	cbs.sendslope = opt->sendslope;
 
 	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload");
 		return err;
+	}
 
 	q->enqueue = cbs_enqueue_offload;
 	q->dequeue = cbs_dequeue_offload;
@@ -255,12 +260,14 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
 	struct tc_cbs_qopt *qopt;
 	int err;
 
-	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
+	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack);
 	if (err < 0)
 		return err;
 
-	if (!tb[TCA_CBS_PARMS])
+	if (!tb[TCA_CBS_PARMS]) {
+		NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory");
 		return -EINVAL;
+	}
 
 	qopt = nla_data(tb[TCA_CBS_PARMS]);
 
@@ -277,7 +284,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
 
 		cbs_disable_offload(dev, q);
 	} else {
-		err = cbs_enable_offload(dev, q, qopt);
+		err = cbs_enable_offload(dev, q, qopt, extack);
 		if (err < 0)
 			return err;
 	}
@@ -298,8 +305,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
 	struct cbs_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 
-	if (!opt)
+	if (!opt) {
+		NL_SET_ERR_MSG(extack, "Missing CBS qdisc options  which are mandatory");
 		return -EINVAL;
+	}
 
 	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
 
-- 
cgit v1.2.3


From a7c31693e1054fe9af08bc404cdd687c327d0fa6 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:24 -0500
Subject: net: sch: sch_drr: add extack support

This patch adds extack support to the drr qdisc implementation by
adding NL_SET_ERR_MSG calls where user input is validated.
It also serves to illustrate how the infrastructure ops API changes
are meant to be used by individual qdiscs.

Cc: David Ahern <dsahern@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_drr.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index bf638ce57c50..e0b0cf8a9939 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -74,17 +74,21 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	u32 quantum;
 	int err;
 
-	if (!opt)
+	if (!opt) {
+		NL_SET_ERR_MSG(extack, "DRR options are required for this operation");
 		return -EINVAL;
+	}
 
-	err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, NULL);
+	err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack);
 	if (err < 0)
 		return err;
 
 	if (tb[TCA_DRR_QUANTUM]) {
 		quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
-		if (quantum == 0)
+		if (quantum == 0) {
+			NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero");
 			return -EINVAL;
+		}
 	} else
 		quantum = psched_mtu(qdisc_dev(sch));
 
@@ -95,8 +99,10 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 						    NULL,
 						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
-			if (err)
+			if (err) {
+				NL_SET_ERR_MSG(extack, "Failed to replace estimator");
 				return err;
+			}
 		}
 
 		sch_tree_lock(sch);
@@ -127,6 +133,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 					    qdisc_root_sleeping_running(sch),
 					    tca[TCA_RATE]);
 		if (err) {
+			NL_SET_ERR_MSG(extack, "Failed to replace estimator");
 			qdisc_destroy(cl->qdisc);
 			kfree(cl);
 			return err;
@@ -179,8 +186,10 @@ static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl,
 {
 	struct drr_sched *q = qdisc_priv(sch);
 
-	if (cl)
+	if (cl) {
+		NL_SET_ERR_MSG(extack, "DRR classid must be zero");
 		return NULL;
+	}
 
 	return q->block;
 }
-- 
cgit v1.2.3


From 4e58452b6fc391c35611693b12a3f97bd60f6da8 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:38 +0100
Subject: batman-adv: Let packet.h include its headers directly

The headers used by packet.h should also be included by it directly. main.h
currently takes care of these includes in batman-adv, but this will no
longer work once packet.h is moved to include/uapi/linux/.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/batman-adv/main.h   | 2 --
 net/batman-adv/packet.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 5ac86df48c42..d5484ac381d3 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -217,10 +217,8 @@ enum batadv_uev_type {
 
 /* Kernel headers */
 
-#include <linux/bitops.h> /* for packet.h */
 #include <linux/compiler.h>
 #include <linux/etherdevice.h>
-#include <linux/if_ether.h> /* for packet.h */
 #include <linux/if_vlan.h>
 #include <linux/jiffies.h>
 #include <linux/percpu.h>
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index dccbd4a6f019..6b6563867455 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -20,6 +20,8 @@
 #define _NET_BATMAN_ADV_PACKET_H_
 
 #include <asm/byteorder.h>
+#include <linux/bitops.h>
+#include <linux/if_ether.h>
 #include <linux/types.h>
 
 /**
-- 
cgit v1.2.3


From a6cb82b5c21d046a4383010045dc0489bec42884 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:39 +0100
Subject: batman-adv: Remove usage of BIT(x) in packet.h

The BIT(x) macro is not available to uapi headers because it is defined
outside of them (in linux/bitops.h). Its use must therefore be avoided
and replaced by another appropriate representation.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/batman-adv/packet.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6b6563867455..44f20d03205b 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -20,7 +20,6 @@
 #define _NET_BATMAN_ADV_PACKET_H_
 
 #include <asm/byteorder.h>
-#include <linux/bitops.h>
 #include <linux/if_ether.h>
 #include <linux/types.h>
 
@@ -92,9 +91,9 @@ enum batadv_subtype {
  *     one hop neighbor on the interface where it was originally received.
  */
 enum batadv_iv_flags {
-	BATADV_NOT_BEST_NEXT_HOP   = BIT(0),
-	BATADV_PRIMARIES_FIRST_HOP = BIT(1),
-	BATADV_DIRECTLINK          = BIT(2),
+	BATADV_NOT_BEST_NEXT_HOP   = 1UL << 0,
+	BATADV_PRIMARIES_FIRST_HOP = 1UL << 1,
+	BATADV_DIRECTLINK          = 1UL << 2,
 };
 
 /**
@@ -123,9 +122,9 @@ enum batadv_icmp_packettype {
  * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
  */
 enum batadv_mcast_flags {
-	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= BIT(0),
-	BATADV_MCAST_WANT_ALL_IPV4		= BIT(1),
-	BATADV_MCAST_WANT_ALL_IPV6		= BIT(2),
+	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
+	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
+	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
 };
 
 /* tt data subtypes */
@@ -139,10 +138,10 @@ enum batadv_mcast_flags {
  * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
  */
 enum batadv_tt_data_flags {
-	BATADV_TT_OGM_DIFF   = BIT(0),
-	BATADV_TT_REQUEST    = BIT(1),
-	BATADV_TT_RESPONSE   = BIT(2),
-	BATADV_TT_FULL_TABLE = BIT(4),
+	BATADV_TT_OGM_DIFF   = 1UL << 0,
+	BATADV_TT_REQUEST    = 1UL << 1,
+	BATADV_TT_RESPONSE   = 1UL << 2,
+	BATADV_TT_FULL_TABLE = 1UL << 4,
 };
 
 /**
@@ -150,7 +149,7 @@ enum batadv_tt_data_flags {
  * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
  */
 enum batadv_vlan_flags {
-	BATADV_VLAN_HAS_TAG	= BIT(15),
+	BATADV_VLAN_HAS_TAG	= 1UL << 15,
 };
 
 /**
-- 
cgit v1.2.3


From adbf9b7324e7cb734e876cd311392c0880e890f7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:40 +0100
Subject: batman-adv: Remove kernel fixed width types in packet.h

The uapi headers use the __u8/__u16/... versions of the fixed width types
instead of u8/u16/... Uses of the latter must therefore be replaced before
packet.h is copied to include/uapi/linux/.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/batman-adv/packet.h | 214 ++++++++++++++++++++++++------------------------
 1 file changed, 107 insertions(+), 107 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 44f20d03205b..3b2d2db993aa 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -29,7 +29,7 @@
  *
  * Return: 0 when not error was detected, != 0 otherwise
  */
-#define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0)
+#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0)
 
 /**
  * enum batadv_packettype - types for batman-adv encapsulated packets
@@ -191,8 +191,8 @@ enum batadv_tvlv_type {
  * transport the claim type and the group id
  */
 struct batadv_bla_claim_dst {
-	u8     magic[3];	/* FF:43:05 */
-	u8     type;		/* bla_claimframe */
+	__u8   magic[3];	/* FF:43:05 */
+	__u8   type;		/* bla_claimframe */
 	__be16 group;		/* group id */
 };
 
@@ -212,15 +212,15 @@ struct batadv_bla_claim_dst {
  * @tvlv_len: length of tvlv data following the ogm header
  */
 struct batadv_ogm_packet {
-	u8     packet_type;
-	u8     version;
-	u8     ttl;
-	u8     flags;
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
 	__be32 seqno;
-	u8     orig[ETH_ALEN];
-	u8     prev_sender[ETH_ALEN];
-	u8     reserved;
-	u8     tq;
+	__u8   orig[ETH_ALEN];
+	__u8   prev_sender[ETH_ALEN];
+	__u8   reserved;
+	__u8   tq;
 	__be16 tvlv_len;
 	/* __packed is not needed as the struct size is divisible by 4,
 	 * and the largest data type in this struct has a size of 4.
@@ -241,12 +241,12 @@ struct batadv_ogm_packet {
  * @throughput: the currently flooded path throughput
  */
 struct batadv_ogm2_packet {
-	u8     packet_type;
-	u8     version;
-	u8     ttl;
-	u8     flags;
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
 	__be32 seqno;
-	u8     orig[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
 	__be16 tvlv_len;
 	__be32 throughput;
 	/* __packed is not needed as the struct size is divisible by 4,
@@ -265,9 +265,9 @@ struct batadv_ogm2_packet {
  * @elp_interval: currently used ELP sending interval in ms
  */
 struct batadv_elp_packet {
-	u8     packet_type;
-	u8     version;
-	u8     orig[ETH_ALEN];
+	__u8   packet_type;
+	__u8   version;
+	__u8   orig[ETH_ALEN];
 	__be32 seqno;
 	__be32 elp_interval;
 };
@@ -290,14 +290,14 @@ struct batadv_elp_packet {
  * members are padded the same way as they are in real packets.
  */
 struct batadv_icmp_header {
-	u8 packet_type;
-	u8 version;
-	u8 ttl;
-	u8 msg_type; /* see ICMP message types above */
-	u8 dst[ETH_ALEN];
-	u8 orig[ETH_ALEN];
-	u8 uid;
-	u8 align[3];
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 msg_type; /* see ICMP message types above */
+	__u8 dst[ETH_ALEN];
+	__u8 orig[ETH_ALEN];
+	__u8 uid;
+	__u8 align[3];
 };
 
 /**
@@ -313,14 +313,14 @@ struct batadv_icmp_header {
  * @seqno: ICMP sequence number
  */
 struct batadv_icmp_packet {
-	u8     packet_type;
-	u8     version;
-	u8     ttl;
-	u8     msg_type; /* see ICMP message types above */
-	u8     dst[ETH_ALEN];
-	u8     orig[ETH_ALEN];
-	u8     uid;
-	u8     reserved;
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   reserved;
 	__be16 seqno;
 };
 
@@ -342,15 +342,15 @@ struct batadv_icmp_packet {
  *  store it using network order
  */
 struct batadv_icmp_tp_packet {
-	u8  packet_type;
-	u8  version;
-	u8  ttl;
-	u8  msg_type; /* see ICMP message types above */
-	u8  dst[ETH_ALEN];
-	u8  orig[ETH_ALEN];
-	u8  uid;
-	u8  subtype;
-	u8  session[2];
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   subtype;
+	__u8   session[2];
 	__be32 seqno;
 	__be32 timestamp;
 };
@@ -381,16 +381,16 @@ enum batadv_icmp_tp_subtype {
  * @rr: route record array
  */
 struct batadv_icmp_packet_rr {
-	u8     packet_type;
-	u8     version;
-	u8     ttl;
-	u8     msg_type; /* see ICMP message types above */
-	u8     dst[ETH_ALEN];
-	u8     orig[ETH_ALEN];
-	u8     uid;
-	u8     rr_cur;
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   rr_cur;
 	__be16 seqno;
-	u8     rr[BATADV_RR_LEN][ETH_ALEN];
+	__u8   rr[BATADV_RR_LEN][ETH_ALEN];
 };
 
 #define BATADV_ICMP_MAX_PACKET_SIZE	sizeof(struct batadv_icmp_packet_rr)
@@ -416,11 +416,11 @@ struct batadv_icmp_packet_rr {
  * @dest: originator destination of the unicast packet
  */
 struct batadv_unicast_packet {
-	u8 packet_type;
-	u8 version;
-	u8 ttl;
-	u8 ttvn; /* destination translation table version number */
-	u8 dest[ETH_ALEN];
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 ttvn; /* destination translation table version number */
+	__u8 dest[ETH_ALEN];
 	/* "4 bytes boundary + 2 bytes" long to make the payload after the
 	 * following ethernet header again 4 bytes boundary aligned
 	 */
@@ -435,9 +435,9 @@ struct batadv_unicast_packet {
  */
 struct batadv_unicast_4addr_packet {
 	struct batadv_unicast_packet u;
-	u8 src[ETH_ALEN];
-	u8 subtype;
-	u8 reserved;
+	__u8 src[ETH_ALEN];
+	__u8 subtype;
+	__u8 reserved;
 	/* "4 bytes boundary + 2 bytes" long to make the payload after the
 	 * following ethernet header again 4 bytes boundary aligned
 	 */
@@ -457,22 +457,22 @@ struct batadv_unicast_4addr_packet {
  * @total_size: size of the merged packet
  */
 struct batadv_frag_packet {
-	u8     packet_type;
-	u8     version;  /* batman version field */
-	u8     ttl;
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
 #if defined(__BIG_ENDIAN_BITFIELD)
-	u8     no:4;
-	u8     priority:3;
-	u8     reserved:1;
+	__u8   no:4;
+	__u8   priority:3;
+	__u8   reserved:1;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
-	u8     reserved:1;
-	u8     priority:3;
-	u8     no:4;
+	__u8   reserved:1;
+	__u8   priority:3;
+	__u8   no:4;
 #else
 #error "unknown bitfield endianness"
 #endif
-	u8     dest[ETH_ALEN];
-	u8     orig[ETH_ALEN];
+	__u8   dest[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
 	__be16 seqno;
 	__be16 total_size;
 };
@@ -487,12 +487,12 @@ struct batadv_frag_packet {
  * @orig: originator of the broadcast packet
  */
 struct batadv_bcast_packet {
-	u8     packet_type;
-	u8     version;  /* batman version field */
-	u8     ttl;
-	u8     reserved;
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
 	__be32 seqno;
-	u8     orig[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
 	/* "4 bytes boundary + 2 bytes" long to make the payload after the
 	 * following ethernet header again 4 bytes boundary aligned
 	 */
@@ -516,19 +516,19 @@ struct batadv_bcast_packet {
  * @coded_len: length of network coded part of the payload
  */
 struct batadv_coded_packet {
-	u8     packet_type;
-	u8     version;  /* batman version field */
-	u8     ttl;
-	u8     first_ttvn;
-	/* u8  first_dest[ETH_ALEN]; - saved in mac header destination */
-	u8     first_source[ETH_ALEN];
-	u8     first_orig_dest[ETH_ALEN];
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   first_ttvn;
+	/* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */
+	__u8   first_source[ETH_ALEN];
+	__u8   first_orig_dest[ETH_ALEN];
 	__be32 first_crc;
-	u8     second_ttl;
-	u8     second_ttvn;
-	u8     second_dest[ETH_ALEN];
-	u8     second_source[ETH_ALEN];
-	u8     second_orig_dest[ETH_ALEN];
+	__u8   second_ttl;
+	__u8   second_ttvn;
+	__u8   second_dest[ETH_ALEN];
+	__u8   second_source[ETH_ALEN];
+	__u8   second_orig_dest[ETH_ALEN];
 	__be32 second_crc;
 	__be16 coded_len;
 };
@@ -547,14 +547,14 @@ struct batadv_coded_packet {
  * @align: 2 bytes to align the header to a 4 byte boundary
  */
 struct batadv_unicast_tvlv_packet {
-	u8     packet_type;
-	u8     version;  /* batman version field */
-	u8     ttl;
-	u8     reserved;
-	u8     dst[ETH_ALEN];
-	u8     src[ETH_ALEN];
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
+	__u8   dst[ETH_ALEN];
+	__u8   src[ETH_ALEN];
 	__be16 tvlv_len;
-	u16    align;
+	__u16  align;
 };
 
 /**
@@ -564,8 +564,8 @@ struct batadv_unicast_tvlv_packet {
  * @len: tvlv container length
  */
 struct batadv_tvlv_hdr {
-	u8     type;
-	u8     version;
+	__u8   type;
+	__u8   version;
 	__be16 len;
 };
 
@@ -588,8 +588,8 @@ struct batadv_tvlv_gateway_data {
  *  one batadv_tvlv_tt_vlan_data object per announced vlan
  */
 struct batadv_tvlv_tt_data {
-	u8     flags;
-	u8     ttvn;
+	__u8   flags;
+	__u8   ttvn;
 	__be16 num_vlan;
 };
 
@@ -603,7 +603,7 @@ struct batadv_tvlv_tt_data {
 struct batadv_tvlv_tt_vlan_data {
 	__be32 crc;
 	__be16 vid;
-	u16    reserved;
+	__u16  reserved;
 };
 
 /**
@@ -615,9 +615,9 @@ struct batadv_tvlv_tt_vlan_data {
  * @vid: VLAN identifier
  */
 struct batadv_tvlv_tt_change {
-	u8     flags;
-	u8     reserved[3];
-	u8     addr[ETH_ALEN];
+	__u8   flags;
+	__u8   reserved[3];
+	__u8   addr[ETH_ALEN];
 	__be16 vid;
 };
 
@@ -627,7 +627,7 @@ struct batadv_tvlv_tt_change {
  * @vid: VLAN identifier
  */
 struct batadv_tvlv_roam_adv {
-	u8     client[ETH_ALEN];
+	__u8   client[ETH_ALEN];
 	__be16 vid;
 };
 
@@ -637,8 +637,8 @@ struct batadv_tvlv_roam_adv {
  * @reserved: reserved field
  */
 struct batadv_tvlv_mcast_data {
-	u8 flags;
-	u8 reserved[3];
+	__u8 flags;
+	__u8 reserved[3];
 };
 
 #endif /* _NET_BATMAN_ADV_PACKET_H_ */
-- 
cgit v1.2.3


From fec149f5d3234c037ec761d1db4cc8c0550e9964 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:41 +0100
Subject: batman-adv: Convert packet.h to uapi header

The header file is used by different userspace programs to inject packets
or to decode sniffed packets. It should therefore be available to them as
a userspace header.

Also other components in the kernel (like the flow dissector) require
access to the packet definitions to be able to decode ETH_P_BATMAN ethernet
packets.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                            |   1 +
 include/uapi/linux/batadv_packet.h     | 644 +++++++++++++++++++++++++++++++++
 net/batman-adv/bat_iv_ogm.c            |   2 +-
 net/batman-adv/bat_v.c                 |   2 +-
 net/batman-adv/bat_v_elp.c             |   2 +-
 net/batman-adv/bat_v_ogm.c             |   2 +-
 net/batman-adv/bridge_loop_avoidance.c |   2 +-
 net/batman-adv/distributed-arp-table.h |   2 +-
 net/batman-adv/fragmentation.c         |   2 +-
 net/batman-adv/gateway_client.c        |   2 +-
 net/batman-adv/gateway_common.c        |   2 +-
 net/batman-adv/hard-interface.c        |   2 +-
 net/batman-adv/icmp_socket.c           |   2 +-
 net/batman-adv/main.c                  |   2 +-
 net/batman-adv/main.h                  |   2 +-
 net/batman-adv/multicast.c             |   2 +-
 net/batman-adv/netlink.c               |   2 +-
 net/batman-adv/network-coding.c        |   2 +-
 net/batman-adv/packet.h                | 644 ---------------------------------
 net/batman-adv/routing.c               |   2 +-
 net/batman-adv/send.h                  |   3 +-
 net/batman-adv/soft-interface.c        |   2 +-
 net/batman-adv/sysfs.c                 |   2 +-
 net/batman-adv/tp_meter.c              |   2 +-
 net/batman-adv/translation-table.c     |   2 +-
 net/batman-adv/tvlv.c                  |   2 +-
 net/batman-adv/types.h                 |   3 +-
 27 files changed, 669 insertions(+), 670 deletions(-)
 create mode 100644 include/uapi/linux/batadv_packet.h
 delete mode 100644 net/batman-adv/packet.h

(limited to 'net')

diff --git a/MAINTAINERS b/MAINTAINERS
index 129c591e0f34..753799d24cd9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2564,6 +2564,7 @@ S:	Maintained
 F:	Documentation/ABI/testing/sysfs-class-net-batman-adv
 F:	Documentation/ABI/testing/sysfs-class-net-mesh
 F:	Documentation/networking/batman-adv.rst
+F:	include/uapi/linux/batadv_packet.h
 F:	include/uapi/linux/batman_adv.h
 F:	net/batman-adv/
 
diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
new file mode 100644
index 000000000000..5cb360be2a11
--- /dev/null
+++ b/include/uapi/linux/batadv_packet.h
@@ -0,0 +1,644 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _UAPI_LINUX_BATADV_PACKET_H_
+#define _UAPI_LINUX_BATADV_PACKET_H_
+
+#include <asm/byteorder.h>
+#include <linux/if_ether.h>
+#include <linux/types.h>
+
+/**
+ * batadv_tp_is_error() - Check throughput meter return code for error
+ * @n: throughput meter return code
+ *
+ * Return: 0 when no error was detected, != 0 otherwise
+ */
+#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0)
+
+/**
+ * enum batadv_packettype - types for batman-adv encapsulated packets
+ * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
+ * @BATADV_BCAST: broadcast packets carrying broadcast payload
+ * @BATADV_CODED: network coded packets
+ * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
+ * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
+ *
+ * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
+ * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
+ *     payload packet
+ * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of
+ *     the sender
+ * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute
+ * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers
+ */
+enum batadv_packettype {
+	/* 0x00 - 0x3f: local packets or special rules for handling */
+	BATADV_IV_OGM           = 0x00,
+	BATADV_BCAST            = 0x01,
+	BATADV_CODED            = 0x02,
+	BATADV_ELP		= 0x03,
+	BATADV_OGM2		= 0x04,
+	/* 0x40 - 0x7f: unicast */
+#define BATADV_UNICAST_MIN     0x40
+	BATADV_UNICAST          = 0x40,
+	BATADV_UNICAST_FRAG     = 0x41,
+	BATADV_UNICAST_4ADDR    = 0x42,
+	BATADV_ICMP             = 0x43,
+	BATADV_UNICAST_TVLV     = 0x44,
+#define BATADV_UNICAST_MAX     0x7f
+	/* 0x80 - 0xff: reserved */
+};
+
+/**
+ * enum batadv_subtype - packet subtype for unicast4addr
+ * @BATADV_P_DATA: user payload
+ * @BATADV_P_DAT_DHT_GET: DHT request message
+ * @BATADV_P_DAT_DHT_PUT: DHT store message
+ * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT
+ */
+enum batadv_subtype {
+	BATADV_P_DATA			= 0x01,
+	BATADV_P_DAT_DHT_GET		= 0x02,
+	BATADV_P_DAT_DHT_PUT		= 0x03,
+	BATADV_P_DAT_CACHE_REPLY	= 0x04,
+};
+
+/* this file is included by batctl which needs these defines */
+#define BATADV_COMPAT_VERSION 15
+
+/**
+ * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
+ * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
+ *     previously received from someone other than the best neighbor.
+ * @BATADV_PRIMARIES_FIRST_HOP: flag unused.
+ * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
+ *     one hop neighbor on the interface where it was originally received.
+ */
+enum batadv_iv_flags {
+	BATADV_NOT_BEST_NEXT_HOP   = 1UL << 0,
+	BATADV_PRIMARIES_FIRST_HOP = 1UL << 1,
+	BATADV_DIRECTLINK          = 1UL << 2,
+};
+
+/**
+ * enum batadv_icmp_packettype - ICMP message types
+ * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST
+ * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found
+ * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination
+ * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops
+ * @BATADV_PARAMETER_PROBLEM: return code for malformed messages
+ * @BATADV_TP: throughput meter packet
+ */
+enum batadv_icmp_packettype {
+	BATADV_ECHO_REPLY	       = 0,
+	BATADV_DESTINATION_UNREACHABLE = 3,
+	BATADV_ECHO_REQUEST	       = 8,
+	BATADV_TTL_EXCEEDED	       = 11,
+	BATADV_PARAMETER_PROBLEM       = 12,
+	BATADV_TP		       = 15,
+};
+
+/**
+ * enum batadv_mcast_flags - flags for multicast capabilities and settings
+ * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
+ *  224.0.0.0/24 or ff02::1
+ * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
+ * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
+ */
+enum batadv_mcast_flags {
+	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
+	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
+	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
+};
+
+/* tt data subtypes */
+#define BATADV_TT_DATA_TYPE_MASK 0x0F
+
+/**
+ * enum batadv_tt_data_flags - flags for tt data tvlv
+ * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM
+ * @BATADV_TT_REQUEST: TT request message
+ * @BATADV_TT_RESPONSE: TT response message
+ * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
+ */
+enum batadv_tt_data_flags {
+	BATADV_TT_OGM_DIFF   = 1UL << 0,
+	BATADV_TT_REQUEST    = 1UL << 1,
+	BATADV_TT_RESPONSE   = 1UL << 2,
+	BATADV_TT_FULL_TABLE = 1UL << 4,
+};
+
+/**
+ * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
+ * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
+ */
+enum batadv_vlan_flags {
+	BATADV_VLAN_HAS_TAG	= 1UL << 15,
+};
+
+/**
+ * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance
+ * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address
+ * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address
+ * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc
+ * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table
+ * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet
+ */
+enum batadv_bla_claimframe {
+	BATADV_CLAIM_TYPE_CLAIM		= 0x00,
+	BATADV_CLAIM_TYPE_UNCLAIM	= 0x01,
+	BATADV_CLAIM_TYPE_ANNOUNCE	= 0x02,
+	BATADV_CLAIM_TYPE_REQUEST	= 0x03,
+	BATADV_CLAIM_TYPE_LOOPDETECT	= 0x04,
+};
+
+/**
+ * enum batadv_tvlv_type - tvlv type definitions
+ * @BATADV_TVLV_GW: gateway tvlv
+ * @BATADV_TVLV_DAT: distributed arp table tvlv
+ * @BATADV_TVLV_NC: network coding tvlv
+ * @BATADV_TVLV_TT: translation table tvlv
+ * @BATADV_TVLV_ROAM: roaming advertisement tvlv
+ * @BATADV_TVLV_MCAST: multicast capability tvlv
+ */
+enum batadv_tvlv_type {
+	BATADV_TVLV_GW		= 0x01,
+	BATADV_TVLV_DAT		= 0x02,
+	BATADV_TVLV_NC		= 0x03,
+	BATADV_TVLV_TT		= 0x04,
+	BATADV_TVLV_ROAM	= 0x05,
+	BATADV_TVLV_MCAST	= 0x06,
+};
+
+#pragma pack(2)
+/* the destination hardware field in the ARP frame is used to
+ * transport the claim type and the group id
+ */
+struct batadv_bla_claim_dst {
+	__u8   magic[3];	/* FF:43:05 */
+	__u8   type;		/* bla_claimframe */
+	__be16 group;		/* group id */
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_ogm_packet - ogm (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: contains routing relevant flags - see enum batadv_iv_flags
+ * @seqno: sequence identification
+ * @orig: address of the source node
+ * @prev_sender: address of the previous sender
+ * @reserved: reserved byte for alignment
+ * @tq: transmission quality
+ * @tvlv_len: length of tvlv data following the ogm header
+ */
+struct batadv_ogm_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	__u8   prev_sender[ETH_ALEN];
+	__u8   reserved;
+	__u8   tq;
+	__be16 tvlv_len;
+	/* __packed is not needed as the struct size is divisible by 4,
+	 * and the largest data type in this struct has a size of 4.
+	 */
+};
+
+#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
+
+/**
+ * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: reserved for routing relevant flags - currently always 0
+ * @seqno: sequence number
+ * @orig: originator mac address
+ * @tvlv_len: length of the appended tvlv buffer (in bytes)
+ * @throughput: the currently flooded path throughput
+ */
+struct batadv_ogm2_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	__be16 tvlv_len;
+	__be32 throughput;
+	/* __packed is not needed as the struct size is divisible by 4,
+	 * and the largest data type in this struct has a size of 4.
+	 */
+};
+
+#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
+
+/**
+ * struct batadv_elp_packet - elp (neighbor discovery) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @orig: originator mac address
+ * @seqno: sequence number
+ * @elp_interval: currently used ELP sending interval in ms
+ */
+struct batadv_elp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   orig[ETH_ALEN];
+	__be32 seqno;
+	__be32 elp_interval;
+};
+
+#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
+
+/**
+ * struct batadv_icmp_header - common members among all the ICMP packets
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @align: not used - useful for alignment purposes only
+ *
+ * This structure is used for ICMP packets parsing only and it is never sent
+ * over the wire. The alignment field at the end is there to ensure that
+ * members are padded the same way as they are in real packets.
+ */
+struct batadv_icmp_header {
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 msg_type; /* see ICMP message types above */
+	__u8 dst[ETH_ALEN];
+	__u8 orig[ETH_ALEN];
+	__u8 uid;
+	__u8 align[3];
+};
+
+/**
+ * struct batadv_icmp_packet - ICMP packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @reserved: not used - useful for alignment
+ * @seqno: ICMP sequence number
+ */
+struct batadv_icmp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   reserved;
+	__be16 seqno;
+};
+
+/**
+ * struct batadv_icmp_tp_packet - ICMP TP Meter packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @subtype: TP packet subtype (see batadv_icmp_tp_subtype)
+ * @session: TP session identifier
+ * @seqno: the TP sequence number
+ * @timestamp: time when the packet has been sent. This value is filled in a
+ *  TP_MSG and echoed back in the next TP_ACK so that the sender can compute the
+ *  RTT. Since it is read only by the host which wrote it, there is no need to
+ *  store it using network order
+ */
+struct batadv_icmp_tp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   subtype;
+	__u8   session[2];
+	__be32 seqno;
+	__be32 timestamp;
+};
+
+/**
+ * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes
+ * @BATADV_TP_MSG: Msg from sender to receiver
+ * @BATADV_TP_ACK: acknowledgment from receiver to sender
+ */
+enum batadv_icmp_tp_subtype {
+	BATADV_TP_MSG	= 0,
+	BATADV_TP_ACK,
+};
+
+#define BATADV_RR_LEN 16
+
+/**
+ * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @rr_cur: number of entries in the rr array
+ * @seqno: ICMP sequence number
+ * @rr: route record array
+ */
+struct batadv_icmp_packet_rr {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   rr_cur;
+	__be16 seqno;
+	__u8   rr[BATADV_RR_LEN][ETH_ALEN];
+};
+
+#define BATADV_ICMP_MAX_PACKET_SIZE	sizeof(struct batadv_icmp_packet_rr)
+
+/* All packet headers in front of an ethernet header have to be completely
+ * divisible by 2 but not by 4 to make the payload after the ethernet
+ * header again 4 bytes boundary aligned.
+ *
+ * A packing of 2 is necessary to avoid extra padding at the end of the struct
+ * caused by a structure member which is larger than two bytes. Otherwise
+ * the structure would not fulfill the previously mentioned rule to avoid the
+ * misalignment of the payload after the ethernet header. It may also lead to
+ * leakage of information when the padding is not initialized before sending.
+ */
+#pragma pack(2)
+
+/**
+ * struct batadv_unicast_packet - unicast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @ttvn: translation table version number
+ * @dest: originator destination of the unicast packet
+ */
+struct batadv_unicast_packet {
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 ttvn; /* destination translation table version number */
+	__u8 dest[ETH_ALEN];
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_unicast_4addr_packet - extended unicast packet
+ * @u: common unicast packet header
+ * @src: address of the source
+ * @subtype: packet subtype
+ * @reserved: reserved byte for alignment
+ */
+struct batadv_unicast_4addr_packet {
+	struct batadv_unicast_packet u;
+	__u8 src[ETH_ALEN];
+	__u8 subtype;
+	__u8 reserved;
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_frag_packet - fragmented packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @dest: final destination used when routing fragments
+ * @orig: originator of the fragment used when merging the packet
+ * @no: fragment number within this sequence
+ * @priority: priority of frame, from ToS IP precedence or 802.1p
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @total_size: size of the merged packet
+ */
+struct batadv_frag_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	__u8   no:4;
+	__u8   priority:3;
+	__u8   reserved:1;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8   reserved:1;
+	__u8   priority:3;
+	__u8   no:4;
+#else
+#error "unknown bitfield endianness"
+#endif
+	__u8   dest[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__be16 seqno;
+	__be16 total_size;
+};
+
+/**
+ * struct batadv_bcast_packet - broadcast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @orig: originator of the broadcast packet
+ */
+struct batadv_bcast_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_coded_packet - network coded packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @first_source: original source of first included packet
+ * @first_orig_dest: original destination of first included packet
+ * @first_crc: checksum of first included packet
+ * @first_ttvn: tt-version number of first included packet
+ * @second_ttl: ttl of second packet
+ * @second_dest: second receiver of this coded packet
+ * @second_source: original source of second included packet
+ * @second_orig_dest: original destination of second included packet
+ * @second_crc: checksum of second included packet
+ * @second_ttvn: tt version number of second included packet
+ * @coded_len: length of network coded part of the payload
+ */
+struct batadv_coded_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   first_ttvn;
+	/* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */
+	__u8   first_source[ETH_ALEN];
+	__u8   first_orig_dest[ETH_ALEN];
+	__be32 first_crc;
+	__u8   second_ttl;
+	__u8   second_ttvn;
+	__u8   second_dest[ETH_ALEN];
+	__u8   second_source[ETH_ALEN];
+	__u8   second_orig_dest[ETH_ALEN];
+	__be32 second_crc;
+	__be16 coded_len;
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @reserved: reserved field (for packet alignment)
+ * @src: address of the source
+ * @dst: address of the destination
+ * @tvlv_len: length of tvlv data following the unicast tvlv header
+ * @align: 2 bytes to align the header to a 4 byte boundary
+ */
+struct batadv_unicast_tvlv_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
+	__u8   dst[ETH_ALEN];
+	__u8   src[ETH_ALEN];
+	__be16 tvlv_len;
+	__u16  align;
+};
+
+/**
+ * struct batadv_tvlv_hdr - base tvlv header struct
+ * @type: tvlv container type (see batadv_tvlv_type)
+ * @version: tvlv container version
+ * @len: tvlv container length
+ */
+struct batadv_tvlv_hdr {
+	__u8   type;
+	__u8   version;
+	__be16 len;
+};
+
+/**
+ * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv
+ *  container
+ * @bandwidth_down: advertised uplink download bandwidth
+ * @bandwidth_up: advertised uplink upload bandwidth
+ */
+struct batadv_tvlv_gateway_data {
+	__be32 bandwidth_down;
+	__be32 bandwidth_up;
+};
+
+/**
+ * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
+ * @flags: translation table flags (see batadv_tt_data_flags)
+ * @ttvn: translation table version number
+ * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
+ *  one batadv_tvlv_tt_vlan_data object per announced vlan
+ */
+struct batadv_tvlv_tt_data {
+	__u8   flags;
+	__u8   ttvn;
+	__be16 num_vlan;
+};
+
+/**
+ * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
+ *  the tt tvlv container
+ * @crc: crc32 checksum of the entries belonging to this vlan
+ * @vid: vlan identifier
+ * @reserved: unused, useful for alignment purposes
+ */
+struct batadv_tvlv_tt_vlan_data {
+	__be32 crc;
+	__be16 vid;
+	__u16  reserved;
+};
+
+/**
+ * struct batadv_tvlv_tt_change - translation table diff data
+ * @flags: status indicators concerning the non-mesh client (see
+ *  batadv_tt_client_flags)
+ * @reserved: reserved field - useful for alignment purposes only
+ * @addr: mac address of non-mesh client that triggered this tt change
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_tt_change {
+	__u8   flags;
+	__u8   reserved[3];
+	__u8   addr[ETH_ALEN];
+	__be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_roam_adv - roaming advertisement
+ * @client: mac address of roaming client
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_roam_adv {
+	__u8   client[ETH_ALEN];
+	__be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_mcast_data - payload of a multicast tvlv
+ * @flags: multicast flags announced by the orig node
+ * @reserved: reserved field
+ */
+struct batadv_tvlv_mcast_data {
+	__u8 flags;
+	__u8 reserved[3];
+};
+
+#endif /* _UAPI_LINUX_BATADV_PACKET_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 84c36430c25a..79e326383726 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -52,6 +52,7 @@
 #include <linux/workqueue.h>
 #include <net/genetlink.h>
 #include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -63,7 +64,6 @@
 #include "netlink.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 78ddf3afa83a..27e165ac9302 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -37,6 +37,7 @@
 #include <linux/workqueue.h>
 #include <net/genetlink.h>
 #include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -49,7 +50,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 
 struct sk_buff;
 
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 59ae96cef596..a83478c46597 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -42,13 +42,13 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <net/cfg80211.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "bat_v_ogm.h"
 #include "hard-interface.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e415974c540d..ba59b77c605d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -39,13 +39,13 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e647450e5d0f..fad47853ad3c 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -50,6 +50,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
@@ -57,7 +58,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "sysfs.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 3d47bedaf661..12897eb46268 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -24,9 +24,9 @@
 #include <linux/compiler.h>
 #include <linux/netdevice.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "originator.h"
-#include "packet.h"
 
 struct seq_file;
 struct sk_buff;
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 4979350af9a7..22dde42fd80e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -33,10 +33,10 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "soft-interface.h"
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 2488e25d0eef..37fe9a644f22 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -43,6 +43,7 @@
 #include <linux/stddef.h>
 #include <linux/udp.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "gateway_common.h"
@@ -50,7 +51,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "soft-interface.h"
 #include "sysfs.h"
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 83bfeecf661c..b3e156af2256 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -27,10 +27,10 @@
 #include <linux/netdevice.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "gateway_client.h"
 #include "log.h"
-#include "packet.h"
 #include "tvlv.h"
 
 /**
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 13d04dba0b3a..5f186bff284a 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -38,6 +38,7 @@
 #include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_v.h"
 #include "bridge_loop_avoidance.h"
@@ -46,7 +47,6 @@
 #include "gateway_client.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "soft-interface.h"
 #include "sysfs.h"
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index f2ef75b7fa73..8041cf106c37 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -44,11 +44,11 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/wait.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 
 static struct batadv_socket_client *batadv_socket_client_hash[256];
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 8bee4279d579..d31c8266e244 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -46,6 +46,7 @@
 #include <linux/workqueue.h>
 #include <net/dsfield.h>
 #include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -63,7 +64,6 @@
 #include "netlink.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "soft-interface.h"
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d5484ac381d3..f7ba3f96d8f3 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -223,8 +223,8 @@ enum batadv_uev_type {
 #include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
-#include "packet.h"
 #include "types.h"
 
 struct net_device;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 8a503c526b90..cbdeb47ec3f6 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -55,11 +55,11 @@
 #include <net/if_inet6.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
-#include "packet.h"
 #include "translation-table.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 103d4bdcdbdb..a823d3899bad 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -40,6 +40,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -47,7 +48,6 @@
 #include "gateway_client.h"
 #include "hard-interface.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3758be7fd881..b48116bb24ef 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -49,12 +49,12 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "tvlv.h"
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
deleted file mode 100644
index 3b2d2db993aa..000000000000
--- a/net/batman-adv/packet.h
+++ /dev/null
@@ -1,644 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
- *
- * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_PACKET_H_
-#define _NET_BATMAN_ADV_PACKET_H_
-
-#include <asm/byteorder.h>
-#include <linux/if_ether.h>
-#include <linux/types.h>
-
-/**
- * batadv_tp_is_error() - Check throughput meter return code for error
- * @n: throughput meter return code
- *
- * Return: 0 when not error was detected, != 0 otherwise
- */
-#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0)
-
-/**
- * enum batadv_packettype - types for batman-adv encapsulated packets
- * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
- * @BATADV_BCAST: broadcast packets carrying broadcast payload
- * @BATADV_CODED: network coded packets
- * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
- * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
- *
- * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
- * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
- *     payload packet
- * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of
- *     the sender
- * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute
- * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers
- */
-enum batadv_packettype {
-	/* 0x00 - 0x3f: local packets or special rules for handling */
-	BATADV_IV_OGM           = 0x00,
-	BATADV_BCAST            = 0x01,
-	BATADV_CODED            = 0x02,
-	BATADV_ELP		= 0x03,
-	BATADV_OGM2		= 0x04,
-	/* 0x40 - 0x7f: unicast */
-#define BATADV_UNICAST_MIN     0x40
-	BATADV_UNICAST          = 0x40,
-	BATADV_UNICAST_FRAG     = 0x41,
-	BATADV_UNICAST_4ADDR    = 0x42,
-	BATADV_ICMP             = 0x43,
-	BATADV_UNICAST_TVLV     = 0x44,
-#define BATADV_UNICAST_MAX     0x7f
-	/* 0x80 - 0xff: reserved */
-};
-
-/**
- * enum batadv_subtype - packet subtype for unicast4addr
- * @BATADV_P_DATA: user payload
- * @BATADV_P_DAT_DHT_GET: DHT request message
- * @BATADV_P_DAT_DHT_PUT: DHT store message
- * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT
- */
-enum batadv_subtype {
-	BATADV_P_DATA			= 0x01,
-	BATADV_P_DAT_DHT_GET		= 0x02,
-	BATADV_P_DAT_DHT_PUT		= 0x03,
-	BATADV_P_DAT_CACHE_REPLY	= 0x04,
-};
-
-/* this file is included by batctl which needs these defines */
-#define BATADV_COMPAT_VERSION 15
-
-/**
- * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
- * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
- *     previously received from someone else than the best neighbor.
- * @BATADV_PRIMARIES_FIRST_HOP: flag unused.
- * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
- *     one hop neighbor on the interface where it was originally received.
- */
-enum batadv_iv_flags {
-	BATADV_NOT_BEST_NEXT_HOP   = 1UL << 0,
-	BATADV_PRIMARIES_FIRST_HOP = 1UL << 1,
-	BATADV_DIRECTLINK          = 1UL << 2,
-};
-
-/**
- * enum batadv_icmp_packettype - ICMP message types
- * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST
- * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found
- * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination
- * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops
- * @BATADV_PARAMETER_PROBLEM: return code for malformed messages
- * @BATADV_TP: throughput meter packet
- */
-enum batadv_icmp_packettype {
-	BATADV_ECHO_REPLY	       = 0,
-	BATADV_DESTINATION_UNREACHABLE = 3,
-	BATADV_ECHO_REQUEST	       = 8,
-	BATADV_TTL_EXCEEDED	       = 11,
-	BATADV_PARAMETER_PROBLEM       = 12,
-	BATADV_TP		       = 15,
-};
-
-/**
- * enum batadv_mcast_flags - flags for multicast capabilities and settings
- * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
- *  224.0.0.0/24 or ff02::1
- * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
- * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
- */
-enum batadv_mcast_flags {
-	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
-	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
-	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
-};
-
-/* tt data subtypes */
-#define BATADV_TT_DATA_TYPE_MASK 0x0F
-
-/**
- * enum batadv_tt_data_flags - flags for tt data tvlv
- * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM
- * @BATADV_TT_REQUEST: TT request message
- * @BATADV_TT_RESPONSE: TT response message
- * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
- */
-enum batadv_tt_data_flags {
-	BATADV_TT_OGM_DIFF   = 1UL << 0,
-	BATADV_TT_REQUEST    = 1UL << 1,
-	BATADV_TT_RESPONSE   = 1UL << 2,
-	BATADV_TT_FULL_TABLE = 1UL << 4,
-};
-
-/**
- * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
- * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
- */
-enum batadv_vlan_flags {
-	BATADV_VLAN_HAS_TAG	= 1UL << 15,
-};
-
-/**
- * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance
- * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address
- * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address
- * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc
- * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table
- * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet
- */
-enum batadv_bla_claimframe {
-	BATADV_CLAIM_TYPE_CLAIM		= 0x00,
-	BATADV_CLAIM_TYPE_UNCLAIM	= 0x01,
-	BATADV_CLAIM_TYPE_ANNOUNCE	= 0x02,
-	BATADV_CLAIM_TYPE_REQUEST	= 0x03,
-	BATADV_CLAIM_TYPE_LOOPDETECT	= 0x04,
-};
-
-/**
- * enum batadv_tvlv_type - tvlv type definitions
- * @BATADV_TVLV_GW: gateway tvlv
- * @BATADV_TVLV_DAT: distributed arp table tvlv
- * @BATADV_TVLV_NC: network coding tvlv
- * @BATADV_TVLV_TT: translation table tvlv
- * @BATADV_TVLV_ROAM: roaming advertisement tvlv
- * @BATADV_TVLV_MCAST: multicast capability tvlv
- */
-enum batadv_tvlv_type {
-	BATADV_TVLV_GW		= 0x01,
-	BATADV_TVLV_DAT		= 0x02,
-	BATADV_TVLV_NC		= 0x03,
-	BATADV_TVLV_TT		= 0x04,
-	BATADV_TVLV_ROAM	= 0x05,
-	BATADV_TVLV_MCAST	= 0x06,
-};
-
-#pragma pack(2)
-/* the destination hardware field in the ARP frame is used to
- * transport the claim type and the group id
- */
-struct batadv_bla_claim_dst {
-	__u8   magic[3];	/* FF:43:05 */
-	__u8   type;		/* bla_claimframe */
-	__be16 group;		/* group id */
-};
-
-#pragma pack()
-
-/**
- * struct batadv_ogm_packet - ogm (routing protocol) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @flags: contains routing relevant flags - see enum batadv_iv_flags
- * @seqno: sequence identification
- * @orig: address of the source node
- * @prev_sender: address of the previous sender
- * @reserved: reserved byte for alignment
- * @tq: transmission quality
- * @tvlv_len: length of tvlv data following the ogm header
- */
-struct batadv_ogm_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   flags;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	__u8   prev_sender[ETH_ALEN];
-	__u8   reserved;
-	__u8   tq;
-	__be16 tvlv_len;
-	/* __packed is not needed as the struct size is divisible by 4,
-	 * and the largest data type in this struct has a size of 4.
-	 */
-};
-
-#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
-
-/**
- * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the general header
- * @ttl: time to live for this packet, part of the general header
- * @flags: reseved for routing relevant flags - currently always 0
- * @seqno: sequence number
- * @orig: originator mac address
- * @tvlv_len: length of the appended tvlv buffer (in bytes)
- * @throughput: the currently flooded path throughput
- */
-struct batadv_ogm2_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   flags;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	__be16 tvlv_len;
-	__be32 throughput;
-	/* __packed is not needed as the struct size is divisible by 4,
-	 * and the largest data type in this struct has a size of 4.
-	 */
-};
-
-#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
-
-/**
- * struct batadv_elp_packet - elp (neighbor discovery) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @orig: originator mac address
- * @seqno: sequence number
- * @elp_interval: currently used ELP sending interval in ms
- */
-struct batadv_elp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   orig[ETH_ALEN];
-	__be32 seqno;
-	__be32 elp_interval;
-};
-
-#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
-
-/**
- * struct batadv_icmp_header - common members among all the ICMP packets
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @align: not used - useful for alignment purposes only
- *
- * This structure is used for ICMP packets parsing only and it is never sent
- * over the wire. The alignment field at the end is there to ensure that
- * members are padded the same way as they are in real packets.
- */
-struct batadv_icmp_header {
-	__u8 packet_type;
-	__u8 version;
-	__u8 ttl;
-	__u8 msg_type; /* see ICMP message types above */
-	__u8 dst[ETH_ALEN];
-	__u8 orig[ETH_ALEN];
-	__u8 uid;
-	__u8 align[3];
-};
-
-/**
- * struct batadv_icmp_packet - ICMP packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @reserved: not used - useful for alignment
- * @seqno: ICMP sequence number
- */
-struct batadv_icmp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   reserved;
-	__be16 seqno;
-};
-
-/**
- * struct batadv_icmp_tp_packet - ICMP TP Meter packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @subtype: TP packet subtype (see batadv_icmp_tp_subtype)
- * @session: TP session identifier
- * @seqno: the TP sequence number
- * @timestamp: time when the packet has been sent. This value is filled in a
- *  TP_MSG and echoed back in the next TP_ACK so that the sender can compute the
- *  RTT. Since it is read only by the host which wrote it, there is no need to
- *  store it using network order
- */
-struct batadv_icmp_tp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   subtype;
-	__u8   session[2];
-	__be32 seqno;
-	__be32 timestamp;
-};
-
-/**
- * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes
- * @BATADV_TP_MSG: Msg from sender to receiver
- * @BATADV_TP_ACK: acknowledgment from receiver to sender
- */
-enum batadv_icmp_tp_subtype {
-	BATADV_TP_MSG	= 0,
-	BATADV_TP_ACK,
-};
-
-#define BATADV_RR_LEN 16
-
-/**
- * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @rr_cur: number of entries the rr array
- * @seqno: ICMP sequence number
- * @rr: route record array
- */
-struct batadv_icmp_packet_rr {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   rr_cur;
-	__be16 seqno;
-	__u8   rr[BATADV_RR_LEN][ETH_ALEN];
-};
-
-#define BATADV_ICMP_MAX_PACKET_SIZE	sizeof(struct batadv_icmp_packet_rr)
-
-/* All packet headers in front of an ethernet header have to be completely
- * divisible by 2 but not by 4 to make the payload after the ethernet
- * header again 4 bytes boundary aligned.
- *
- * A packing of 2 is necessary to avoid extra padding at the end of the struct
- * caused by a structure member which is larger than two bytes. Otherwise
- * the structure would not fulfill the previously mentioned rule to avoid the
- * misalignment of the payload after the ethernet header. It may also lead to
- * leakage of information when the padding it not initialized before sending.
- */
-#pragma pack(2)
-
-/**
- * struct batadv_unicast_packet - unicast packet for network payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @ttvn: translation table version number
- * @dest: originator destination of the unicast packet
- */
-struct batadv_unicast_packet {
-	__u8 packet_type;
-	__u8 version;
-	__u8 ttl;
-	__u8 ttvn; /* destination translation table version number */
-	__u8 dest[ETH_ALEN];
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_unicast_4addr_packet - extended unicast packet
- * @u: common unicast packet header
- * @src: address of the source
- * @subtype: packet subtype
- * @reserved: reserved byte for alignment
- */
-struct batadv_unicast_4addr_packet {
-	struct batadv_unicast_packet u;
-	__u8 src[ETH_ALEN];
-	__u8 subtype;
-	__u8 reserved;
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_frag_packet - fragmented packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @dest: final destination used when routing fragments
- * @orig: originator of the fragment used when merging the packet
- * @no: fragment number within this sequence
- * @priority: priority of frame, from ToS IP precedence or 802.1p
- * @reserved: reserved byte for alignment
- * @seqno: sequence identification
- * @total_size: size of the merged packet
- */
-struct batadv_frag_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-#if defined(__BIG_ENDIAN_BITFIELD)
-	__u8   no:4;
-	__u8   priority:3;
-	__u8   reserved:1;
-#elif defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8   reserved:1;
-	__u8   priority:3;
-	__u8   no:4;
-#else
-#error "unknown bitfield endianness"
-#endif
-	__u8   dest[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__be16 seqno;
-	__be16 total_size;
-};
-
-/**
- * struct batadv_bcast_packet - broadcast packet for network payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @reserved: reserved byte for alignment
- * @seqno: sequence identification
- * @orig: originator of the broadcast packet
- */
-struct batadv_bcast_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   reserved;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_coded_packet - network coded packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @first_source: original source of first included packet
- * @first_orig_dest: original destinal of first included packet
- * @first_crc: checksum of first included packet
- * @first_ttvn: tt-version number of first included packet
- * @second_ttl: ttl of second packet
- * @second_dest: second receiver of this coded packet
- * @second_source: original source of second included packet
- * @second_orig_dest: original destination of second included packet
- * @second_crc: checksum of second included packet
- * @second_ttvn: tt version number of second included packet
- * @coded_len: length of network coded part of the payload
- */
-struct batadv_coded_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   first_ttvn;
-	/* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */
-	__u8   first_source[ETH_ALEN];
-	__u8   first_orig_dest[ETH_ALEN];
-	__be32 first_crc;
-	__u8   second_ttl;
-	__u8   second_ttvn;
-	__u8   second_dest[ETH_ALEN];
-	__u8   second_source[ETH_ALEN];
-	__u8   second_orig_dest[ETH_ALEN];
-	__be32 second_crc;
-	__be16 coded_len;
-};
-
-#pragma pack()
-
-/**
- * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @reserved: reserved field (for packet alignment)
- * @src: address of the source
- * @dst: address of the destination
- * @tvlv_len: length of tvlv data following the unicast tvlv header
- * @align: 2 bytes to align the header to a 4 byte boundary
- */
-struct batadv_unicast_tvlv_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   reserved;
-	__u8   dst[ETH_ALEN];
-	__u8   src[ETH_ALEN];
-	__be16 tvlv_len;
-	__u16  align;
-};
-
-/**
- * struct batadv_tvlv_hdr - base tvlv header struct
- * @type: tvlv container type (see batadv_tvlv_type)
- * @version: tvlv container version
- * @len: tvlv container length
- */
-struct batadv_tvlv_hdr {
-	__u8   type;
-	__u8   version;
-	__be16 len;
-};
-
-/**
- * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv
- *  container
- * @bandwidth_down: advertised uplink download bandwidth
- * @bandwidth_up: advertised uplink upload bandwidth
- */
-struct batadv_tvlv_gateway_data {
-	__be32 bandwidth_down;
-	__be32 bandwidth_up;
-};
-
-/**
- * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
- * @flags: translation table flags (see batadv_tt_data_flags)
- * @ttvn: translation table version number
- * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
- *  one batadv_tvlv_tt_vlan_data object per announced vlan
- */
-struct batadv_tvlv_tt_data {
-	__u8   flags;
-	__u8   ttvn;
-	__be16 num_vlan;
-};
-
-/**
- * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
- *  the tt tvlv container
- * @crc: crc32 checksum of the entries belonging to this vlan
- * @vid: vlan identifier
- * @reserved: unused, useful for alignment purposes
- */
-struct batadv_tvlv_tt_vlan_data {
-	__be32 crc;
-	__be16 vid;
-	__u16  reserved;
-};
-
-/**
- * struct batadv_tvlv_tt_change - translation table diff data
- * @flags: status indicators concerning the non-mesh client (see
- *  batadv_tt_client_flags)
- * @reserved: reserved field - useful for alignment purposes only
- * @addr: mac address of non-mesh client that triggered this tt change
- * @vid: VLAN identifier
- */
-struct batadv_tvlv_tt_change {
-	__u8   flags;
-	__u8   reserved[3];
-	__u8   addr[ETH_ALEN];
-	__be16 vid;
-};
-
-/**
- * struct batadv_tvlv_roam_adv - roaming advertisement
- * @client: mac address of roaming client
- * @vid: VLAN identifier
- */
-struct batadv_tvlv_roam_adv {
-	__u8   client[ETH_ALEN];
-	__be16 vid;
-};
-
-/**
- * struct batadv_tvlv_mcast_data - payload of a multicast tvlv
- * @flags: multicast flags announced by the orig node
- * @reserved: reserved field
- */
-struct batadv_tvlv_mcast_data {
-	__u8 flags;
-	__u8 reserved[3];
-};
-
-#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index eb835bde502a..b6891e8b741c 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -34,6 +34,7 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bitarray.h"
 #include "bridge_loop_avoidance.h"
@@ -44,7 +45,6 @@
 #include "log.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 8c7399dd06ca..1e8c79093623 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -24,8 +24,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-
-#include "packet.h"
+#include <uapi/linux/batadv_packet.h>
 
 struct sk_buff;
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 1eb5555c5fe4..900c5ce21cd4 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -49,6 +49,7 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "bridge_loop_avoidance.h"
@@ -60,7 +61,6 @@
 #include "multicast.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "sysfs.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 56fb42551453..c1578fa0b952 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -39,6 +39,7 @@
 #include <linux/string.h>
 #include <linux/stringify.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bridge_loop_avoidance.h"
 #include "distributed-arp-table.h"
@@ -47,7 +48,6 @@
 #include "hard-interface.h"
 #include "log.h"
 #include "network-coding.h"
-#include "packet.h"
 #include "soft-interface.h"
 
 static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 7dcf2aa4deb5..8b576712d0c1 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -49,13 +49,13 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 
 /**
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0e53be3f8df0..7550a9ccd695 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -52,6 +52,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bridge_loop_avoidance.h"
@@ -60,7 +61,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index e189f026974c..5ffcb45ac6ff 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -36,9 +36,9 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 77b145eba193..bb1578410e0c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -35,10 +35,9 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
-#include "packet.h"
-
 struct seq_file;
 
 #ifdef CONFIG_BATMAN_ADV_DAT
-- 
cgit v1.2.3


From 5b0890a97204627d75a333fc30f29f737e2bfad6 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:42 +0100
Subject: flow_dissector: Parse batman-adv unicast headers

The batman-adv unicast packets contain a full layer 2 frame in encapsulated
form. The flow dissector must therefore be able to parse the batman-adv
unicast header to reach the layer 2+3 information.

  +--------------------+
  | ip(v6)hdr          |
  +--------------------+
  | inner ethhdr       |
  +--------------------+
  | batadv unicast hdr |
  +--------------------+
  | outer ethhdr       |
  +--------------------+

The obtained information from the upper layer can then be used by RPS to
schedule the processing on separate cores. This allows better distribution
of multiple flows from the same neighbor to different cores.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index cc75488d3653..02db7b122a73 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -24,6 +24,7 @@
 #include <linux/tcp.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
+#include <uapi/linux/batadv_packet.h>
 
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
@@ -437,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	return FLOW_DISSECT_RET_PROTO_AGAIN;
 }
 
+/**
+ * __skb_flow_dissect_batadv() - dissect batman-adv header
+ * @skb: sk_buff with the batman-adv header
+ * @key_control: flow dissector's control key
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @p_proto: pointer used to update the protocol to process next
+ * @p_nhoff: pointer used to update inner network header offset
+ * @hlen: packet header length
+ * @flags: any combination of FLOW_DISSECTOR_F_*
+ *
+ * Tries to dissect ETH_P_BATMAN packets. Only packets carrying a
+ * &struct batadv_unicast_packet header are actually processed because they
+ * contain an inner ethernet header, usually followed by the actual network
+ * header. This allows the flow dissector to continue processing the packet.
+ *
+ * Return: FLOW_DISSECT_RET_PROTO_AGAIN when a batadv unicast packet was found,
+ *  FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
+ *  otherwise FLOW_DISSECT_RET_OUT_BAD
+ */
+static enum flow_dissect_ret
+__skb_flow_dissect_batadv(const struct sk_buff *skb,
+			  struct flow_dissector_key_control *key_control,
+			  void *data, __be16 *p_proto, int *p_nhoff, int hlen,
+			  unsigned int flags)
+{
+	struct {
+		struct batadv_unicast_packet batadv_unicast;
+		struct ethhdr eth;
+	} *hdr, _hdr;
+
+	hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen,
+				   &_hdr);
+	if (!hdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (hdr->batadv_unicast.packet_type != BATADV_UNICAST)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	*p_proto = hdr->eth.h_proto;
+	*p_nhoff += sizeof(*hdr);
+
+	key_control->flags |= FLOW_DIS_ENCAPSULATION;
+	if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+		return FLOW_DISSECT_RET_OUT_GOOD;
+
+	return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
 static void
 __skb_flow_dissect_tcp(const struct sk_buff *skb,
 		       struct flow_dissector *flow_dissector,
@@ -815,6 +867,11 @@ proto_again:
 					       nhoff, hlen);
 		break;
 
+	case htons(ETH_P_BATMAN):
+		fdret = __skb_flow_dissect_batadv(skb, key_control, data,
+						  &proto, &nhoff, hlen, flags);
+		break;
+
 	default:
 		fdret = FLOW_DISSECT_RET_OUT_BAD;
 		break;
-- 
cgit v1.2.3


From 09ee9dba9611cd382fd360a99ad1c2fa23bfdca8 Mon Sep 17 00:00:00 2001
From: Tobias Brunner <tobias@strongswan.org>
Date: Thu, 21 Dec 2017 17:32:24 +0100
Subject: ipv6: Reinject IPv6 packets if IPsec policy matches after SNAT

If SNAT modifies the source address, the resulting packet might match
an IPsec policy; reinject the packet if that's the case.

The exact same thing is already done for IPv4.

Signed-off-by: Tobias Brunner <tobias@strongswan.org>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ece2781a31b2..bcdb615aed6e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -138,6 +138,14 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 		return ret;
 	}
 
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+	/* Policy lookup after SNAT yielded a new policy; use the IPv6 cb */
+	if (skb_dst(skb)->xfrm) {
+		IP6CB(skb)->flags |= IP6SKB_REROUTED;
+		return dst_output(net, sk, skb);
+	}
+#endif
+
 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 	    dst_allfrag(skb_dst(skb)) ||
 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
-- 
cgit v1.2.3


From fb7df5e4000a6bc19e350a00a93b16c602bd2dd8 Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
Date: Thu, 21 Dec 2017 10:29:10 -0800
Subject: tcp: md5: Handle RCU dereference of md5sig_info

Dereference tp->md5sig_info in tcp_v4_destroy_sock() the same way it is
done in the adjacent call to tcp_clear_md5_list().

Resolves this sparse warning:

net/ipv4/tcp_ipv4.c:1914:17: warning: incorrect type in argument 1 (different address spaces)
net/ipv4/tcp_ipv4.c:1914:17:    expected struct callback_head *head
net/ipv4/tcp_ipv4.c:1914:17:    got struct callback_head [noderef] <asn:4>*<noident>

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Acked-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dd945b114215..5d203248123e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	/* Clean up the MD5 key list, if any */
 	if (tp->md5sig_info) {
 		tcp_clear_md5_list(sk);
-		kfree_rcu(tp->md5sig_info, rcu);
+		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
 		tp->md5sig_info = NULL;
 	}
 #endif
-- 
cgit v1.2.3


From 214bb1c78a4dd94ba748f1707ecc70041abe1fd3 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 21 Dec 2017 15:51:22 -0800
Subject: net: erspan: remove md NULL check

The 'md' pointer is obtained from 'tun_dst = ip_tun_rx_dst' and,
since 'tun_dst' has already been checked, 'md' can never be NULL.
This patch removes the NULL check from both the ipv4 and ipv6 erspan
receive paths.

Fixes: afb4c97d90e6 ("ip6_gre: fix potential memory leak in ip6erspan_rcv")
Fixes: 50670b6ee9bc ("ip_gre: fix potential memory leak in erspan_rcv")
Cc: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 5 -----
 net/ipv6/ip6_gre.c | 4 ----
 2 files changed, 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 78365094f56c..b61f2285816d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,11 +313,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return PACKET_REJECT;
 
 			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
-			if (!md) {
-				dst_release((struct dst_entry *)tun_dst);
-				return PACKET_REJECT;
-			}
-
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 97f148f15429..b345b7e484c5 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -550,10 +550,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 
 			info = &tun_dst->u.tun_info;
 			md = ip_tunnel_info_opts(info);
-			if (!md) {
-				dst_release((struct dst_entry *)tun_dst);
-				return PACKET_REJECT;
-			}
 
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
-- 
cgit v1.2.3


From 820da5357572715c6235ba3b3daa2d5b43a1198f Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 22 Dec 2017 15:10:17 +0100
Subject: l2tp: fix missing print session offset info

Report the offset parameter in the L2TP_CMD_SESSION_GET command if
it has been configured by userspace.

Fixes: 309795f4bec ("l2tp: Add netlink control API for L2TP")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index a1f24fb2be98..7e9c50125556 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -761,6 +761,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 
 	if ((session->ifname[0] &&
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
+	    (session->offset &&
+	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
-- 
cgit v1.2.3


From f15bc54eeecd86dfba3885aab839cd1f45172a38 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Fri, 22 Dec 2017 15:10:18 +0100
Subject: l2tp: add peer_offset parameter

Introduce the peer_offset parameter so that different payload offset
values can be specified for the tx and rx sides.
If only offset is provided by userspace, use it for the rx side as well
in order to maintain compatibility with older l2tp versions.

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  1 +
 net/l2tp/l2tp_core.c      |  3 ++-
 net/l2tp/l2tp_core.h      | 13 ++++++++++---
 net/l2tp/l2tp_debugfs.c   |  8 +++++---
 net/l2tp/l2tp_netlink.c   | 21 ++++++++++++++++++++-
 5 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index d84ce5c1c9aa..d6fee55dbded 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -127,6 +127,7 @@ enum {
 	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* flag */
 	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* flag */
 	L2TP_ATTR_PAD,
+	L2TP_ATTR_PEER_OFFSET,		/* u16 */
 	__L2TP_ATTR_MAX,
 };
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 115918ad8eca..6ff64717da1e 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			ptr += 2 + offset;
 		}
 	} else
-		ptr += session->offset;
+		ptr += session->peer_offset;
 
 	offset = ptr - optr;
 	if (!pskb_may_pull(skb, offset))
@@ -1785,6 +1785,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
 			session->offset = cfg->offset;
+			session->peer_offset = cfg->peer_offset;
 			session->l2specific_type = cfg->l2specific_type;
 			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 9534e16965cc..c6fe7cc42a05 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,7 +59,8 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			offset;		/* offset to payload */
+	u16			offset;		/* offset to tx payload */
+	u16			peer_offset;	/* offset to rx payload */
 	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
@@ -86,8 +87,14 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			offset;		/* offset from end of L2TP header
-						   to beginning of data */
+	u16			offset;		/* offset from end of L2TP
+						 * header to beginning of
+						 * tx data
+						 */
+	u16			peer_offset;	/* offset from end of L2TP
+						 * header to beginning of
+						 * rx data
+						 */
 	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index eb69411bcb47..4cc30b38aba4 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -180,8 +180,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
-	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
-		   session->offset, session->l2specific_type, session->l2specific_len);
+	seq_printf(m, "   offset %hu peer_offset %hu l2specific %hu/%hu\n",
+		   session->offset, session->peer_offset,
+		   session->l2specific_type, session->l2specific_len);
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
@@ -228,7 +229,8 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
 		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
 		seq_puts(m, "  SESSION ID, peer ID, PWTYPE\n");
 		seq_puts(m, "   refcnt cnt\n");
-		seq_puts(m, "   offset OFFSET l2specific TYPE/LEN\n");
+		seq_puts(m, "   offset OFFSET peer_offset OFFSET");
+		seq_puts(m, " l2specific TYPE/LEN\n");
 		seq_puts(m, "   [ cookie ]\n");
 		seq_puts(m, "   [ peer cookie ]\n");
 		seq_puts(m, "   config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 7e9c50125556..d7d4d7a7a54d 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -547,9 +547,25 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_OFFSET])
+		if (info->attrs[L2TP_ATTR_PEER_OFFSET]) {
+			struct nlattr *peer_offset;
+
+			peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET];
+			cfg.peer_offset = nla_get_u16(peer_offset);
+		}
+
+		if (info->attrs[L2TP_ATTR_OFFSET]) {
 			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
 
+			/* in order to maintain compatibility with older
+			 * versions where offset was used for both tx and
+			 * rx side, update rx side with offset if peer_offset
+			 * is not provided by userspace
+			 */
+			if (!info->attrs[L2TP_ATTR_PEER_OFFSET])
+				cfg.peer_offset = cfg.offset;
+		}
+
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
@@ -763,6 +779,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
 	    (session->offset &&
 	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
+	    (session->peer_offset &&
+	     nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
@@ -903,6 +921,7 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
 	[L2TP_ATTR_PW_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_ENCAP_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_OFFSET]		= { .type = NLA_U16, },
+	[L2TP_ATTR_PEER_OFFSET]		= { .type = NLA_U16, },
 	[L2TP_ATTR_DATA_SEQ]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_TYPE]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_LEN]		= { .type = NLA_U8, },
-- 
cgit v1.2.3


From 7ae0c649c47f1c5d2db8cee6dd75855970af1669 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 22 Dec 2017 09:38:59 -0800
Subject: rds: Reset rs->rs_bound_addr in rds_add_bound() failure path

If the rds_sock is not added to the bind_hash_table, we must
reset rs_bound_addr so that rds_remove_bound will not trip on
this rds_sock.

rds_add_bound() does a rds_sock_put() in this failure path, so
failing to reset rs_bound_addr will result in a socket refcount
bug, and will trigger a WARN_ON with the stack shown below when
the application subsequently tries to close the PF_RDS socket.

     WARNING: CPU: 20 PID: 19499 at net/rds/af_rds.c:496 \
		rds_sock_destruct+0x15/0x30 [rds]
       :
     __sk_destruct+0x21/0x190
     rds_remove_bound.part.13+0xb6/0x140 [rds]
     rds_release+0x71/0x120 [rds]
     sock_release+0x1a/0x70
     sock_close+0xe/0x20
     __fput+0xd5/0x210
     task_work_run+0x82/0xa0
     do_exit+0x2ce/0xb30
     ? syscall_trace_enter+0x1cc/0x2b0
     do_group_exit+0x39/0xa0
     SyS_exit_group+0x10/0x10
     do_syscall_64+0x61/0x1a0

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/bind.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 75d43dc8e96b..5aa3a64aa4f0 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -114,6 +114,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 			  rs, &addr, (int)ntohs(*port));
 			break;
 		} else {
+			rs->rs_bound_addr = 0;
 			rds_sock_put(rs);
 			ret = -ENOMEM;
 			break;
-- 
cgit v1.2.3


From b319109396d6ddf1587a54e33f89fd3bea7c66db Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 22 Dec 2017 09:39:00 -0800
Subject: rds: tcp: initialize t_tcp_node_detached to false

Commit f10b4cff98c6 ("rds: tcp: atomically purge entries from
rds_tcp_conn_list during netns delete") adds the field
t_tcp_node_detached, but this needs to be initialized explicitly to
false.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 39f502d47969..a61a498187ce 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -290,6 +290,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		tc->t_cpath = &conn->c_path[i];
 
 		spin_lock_irq(&rds_tcp_conn_lock);
+		tc->t_tcp_node_detached = false;
 		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
 		spin_unlock_irq(&rds_tcp_conn_lock);
 		rdsdebug("rds_conn_path [%d] tc %p\n", i,
-- 
cgit v1.2.3


From 66261da169263f90c431741886b90b0982dda981 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 22 Dec 2017 09:39:01 -0800
Subject: rds: tcp: cleanup if kmem_cache_alloc fails in rds_tcp_conn_alloc()

If kmem_cache_alloc() fails in the middle of the for() loop,
cleanup anything that might have been allocated so far.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index a61a498187ce..2e554ef6d75f 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -270,16 +270,33 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
 	return -EADDRNOTAVAIL;
 }
 
+static void rds_tcp_conn_free(void *arg)
+{
+	struct rds_tcp_connection *tc = arg;
+	unsigned long flags;
+
+	rdsdebug("freeing tc %p\n", tc);
+
+	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+	if (!tc->t_tcp_node_detached)
+		list_del(&tc->t_tcp_node);
+	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+
+	kmem_cache_free(rds_tcp_conn_slab, tc);
+}
+
 static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
 	struct rds_tcp_connection *tc;
-	int i;
+	int i, j;
+	int ret = 0;
 
 	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
 		tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-		if (!tc)
-			return -ENOMEM;
-
+		if (!tc) {
+			ret = -ENOMEM;
+			break;
+		}
 		mutex_init(&tc->t_conn_path_lock);
 		tc->t_sock = NULL;
 		tc->t_tinc = NULL;
@@ -296,22 +313,11 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		rdsdebug("rds_conn_path [%d] tc %p\n", i,
 			 conn->c_path[i].cp_transport_data);
 	}
-
-	return 0;
-}
-
-static void rds_tcp_conn_free(void *arg)
-{
-	struct rds_tcp_connection *tc = arg;
-	unsigned long flags;
-	rdsdebug("freeing tc %p\n", tc);
-
-	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
-	if (!tc->t_tcp_node_detached)
-		list_del(&tc->t_tcp_node);
-	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
-
-	kmem_cache_free(rds_tcp_conn_slab, tc);
+	if (ret) {
+		for (j = 0; j < i; j++)
+			rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
+	}
+	return ret;
 }
 
 static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
-- 
cgit v1.2.3


From bf5c25d608613eaf4dcdba5a9cac5b2afe67d635 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Dec 2017 19:00:17 -0500
Subject: skbuff: in skb_segment, call zerocopy functions once per nskb

This is a net-next follow-up to commit 268b79067942 ("skbuff: orphan
frags before zerocopy clone"), which fixed a bug in net, but added a
call to skb_zerocopy_clone at each frag to do so.

When segmenting skbs with user frags, either the user frags must be
replaced with private copies and uarg released, or the uarg must have
its refcount increased for each new skb.

skb_orphan_frags does the first, except for cases that can handle
reference counting. skb_zerocopy_clone then does the second.

Call these once per nskb, instead of once per frag.

That is, in the common case. With a frag list, also refresh when the
origin skb (frag_skb) changes.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a3cb0be4c6f3..00b0757830e2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3656,6 +3656,10 @@ normal:
 		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
 					      SKBTX_SHARED_FRAG;
 
+		if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+		    skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+			goto err;
+
 		while (pos < offset + len) {
 			if (i >= nfrags) {
 				BUG_ON(skb_headlen(list_skb));
@@ -3667,6 +3671,11 @@ normal:
 
 				BUG_ON(!nfrags);
 
+				if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+				    skb_zerocopy_clone(nskb, frag_skb,
+						       GFP_ATOMIC))
+					goto err;
+
 				list_skb = list_skb->next;
 			}
 
@@ -3678,11 +3687,6 @@ normal:
 				goto err;
 			}
 
-			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
-				goto err;
-			if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
-				goto err;
-
 			*nskb_frag = *frag;
 			__skb_frag_ref(nskb_frag);
 			size = skb_frag_size(nskb_frag);
-- 
cgit v1.2.3


From 111856c758d9a06145da446e0db8f71988eebf02 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Dec 2017 19:00:18 -0500
Subject: tcp: push full zerocopy packets

Skbs that reach MAX_SKB_FRAGS cannot be extended further. Do the
same for zerocopy frags as non-zerocopy frags and set the PSH bit.
This improves GRO assembly.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 67d39b79c801..44102484a76f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,8 +1371,10 @@ new_segment:
 			pfrag->offset += copy;
 		} else {
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
-			if (err == -EMSGSIZE || err == -EEXIST)
+			if (err == -EMSGSIZE || err == -EEXIST) {
+				tcp_mark_push(tp, skb);
 				goto new_segment;
+			}
 			if (err < 0)
 				goto do_error;
 			copy = err;
-- 
cgit v1.2.3


From 02583adeff12fa0a4d72558853a926867afb226c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Dec 2017 19:00:19 -0500
Subject: tcp: place all zerocopy payload in frags

This avoids an unnecessary copy of 1-2KB and improves tso_fragment,
which has to fall back to tcp_fragment if skb->len != skb_data_len.

It also avoids a surprising inconsistency in notifications:
Zerocopy packets sent over loopback have their frags copied, so set
SO_EE_CODE_ZEROCOPY_COPIED in the notification. But this currently
does not happen for small packets, because when all data fits in the
linear fragment, data is not copied in skb_orphan_frags_rx.

Reported-by: Tom Deseyn <tom.deseyn@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 44102484a76f..947348872c3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1186,7 +1186,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	bool process_backlog = false;
-	bool sg;
+	bool sg, zc = false;
 	long timeo;
 
 	flags = msg->msg_flags;
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+		zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+		if (!zc)
 			uarg->zerocopy = 0;
 	}
 
@@ -1325,13 +1326,13 @@ new_segment:
 			copy = msg_data_left(msg);
 
 		/* Where to copy to? */
-		if (skb_availroom(skb) > 0) {
+		if (skb_availroom(skb) > 0 && !zc) {
 			/* We have some space in skb head. Superb! */
 			copy = min_t(int, copy, skb_availroom(skb));
 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
 			if (err)
 				goto do_fault;
-		} else if (!uarg || !uarg->zerocopy) {
+		} else if (!zc) {
 			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
 			struct page_frag *pfrag = sk_page_frag(sk);
-- 
cgit v1.2.3


From 8ddab50839e29e965460b2cf794fd2b06a946893 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Dec 2017 19:00:20 -0500
Subject: tcp: do not allocate linear memory for zerocopy skbs

Zerocopy payload is now always stored in frags, and space for headers
is reserved, so this memory is unused.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 947348872c3e..7ac583a2b9fe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1104,12 +1104,15 @@ static int linear_payload_sz(bool first_skb)
 	return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
 	if (sg) {
+		if (zc)
+			return 0;
+
 		if (sk_can_gso(sk)) {
 			tmp = linear_payload_sz(first_skb);
 		} else {
@@ -1282,6 +1285,7 @@ restart:
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
+			int linear;
 
 new_segment:
 			/* Allocate new segment. If the interface is SG,
@@ -1295,9 +1299,8 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			skb = sk_stream_alloc_skb(sk,
-						  select_size(sk, sg, first_skb),
-						  sk->sk_allocation,
+			linear = select_size(sk, sg, first_skb, zc);
+			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
-- 
cgit v1.2.3


From 8ec69574031bb8e0a19cf318c093acc871abd965 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Dec 2017 16:52:10 +0100
Subject: net: sched: don't set extack message in case the qdisc will be
 created

If the qdisc is not found here, it is going to be created. Therefore,
this is not an error path. Remove the extack message set and don't
confuse user with error message in case the qdisc was created
successfully.

Fixes: 09215598119e ("net: sched: sch_api: handle generic qdisc errors")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3a3a1da6b071..81ecf5bec26d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1402,10 +1402,8 @@ replay:
 					return -EINVAL;
 				}
 				q = qdisc_lookup(dev, tcm->tcm_handle);
-				if (!q) {
-					NL_SET_ERR_MSG(extack, "No qdisc found for specified handle");
+				if (!q)
 					goto create_n_graft;
-				}
 				if (n->nlmsg_flags & NLM_F_EXCL) {
 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
 					return -EEXIST;
-- 
cgit v1.2.3


From 8234af2db3614d78b49e77ef46ea8cfab6586568 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 25 Dec 2017 10:51:41 +0200
Subject: net_sch: red: Fix the new offload indication

Update the offload flag, TCQ_F_OFFLOADED, in each dump call (and ignore
the offloading function's return value in relation to this flag).
This is done because a qdisc is initialized, and therefore offloaded,
before being grafted. Since the ability of the driver to offload the qdisc
depends on its location, a qdisc can be offloaded and un-offloaded by graft
calls that don't affect the qdisc itself.

Fixes: 428a68af3a7c ("net: sched: Move to new offload indication in RED")
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index ec0bd36e09a9..a392eaa4a0b4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,7 +157,6 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		.handle = sch->handle,
 		.parent = sch->parent,
 	};
-	int err;
 
 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
@@ -172,14 +171,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.command = TC_RED_DESTROY;
 	}
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
-
-	if (!err && enable)
-		sch->flags |= TCQ_F_OFFLOADED;
-	else
-		sch->flags &= ~TCQ_F_OFFLOADED;
-
-	return err;
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -297,12 +289,22 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
+	int err;
+
+	sch->flags &= ~TCQ_F_OFFLOADED;
 
-	if (!(sch->flags & TCQ_F_OFFLOADED))
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					    &hw_stats);
+	if (err == -EOPNOTSUPP)
 		return 0;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					     &hw_stats);
+	if (!err)
+		sch->flags |= TCQ_F_OFFLOADED;
+
+	return err;
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From 44edf2f89791d162f4dc5ec3718d21f3d6644403 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 25 Dec 2017 10:51:42 +0200
Subject: net: sched: Move offload check till after dump call

Move the check of the offload state to after the qdisc dump action was
called, so the qdisc could update it if it was changed.

Fixes: 7a4fa29106d9 ("net: sched: Add TCA_HW_OFFLOAD")
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 81ecf5bec26d..8a04c36e579f 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -807,11 +807,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
-	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
-		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
-
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	qlen = qdisc_qlen_sum(q);
 
 	stab = rtnl_dereference(q->stab);
-- 
cgit v1.2.3


From 9540d977618c31586035870a56bd2d1cc2b4a9ba Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 27 Dec 2017 17:05:52 +0800
Subject: net: sched: fix skb leak in dev_requeue_skb()

When dev_requeue_skb() is called with a bulked skb list, only the
first skb of the list is requeued to the qdisc layer; the others
are leaked without being freed.

TCP breaks because of this leak: an skb that is never freed is
considered to be still in the host queue and is never retransmitted.
This happens when dev_requeue_skb() is called from qdisc_restart().
  qdisc_restart
  |-- dequeue_skb
  |-- sch_direct_xmit()
      |-- dev_requeue_skb() <-- skb may be bulked

Fix dev_requeue_skb() to requeue the full bulked list. Also change
__dev_requeue_skb() to use __skb_queue_tail() so that skbs are not
requeued out of order.

Fixes: a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cc069b2acf0e..a883c501d5ec 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -112,10 +112,16 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
 
 static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	__skb_queue_head(&q->gso_skb, skb);
-	q->qstats.requeues++;
-	qdisc_qstats_backlog_inc(q, skb);
-	q->q.qlen++;	/* it's still part of the queue */
+	while (skb) {
+		struct sk_buff *next = skb->next;
+
+		__skb_queue_tail(&q->gso_skb, skb);
+		q->qstats.requeues++;
+		qdisc_qstats_backlog_inc(q, skb);
+		q->q.qlen++;	/* it's still part of the queue */
+
+		skb = next;
+	}
 	__netif_schedule(q);
 
 	return 0;
@@ -126,12 +132,19 @@ static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
 	spinlock_t *lock = qdisc_lock(q);
 
 	spin_lock(lock);
-	__skb_queue_tail(&q->gso_skb, skb);
+	while (skb) {
+		struct sk_buff *next = skb->next;
+
+		__skb_queue_tail(&q->gso_skb, skb);
+
+		qdisc_qstats_cpu_requeues_inc(q);
+		qdisc_qstats_cpu_backlog_inc(q, skb);
+		qdisc_qstats_cpu_qlen_inc(q);
+
+		skb = next;
+	}
 	spin_unlock(lock);
 
-	qdisc_qstats_cpu_requeues_inc(q);
-	qdisc_qstats_cpu_backlog_inc(q, skb);
-	qdisc_qstats_cpu_qlen_inc(q);
 	__netif_schedule(q);
 
 	return 0;
-- 
cgit v1.2.3


From 62262ffd95fba33c2b0dfcbf3ef3a254101120c7 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Wed, 27 Dec 2017 15:51:36 +0100
Subject: net: dccp: drop unneeded newline

DCCP_CRIT prints some other text and then a newline after the message
string, so the message string does not need to include a newline
explicitly.  Done using Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ackvec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 3de0d0362d7f..2a24f7d171a5 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -228,7 +228,7 @@ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
 	}
 
 	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
-		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries");
 		av->av_overflow = true;
 	}
 
-- 
cgit v1.2.3


From e0b10844d9e617a1a5ce2ddf73d38aaa0a47a2a4 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Wed, 27 Dec 2017 15:51:38 +0100
Subject: openvswitch: drop unneeded newline

OVS_NLERR prints a newline at the end of the message string, so the
message string does not need to include a newline explicitly.  Done
using Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b27c5c6d9cab..62f36cc938ca 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1266,14 +1266,14 @@ static int parse_nat(const struct nlattr *attr,
 		/* Do not allow flags if no type is given. */
 		if (info->range.flags) {
 			OVS_NLERR(log,
-				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+				  "NAT flags may be given only when NAT range (SRC or DST) is also specified."
 				  );
 			return -EINVAL;
 		}
 		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
 	} else if (!info->commit) {
 		OVS_NLERR(log,
-			  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+			  "NAT attributes may be specified only when CT COMMIT flag is also specified."
 			  );
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From bbb6189df4077cde8592cd2f804bb1122067dd32 Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Wed, 27 Dec 2017 18:27:58 +0100
Subject: inet_diag: Add equal-operator for ports

inet_diag currently provides less/greater than or equal operators for
comparing ports when filtering sockets. An equal comparison can be
performed by combining the two existing operators, or a user can for
example request a port range and then do the final filtering in
userspace. However, these approaches both have drawbacks. Implementing
equal using LE/GE causes the size and complexity of a filter to grow
quickly as the number of ports increases, while on busy machines it would
be great if the kernel only returned information about relevant sockets.

This patch introduces source and destination port equal operators.
INET_DIAG_BC_S_EQ is used to match a source port, INET_DIAG_BC_D_EQ a
destination port, and usage is the same as for the existing port
operators.  I.e., the port to match is stored in the no-member of the
next inet_diag_bc_op-struct in the filter.

Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h | 2 ++
 net/ipv4/inet_diag.c           | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 817d807e9481..14565d703291 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -92,6 +92,8 @@ enum {
 	INET_DIAG_BC_D_COND,
 	INET_DIAG_BC_DEV_COND,   /* u32 ifindex */
 	INET_DIAG_BC_MARK_COND,
+	INET_DIAG_BC_S_EQ,
+	INET_DIAG_BC_D_EQ,
 };
 
 struct inet_diag_hostcond {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c9c35b61a027..a383f299ce24 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
 		case INET_DIAG_BC_JMP:
 			yes = 0;
 			break;
+		case INET_DIAG_BC_S_EQ:
+			yes = entry->sport == op[1].no;
+			break;
 		case INET_DIAG_BC_S_GE:
 			yes = entry->sport >= op[1].no;
 			break;
 		case INET_DIAG_BC_S_LE:
 			yes = entry->sport <= op[1].no;
 			break;
+		case INET_DIAG_BC_D_EQ:
+			yes = entry->dport == op[1].no;
+			break;
 		case INET_DIAG_BC_D_GE:
 			yes = entry->dport >= op[1].no;
 			break;
@@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
 			if (!valid_devcond(bc, len, &min_len))
 				return -EINVAL;
 			break;
+		case INET_DIAG_BC_S_EQ:
 		case INET_DIAG_BC_S_GE:
 		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_EQ:
 		case INET_DIAG_BC_D_GE:
 		case INET_DIAG_BC_D_LE:
 			if (!valid_port_comparison(bc, len, &min_len))
-- 
cgit v1.2.3


From c3fde1bd28f7c720d7bc587e85e54706df4f8163 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:45:51 +0900
Subject: net: tcp: Add trace events for TCP congestion window tracing

This adds an event to trace TCP state variables with a
slightly intrusive trace-event. It uses the ftrace/perf
event log buffer to record that state, so there is no need
to prepare a dedicated ring-buffer or custom user apps.

Users can use ftrace to trace this event as below:

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/tcp/tcp_probe/enable
  (run workloads)
  # cat trace

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       |  3 ++
 2 files changed, 100 insertions(+)

(limited to 'net')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index bb00459d2d4d..b5ae3fbb74c8 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM tcp
 
@@ -8,6 +9,7 @@
 #include <linux/tcp.h>
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
+#include <net/tcp.h>
 
 #define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
 	do {							\
@@ -254,6 +256,101 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
+	do {								\
+		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+									\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_sport;			\
+		v4->sin_addr.s_addr = inet->inet_saddr;			\
+		v4 = (void *)__entry->daddr;				\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_dport;			\
+		v4->sin_addr.s_addr = inet->inet_daddr;			\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+									\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_sport;		\
+			v6->sin6_addr = inet6_sk(sk)->saddr;		\
+			v6 = (void *)__entry->daddr;			\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_dport;		\
+			v6->sin6_addr = sk->sk_v6_daddr;		\
+		} else							\
+			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
+	} while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
+	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+TRACE_EVENT(tcp_probe,
+
+	TP_PROTO(struct sock *sk, struct sk_buff *skb),
+
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u32, mark)
+		__field(__u16, length)
+		__field(__u32, snd_nxt)
+		__field(__u32, snd_una)
+		__field(__u32, snd_cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, snd_wnd)
+		__field(__u32, srtt)
+		__field(__u32, rcv_wnd)
+	),
+
+	TP_fast_assign(
+		const struct tcp_sock *tp = tcp_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->mark = skb->mark;
+
+		__entry->length = skb->len;
+		__entry->snd_nxt = tp->snd_nxt;
+		__entry->snd_una = tp->snd_una;
+		__entry->snd_cwnd = tp->snd_cwnd;
+		__entry->snd_wnd = tp->snd_wnd;
+		__entry->rcv_wnd = tp->rcv_wnd;
+		__entry->ssthresh = tcp_current_ssthresh(sk);
+		__entry->srtt = tp->srtt_us >> 3;
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x "
+		  "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u "
+		  "rcv_wnd=%u",
+		  __entry->saddr, __entry->daddr, __entry->mark,
+		  __entry->length, __entry->snd_nxt, __entry->snd_una,
+		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
+		  __entry->srtt, __entry->rcv_wnd)
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..ff71b18d9682 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	unsigned int len = skb->len;
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* TCP congestion window tracking */
+	trace_tcp_probe(sk, skb);
+
 	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
-- 
cgit v1.2.3


From 6987990c3e16b01f9a4805cb1d1292381e9d6bff Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:46:21 +0900
Subject: net: tcp: Remove TCP probe module

Remove TCP probe module since jprobe has been deprecated.
That function is now replaced by tcp/tcp_probe trace-event.
You can use it via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/Kconfig          |  17 ---
 net/ipv4/Makefile    |   1 -
 net/ipv4/tcp_probe.c | 301 ---------------------------------------------------
 3 files changed, 319 deletions(-)
 delete mode 100644 net/ipv4/tcp_probe.c

(limited to 'net')

diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..efe930db3c08 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -336,23 +336,6 @@ config NET_PKTGEN
 	  To compile this code as a module, choose M here: the
 	  module will be called pktgen.
 
-config NET_TCPPROBE
-	tristate "TCP connection probing"
-	depends on INET && PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to TCP connection
-	state in response to incoming packets. It is used for debugging
-	TCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use TCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called tcp_probe.
-
 config NET_DROP_MONITOR
 	tristate "Network packet drop alerting service"
 	depends on INET && TRACEPOINTS
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6c8ad1d4b6d..47a0a6649a9d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index 697f4c67b2e3..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/ktime.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1");
-
-static int port __read_mostly;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int bufsize __read_mostly = 4096;
-MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, uint, 0);
-
-static unsigned int fwmark __read_mostly;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int full __read_mostly;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct tcp_log {
-	ktime_t tstamp;
-	union {
-		struct sockaddr		raw;
-		struct sockaddr_in	v4;
-		struct sockaddr_in6	v6;
-	}	src, dst;
-	u16	length;
-	u32	snd_nxt;
-	u32	snd_una;
-	u32	snd_wnd;
-	u32	rcv_wnd;
-	u32	snd_cwnd;
-	u32	ssthresh;
-	u32	srtt;
-};
-
-static struct {
-	spinlock_t	lock;
-	wait_queue_head_t wait;
-	ktime_t		start;
-	u32		lastcwnd;
-
-	unsigned long	head, tail;
-	struct tcp_log	*log;
-} tcp_probe;
-
-static inline int tcp_probe_used(void)
-{
-	return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
-}
-
-static inline int tcp_probe_avail(void)
-{
-	return bufsize - tcp_probe_used() - 1;
-}
-
-#define tcp_probe_copy_fl_to_si4(inet, si4, mem)		\
-	do {							\
-		si4.sin_family = AF_INET;			\
-		si4.sin_port = inet->inet_##mem##port;		\
-		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\
-	} while (0)						\
-
-/*
- * Hook inserted to be called before each receive packet.
- * Note: arguments must match tcp_rcv_established()!
- */
-static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-				 const struct tcphdr *th)
-{
-	unsigned int len = skb->len;
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_sock *inet = inet_sk(sk);
-
-	/* Only update if port or skb mark matches */
-	if (((port == 0 && fwmark == 0) ||
-	     ntohs(inet->inet_dport) == port ||
-	     ntohs(inet->inet_sport) == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
-
-		spin_lock(&tcp_probe.lock);
-		/* If log fills, just silently drop */
-		if (tcp_probe_avail() > 1) {
-			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
-
-			p->tstamp = ktime_get();
-			switch (sk->sk_family) {
-			case AF_INET:
-				tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
-				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
-				break;
-			case AF_INET6:
-				memset(&p->src.v6, 0, sizeof(p->src.v6));
-				memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
-				p->src.v6.sin6_family = AF_INET6;
-				p->src.v6.sin6_port = inet->inet_sport;
-				p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
-				p->dst.v6.sin6_family = AF_INET6;
-				p->dst.v6.sin6_port = inet->inet_dport;
-				p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
-				break;
-			default:
-				BUG();
-			}
-
-			p->length = len;
-			p->snd_nxt = tp->snd_nxt;
-			p->snd_una = tp->snd_una;
-			p->snd_cwnd = tp->snd_cwnd;
-			p->snd_wnd = tp->snd_wnd;
-			p->rcv_wnd = tp->rcv_wnd;
-			p->ssthresh = tcp_current_ssthresh(sk);
-			p->srtt = tp->srtt_us >> 3;
-
-			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
-		}
-		tcp_probe.lastcwnd = tp->snd_cwnd;
-		spin_unlock(&tcp_probe.lock);
-
-		wake_up(&tcp_probe.wait);
-	}
-
-	jprobe_return();
-}
-
-static struct jprobe tcp_jprobe = {
-	.kp = {
-		.symbol_name	= "tcp_rcv_established",
-	},
-	.entry	= jtcp_rcv_established,
-};
-
-static int tcpprobe_open(struct inode *inode, struct file *file)
-{
-	/* Reset (empty) log */
-	spin_lock_bh(&tcp_probe.lock);
-	tcp_probe.head = tcp_probe.tail = 0;
-	tcp_probe.start = ktime_get();
-	spin_unlock_bh(&tcp_probe.lock);
-
-	return 0;
-}
-
-static int tcpprobe_sprint(char *tbuf, int n)
-{
-	const struct tcp_log *p
-		= tcp_probe.log + tcp_probe.tail;
-	struct timespec64 ts
-		= ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
-
-	return scnprintf(tbuf, n,
-			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
-			(unsigned long)ts.tv_sec,
-			(unsigned long)ts.tv_nsec,
-			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
-			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
-			     size_t len, loff_t *ppos)
-{
-	int error = 0;
-	size_t cnt = 0;
-
-	if (!buf)
-		return -EINVAL;
-
-	while (cnt < len) {
-		char tbuf[256];
-		int width;
-
-		/* Wait for data in buffer */
-		error = wait_event_interruptible(tcp_probe.wait,
-						 tcp_probe_used() > 0);
-		if (error)
-			break;
-
-		spin_lock_bh(&tcp_probe.lock);
-		if (tcp_probe.head == tcp_probe.tail) {
-			/* multiple readers race? */
-			spin_unlock_bh(&tcp_probe.lock);
-			continue;
-		}
-
-		width = tcpprobe_sprint(tbuf, sizeof(tbuf));
-
-		if (cnt + width < len)
-			tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
-
-		spin_unlock_bh(&tcp_probe.lock);
-
-		/* if record greater than space available
-		   return partial buffer (so far) */
-		if (cnt + width >= len)
-			break;
-
-		if (copy_to_user(buf + cnt, tbuf, width))
-			return -EFAULT;
-		cnt += width;
-	}
-
-	return cnt == 0 ? error : cnt;
-}
-
-static const struct file_operations tcpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = tcpprobe_open,
-	.read    = tcpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int tcpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of tcp_rcv_established,
-	 * has been changed, you also have to change the signature of
-	 * jtcp_rcv_established, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(tcp_rcv_established,
-				 jtcp_rcv_established) == 0);
-
-	init_waitqueue_head(&tcp_probe.wait);
-	spin_lock_init(&tcp_probe.lock);
-
-	if (bufsize == 0)
-		return -EINVAL;
-
-	bufsize = roundup_pow_of_two(bufsize);
-	tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
-	if (!tcp_probe.log)
-		goto err0;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&tcp_jprobe);
-	if (ret)
-		goto err1;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
- err1:
-	remove_proc_entry(procname, init_net.proc_net);
- err0:
-	kfree(tcp_probe.log);
-	return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&tcp_jprobe);
-	kfree(tcp_probe.log);
-}
-module_exit(tcpprobe_exit);
-- 
cgit v1.2.3


From 103d750c88fe6b42dbe7abc4d204027f343ee125 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:46:51 +0900
Subject: net: sctp: Add SCTP ACK tracking trace event

Add SCTP ACK tracking trace event to trace the changes of SCTP
association state in response to incoming packets.
It is used for debugging SCTP congestion control algorithms,
and will replace sctp_probe module.

Note that this event is a bit tricky. Since it consists of 2
events (sctp_probe and sctp_probe_path), you have to enable
both events as below.

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/sctp/sctp_probe/enable
  # echo 1 > events/sctp/sctp_probe_path/enable

Or, you can enable all the events under sctp.

  # echo 1 > events/sctp/enable

Since the sctp_probe_path event is always invoked from the
sctp_probe event, you will not see any output if you only
enable sctp_probe_path.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/sctp.h | 99 +++++++++++++++++++++++++++++++++++++++++++++
 net/sctp/sm_statefuns.c     |  5 +++
 2 files changed, 104 insertions(+)
 create mode 100644 include/trace/events/sctp.h

(limited to 'net')

diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h
new file mode 100644
index 000000000000..7475c7be165a
--- /dev/null
+++ b/include/trace/events/sctp.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sctp
+
+#if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCTP_H
+
+#include <net/sctp/structs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sctp_probe_path,
+
+	TP_PROTO(struct sctp_transport *sp,
+		 const struct sctp_association *asoc),
+
+	TP_ARGS(sp, asoc),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, primary)
+		__array(__u8, ipaddr, sizeof(union sctp_addr))
+		__field(__u32, state)
+		__field(__u32, cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, flight_size)
+		__field(__u32, partial_bytes_acked)
+		__field(__u32, pathmtu)
+	),
+
+	TP_fast_assign(
+		__entry->asoc = (unsigned long)asoc;
+		__entry->primary = (sp == asoc->peer.primary_path);
+		memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr));
+		__entry->state = sp->state;
+		__entry->cwnd = sp->cwnd;
+		__entry->ssthresh = sp->ssthresh;
+		__entry->flight_size = sp->flight_size;
+		__entry->partial_bytes_acked = sp->partial_bytes_acked;
+		__entry->pathmtu = sp->pathmtu;
+	),
+
+	TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u "
+		  "flight_size=%u partial_bytes_acked=%u pathmtu=%u",
+		  __entry->asoc, __entry->primary ? "(*)" : "",
+		  __entry->ipaddr, __entry->state, __entry->cwnd,
+		  __entry->ssthresh, __entry->flight_size,
+		  __entry->partial_bytes_acked, __entry->pathmtu)
+);
+
+TRACE_EVENT(sctp_probe,
+
+	TP_PROTO(const struct sctp_endpoint *ep,
+		 const struct sctp_association *asoc,
+		 struct sctp_chunk *chunk),
+
+	TP_ARGS(ep, asoc, chunk),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, mark)
+		__field(__u16, bind_port)
+		__field(__u16, peer_port)
+		__field(__u32, pathmtu)
+		__field(__u32, rwnd)
+		__field(__u16, unack_data)
+	),
+
+	TP_fast_assign(
+		struct sk_buff *skb = chunk->skb;
+
+		__entry->asoc = (unsigned long)asoc;
+		__entry->mark = skb->mark;
+		__entry->bind_port = ep->base.bind_addr.port;
+		__entry->peer_port = asoc->peer.port;
+		__entry->pathmtu = asoc->pathmtu;
+		__entry->rwnd = asoc->peer.rwnd;
+		__entry->unack_data = asoc->unack_data;
+
+		if (trace_sctp_probe_path_enabled()) {
+			struct sctp_transport *sp;
+
+			list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+					    transports) {
+				trace_sctp_probe_path(sp, asoc);
+			}
+		}
+	),
+
+	TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d "
+		  "rwnd=%u unack_data=%d",
+		  __entry->asoc, __entry->mark, __entry->bind_port,
+		  __entry->peer_port, __entry->pathmtu, __entry->rwnd,
+		  __entry->unack_data)
+);
+
+#endif /* _TRACE_SCTP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 541f34735346..eb7905ffe5f2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -59,6 +59,9 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/structs.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sctp.h>
+
 static struct sctp_packet *sctp_abort_pkt_new(
 					struct net *net,
 					const struct sctp_endpoint *ep,
@@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
 	struct sctp_sackhdr *sackh;
 	__u32 ctsn;
 
+	trace_sctp_probe(ep, asoc, chunk);
+
 	if (!sctp_vtag_verify(chunk, asoc))
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
-- 
cgit v1.2.3


From fa4475f79251a0539e64c08b8b039be23d107dc9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:47:20 +0900
Subject: net: sctp: Remove debug SCTP probe module

Remove SCTP probe module since jprobe has been deprecated.
That function is now replaced by sctp/sctp_probe and
sctp/sctp_probe_path trace-events.
You can use it via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/Kconfig  |  12 ---
 net/sctp/Makefile |   3 -
 net/sctp/probe.c  | 244 ------------------------------------------------------
 3 files changed, 259 deletions(-)
 delete mode 100644 net/sctp/probe.c

(limited to 'net')

diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index d9c04dc1b3f3..c740b189d4ba 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -37,18 +37,6 @@ menuconfig IP_SCTP
 
 if IP_SCTP
 
-config NET_SCTPPROBE
-	tristate "SCTP: Association probing"
-        depends on PROC_FS && KPROBES
-        ---help---
-        This module allows for capturing the changes to SCTP association
-        state in response to incoming packets. It is used for debugging
-        SCTP congestion control algorithms. If you don't understand
-        what was just said, you don't need it: say N.
-
-        To compile this code as a module, choose M here: the
-        module will be called sctp_probe.
-
 config SCTP_DBG_OBJCNT
 	bool "SCTP: Debug object counts"
 	depends on PROC_FS
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 54bd9c1a8aa1..6776582ec449 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,7 +4,6 @@
 #
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
-obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
 obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
@@ -16,8 +15,6 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
 	  stream_sched_rr.o stream_interleave.o
 
-sctp_probe-y := probe.o
-
 sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
 sctp-$(CONFIG_PROC_FS) += proc.o
 sctp-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
deleted file mode 100644
index 1280f85a598d..000000000000
--- a/net/sctp/probe.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * sctp_probe - Observe the SCTP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for SCTP from Stephen Hemminger's code
- * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/sctp.h>
-#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-MODULE_SOFTDEP("pre: sctp");
-MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
-MODULE_DESCRIPTION("SCTP snooper");
-MODULE_LICENSE("GPL");
-
-static int port __read_mostly = 0;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int fwmark __read_mostly = 0;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int bufsize __read_mostly = 64 * 1024;
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-static int full __read_mostly = 1;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "sctpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} sctpw;
-
-static __printf(1, 2) void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	len = vscnprintf(tbuf, sizeof(tbuf), fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	wake_up(&sctpw.wait);
-}
-
-static int sctpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&sctpw.fifo);
-	ktime_get_ts64(&sctpw.tstart);
-
-	return 0;
-}
-
-static ssize_t sctpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(sctpw.wait,
-					 kfifo_len(&sctpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations sctpprobe_fops = {
-	.owner	= THIS_MODULE,
-	.open	= sctpprobe_open,
-	.read	= sctpprobe_read,
-	.llseek = noop_llseek,
-};
-
-static enum sctp_disposition jsctp_sf_eat_sack(
-					struct net *net,
-					const struct sctp_endpoint *ep,
-					const struct sctp_association *asoc,
-					const union sctp_subtype type,
-					void *arg,
-					struct sctp_cmd_seq *commands)
-{
-	struct sctp_chunk *chunk = arg;
-	struct sk_buff *skb = chunk->skb;
-	struct sctp_transport *sp;
-	static __u32 lcwnd = 0;
-	struct timespec64 now;
-
-	sp = asoc->peer.primary_path;
-
-	if (((port == 0 && fwmark == 0) ||
-	     asoc->peer.port == port ||
-	     ep->base.bind_addr.port == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || sp->cwnd != lcwnd)) {
-		lcwnd = sp->cwnd;
-
-		ktime_get_ts64(&now);
-		now = timespec64_sub(now, sctpw.tstart);
-
-		printl("%lu.%06lu ", (unsigned long) now.tv_sec,
-		       (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-
-		printl("%p %5d %5d %5d %8d %5d ", asoc,
-		       ep->base.bind_addr.port, asoc->peer.port,
-		       asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
-
-		list_for_each_entry(sp, &asoc->peer.transport_addr_list,
-					transports) {
-			if (sp == asoc->peer.primary_path)
-				printl("*");
-
-			printl("%pISc %2u %8u %8u %8u %8u %8u ",
-			       &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
-			       sp->flight_size, sp->partial_bytes_acked,
-			       sp->pathmtu);
-		}
-		printl("\n");
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe sctp_recv_probe = {
-	.kp	= {
-		.symbol_name = "sctp_sf_eat_sack_6_2",
-	},
-	.entry	= jsctp_sf_eat_sack,
-};
-
-static __init int sctp_setup_jprobe(void)
-{
-	int ret = register_jprobe(&sctp_recv_probe);
-
-	if (ret) {
-		if (request_module("sctp"))
-			goto out;
-		ret = register_jprobe(&sctp_recv_probe);
-	}
-
-out:
-	return ret;
-}
-
-static __init int sctpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of sctp_sf_eat_sack_6_2,
-	 * has been changed, you also have to change the signature of
-	 * jsctp_sf_eat_sack, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2,
-				 jsctp_sf_eat_sack) == 0);
-
-	init_waitqueue_head(&sctpw.wait);
-	spin_lock_init(&sctpw.lock);
-	if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net,
-			 &sctpprobe_fops))
-		goto free_kfifo;
-
-	ret = sctp_setup_jprobe();
-	if (ret)
-		goto remove_proc;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
-
-remove_proc:
-	remove_proc_entry(procname, init_net.proc_net);
-free_kfifo:
-	kfifo_free(&sctpw.fifo);
-	return ret;
-}
-
-static __exit void sctpprobe_exit(void)
-{
-	kfifo_free(&sctpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&sctp_recv_probe);
-}
-
-module_init(sctpprobe_init);
-module_exit(sctpprobe_exit);
-- 
cgit v1.2.3


From ee549be6f061188f306133e3a66ce3d3c6758811 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:47:55 +0900
Subject: net: dccp: Add DCCP sendmsg trace event

Add a DCCP sendmsg trace event (dccp/dccp_probe) to
replace dccpprobe. Users can trace this event via
ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net_probe_common.h | 44 +++++++++++++++++
 include/trace/events/tcp.h              | 39 +--------------
 net/dccp/Makefile                       |  3 ++
 net/dccp/proto.c                        |  5 ++
 net/dccp/trace.h                        | 84 +++++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+), 38 deletions(-)
 create mode 100644 include/trace/events/net_probe_common.h
 create mode 100644 net/dccp/trace.h

(limited to 'net')

diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
new file mode 100644
index 000000000000..3930119cab08
--- /dev/null
+++ b/include/trace/events/net_probe_common.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_TRACE_NET_PROBE_COMMON_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_PROBE_COMMON_H
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
+	do {								\
+		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+									\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_sport;			\
+		v4->sin_addr.s_addr = inet->inet_saddr;			\
+		v4 = (void *)__entry->daddr;				\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_dport;			\
+		v4->sin_addr.s_addr = inet->inet_daddr;			\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+									\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_sport;		\
+			v6->sin6_addr = inet6_sk(sk)->saddr;		\
+			v6 = (void *)__entry->daddr;			\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_dport;		\
+			v6->sin6_addr = sk->sk_v6_daddr;		\
+		} else							\
+			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
+	} while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
+	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+#endif
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index b5ae3fbb74c8..878b2be7ce77 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -256,44 +256,7 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
-
-#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
-	do {								\
-		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
-									\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = inet->inet_sport;			\
-		v4->sin_addr.s_addr = inet->inet_saddr;			\
-		v4 = (void *)__entry->daddr;				\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = inet->inet_dport;			\
-		v4->sin_addr.s_addr = inet->inet_daddr;			\
-	} while (0)
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
-	do {								\
-		if (sk->sk_family == AF_INET6) {			\
-			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
-									\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = inet->inet_sport;		\
-			v6->sin6_addr = inet6_sk(sk)->saddr;		\
-			v6 = (void *)__entry->daddr;			\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = inet->inet_dport;		\
-			v6->sin6_addr = sk->sk_v6_daddr;		\
-		} else							\
-			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
-	} while (0)
-
-#else
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
-	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
-
-#endif
+#include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(tcp_probe,
 
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2e7b56097bc4..4215f13a63af 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -27,3 +27,6 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
 dccp_probe-y := probe.o
+
+# build with local directory for trace.h
+CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7a75a1d3568b..fa7e92e08920 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -38,6 +38,9 @@
 #include "dccp.h"
 #include "feat.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
 EXPORT_SYMBOL_GPL(dccp_statistics);
@@ -761,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int rc, size;
 	long timeo;
 
+	trace_dccp_probe(sk, len);
+
 	if (len > dp->dccps_mss_cache)
 		return -EMSGSIZE;
 
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
new file mode 100644
index 000000000000..5062421beee9
--- /dev/null
+++ b/net/dccp/trace.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dccp
+
+#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DCCP_H
+
+#include <net/sock.h>
+#include "dccp.h"
+#include "ccids/ccid3.h"
+#include <linux/tracepoint.h>
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(dccp_probe,
+
+	TP_PROTO(struct sock *sk, size_t size),
+
+	TP_ARGS(sk, size),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, size)
+		__field(__u16, tx_s)
+		__field(__u32, tx_rtt)
+		__field(__u32, tx_p)
+		__field(__u32, tx_x_calc)
+		__field(__u64, tx_x_recv)
+		__field(__u64, tx_x)
+		__field(__u32, tx_t_ipi)
+	),
+
+	TP_fast_assign(
+		const struct inet_sock *inet = inet_sk(sk);
+		struct ccid3_hc_tx_sock *hc = NULL;
+
+		if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+			hc = ccid3_hc_tx_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		__entry->size = size;
+		if (hc) {
+			__entry->tx_s = hc->tx_s;
+			__entry->tx_rtt = hc->tx_rtt;
+			__entry->tx_p = hc->tx_p;
+			__entry->tx_x_calc = hc->tx_x_calc;
+			__entry->tx_x_recv = hc->tx_x_recv >> 6;
+			__entry->tx_x = hc->tx_x >> 6;
+			__entry->tx_t_ipi = hc->tx_t_ipi;
+		} else {
+			__entry->tx_s = 0;
+			memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi -
+			       (void *)&__entry->tx_rtt +
+			       sizeof(__entry->tx_t_ipi));
+		}
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
+		  "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
+		  __entry->saddr, __entry->daddr, __entry->size,
+		  __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
+		  __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
+		  __entry->tx_t_ipi)
+);
+
+#endif /* _TRACE_DCCP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From a56c1470c2d589069504907c82d0044037124f66 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 29 Dec 2017 11:48:25 +0900
Subject: net: dccp: Remove dccpprobe module

Remove the DCCP probe module since jprobes have been deprecated.
Its functionality is now provided by the dccp/dccp_probe trace event,
which can be used via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/Kconfig  |  17 -----
 net/dccp/Makefile |   2 -
 net/dccp/probe.c  | 203 ------------------------------------------------------
 3 files changed, 222 deletions(-)
 delete mode 100644 net/dccp/probe.c

(limited to 'net')

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 8c0ef71bed2f..b270e84d9c13 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -39,23 +39,6 @@ config IP_DCCP_DEBUG
 
 	  Just say N.
 
-config NET_DCCPPROBE
-	tristate "DCCP connection probing"
-	depends on PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to DCCP connection
-	state in response to incoming packets. It is used for debugging
-	DCCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use DCCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called dccp_probe.
-
 
 endmenu
 
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 4215f13a63af..5b4ff37bc806 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -21,12 +21,10 @@ obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
 dccp_ipv6-y := ipv6.o
 
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
 dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
-dccp_probe-y := probe.o
 
 # build with local directory for trace.h
 CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
deleted file mode 100644
index 3d3fda05b32d..000000000000
--- a/net/dccp/probe.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * dccp_probe - Observe the DCCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for DCCP from Stephen Hemminger's code
- * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/dccp.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/vmalloc.h>
-#include <linux/time64.h>
-#include <linux/gfp.h>
-#include <net/net_namespace.h>
-
-#include "dccp.h"
-#include "ccid.h"
-#include "ccids/ccid3.h"
-
-static int port;
-
-static int bufsize = 64 * 1024;
-
-static const char procname[] = "dccpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} dccpw;
-
-static void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	struct timespec64 now;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	getnstimeofday64(&now);
-
-	now = timespec64_sub(now, dccpw.tstart);
-
-	len = sprintf(tbuf, "%lu.%06lu ",
-		      (unsigned long) now.tv_sec,
-		      (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	wake_up(&dccpw.wait);
-}
-
-static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	struct ccid3_hc_tx_sock *hc = NULL;
-
-	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
-		hc = ccid3_hc_tx_sk(sk);
-
-	if (port == 0 || ntohs(inet->inet_dport) == port ||
-	    ntohs(inet->inet_sport) == port) {
-		if (hc)
-			printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport), size,
-			       hc->tx_s, hc->tx_rtt, hc->tx_p,
-			       hc->tx_x_calc, hc->tx_x_recv >> 6,
-			       hc->tx_x >> 6, hc->tx_t_ipi);
-		else
-			printl("%pI4:%u %pI4:%u %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport),
-			       size);
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe dccp_send_probe = {
-	.kp	= {
-		.symbol_name = "dccp_sendmsg",
-	},
-	.entry	= jdccp_sendmsg,
-};
-
-static int dccpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&dccpw.fifo);
-	getnstimeofday64(&dccpw.tstart);
-	return 0;
-}
-
-static ssize_t dccpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(dccpw.wait,
-					 kfifo_len(&dccpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations dccpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = dccpprobe_open,
-	.read    = dccpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int dccpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	init_waitqueue_head(&dccpw.wait);
-	spin_lock_init(&dccpw.lock);
-	if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&dccp_send_probe);
-	if (ret) {
-		ret = request_module("dccp");
-		if (!ret)
-			ret = register_jprobe(&dccp_send_probe);
-	}
-
-	if (ret)
-		goto err1;
-
-	pr_info("DCCP watch registered (port=%d)\n", port);
-	return 0;
-err1:
-	remove_proc_entry(procname, init_net.proc_net);
-err0:
-	kfifo_free(&dccpw.fifo);
-	return ret;
-}
-module_init(dccpprobe_init);
-
-static __exit void dccpprobe_exit(void)
-{
-	kfifo_free(&dccpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&dccp_send_probe);
-
-}
-module_exit(dccpprobe_exit);
-
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
-MODULE_DESCRIPTION("DCCP snooper");
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 21602e1a5578925f610155a0bcf056d1ea9de964 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 29 Dec 2017 11:05:45 -0800
Subject: net: dsa: Fix dsa_legacy_register() return value

We need to make the dsa_legacy_register() stub return 0 in order for
dsa_init_module() to successfully register and continue registering the
ETH_P_XDSA packet handler.

Fixes: 2a93c1a3651f ("net: dsa: Allow compiling out legacy support")
Reported-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index b03665e8fb4e..cefb0c3c6d51 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -103,7 +103,7 @@ void dsa_legacy_unregister(void);
 #else
 static inline int dsa_legacy_register(void)
 {
-	return -ENODEV;
+	return 0;
 }
 
 static inline void dsa_legacy_unregister(void) { }
-- 
cgit v1.2.3


From 3a3713ec360138f806c6fc368d1de570f692b347 Mon Sep 17 00:00:00 2001
From: Peter Große <pegro@friiks.de>
Date: Wed, 13 Dec 2017 18:29:46 +0100
Subject: mac80211: Fix setting TX power on monitor interfaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of calling ieee80211_recalc_txpower on monitor interfaces
directly, call it using the virtual monitor interface, if one exists.

If a single monitor interface is given, reject setting the TX power
when no virtual monitor interface exists.

With that check in place, don't warn in ieee80211_bss_info_change_notify
after setting the TX power on a monitor interface.

Fixes warning:
------------[ cut here ]------------
 WARNING: CPU: 0 PID: 2193 at net/mac80211/driver-ops.h:167
 ieee80211_bss_info_change_notify+0x111/0x190 Modules linked in: uvcvideo
 videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_core
rndis_host cdc_ether usbnet mii tp_smapi(O) thinkpad_ec(O) ohci_hcd vboxpci(O)
 vboxnetadp(O) vboxnetflt(O) vboxdrv(O) x86_pkg_temp_thermal kvm_intel kvm
 irqbypass iwldvm iwlwifi ehci_pci ehci_hcd tpm_tis tpm_tis_core tpm CPU: 0
 PID: 2193 Comm: iw Tainted: G           O    4.12.12-gentoo #2 task:
 ffff880186fd5cc0 task.stack: ffffc90001b54000 RIP:
 0010:ieee80211_bss_info_change_notify+0x111/0x190 RSP: 0018:ffffc90001b57a10
 EFLAGS: 00010246 RAX: 0000000000000006 RBX: ffff8801052ce840 RCX:
 0000000000000064 RDX: 00000000fffffffc RSI: 0000000000040000 RDI:
 ffff8801052ce840 RBP: ffffc90001b57a38 R08: 0000000000000062 R09:
 0000000000000000 R10: ffff8802144b5000 R11: ffff880049dc4614 R12:
 0000000000040000 R13: 0000000000000064 R14: ffff8802105f0760 R15:
 ffffc90001b57b48 FS:  00007f92644b4580(0000) GS:ffff88021e200000(0000)
 knlGS:0000000000000000 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f9263c109f0 CR3: 00000001df850000 CR4: 00000000000406f0
 Call Trace:
  ieee80211_recalc_txpower+0x33/0x40
  ieee80211_set_tx_power+0x40/0x180
  nl80211_set_wiphy+0x32e/0x950

Reported-by: Peter Große <pegro@friiks.de>
Signed-off-by: Peter Große <pegro@friiks.de>

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c        | 28 +++++++++++++++++++++++++++-
 net/mac80211/driver-ops.h |  3 ++-
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b77ee342b5f8..46028e12e216 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2376,10 +2376,17 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
 	struct ieee80211_sub_if_data *sdata;
 	enum nl80211_tx_power_setting txp_type = type;
 	bool update_txp_type = false;
+	bool has_monitor = false;
 
 	if (wdev) {
 		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
 
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+			sdata = rtnl_dereference(local->monitor_sdata);
+			if (!sdata)
+				return -EOPNOTSUPP;
+		}
+
 		switch (type) {
 		case NL80211_TX_POWER_AUTOMATIC:
 			sdata->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
@@ -2418,15 +2425,34 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
 
 	mutex_lock(&local->iflist_mtx);
 	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+			has_monitor = true;
+			continue;
+		}
 		sdata->user_power_level = local->user_power_level;
 		if (txp_type != sdata->vif.bss_conf.txpower_type)
 			update_txp_type = true;
 		sdata->vif.bss_conf.txpower_type = txp_type;
 	}
-	list_for_each_entry(sdata, &local->interfaces, list)
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
+			continue;
 		ieee80211_recalc_txpower(sdata, update_txp_type);
+	}
 	mutex_unlock(&local->iflist_mtx);
 
+	if (has_monitor) {
+		sdata = rtnl_dereference(local->monitor_sdata);
+		if (sdata) {
+			sdata->user_power_level = local->user_power_level;
+			if (txp_type != sdata->vif.bss_conf.txpower_type)
+				update_txp_type = true;
+			sdata->vif.bss_conf.txpower_type = txp_type;
+
+			ieee80211_recalc_txpower(sdata, update_txp_type);
+		}
+	}
+
 	return 0;
 }
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index c7f93fd9ca7a..4d82fe7d627c 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -165,7 +165,8 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
 	if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
 			 sdata->vif.type == NL80211_IFTYPE_NAN ||
 			 (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
-			  !sdata->vif.mu_mimo_owner)))
+			  !sdata->vif.mu_mimo_owner &&
+			  !(changed & BSS_CHANGED_TXPOWER))))
 		return;
 
 	if (!check_sdata_in_driver(sdata))
-- 
cgit v1.2.3


From adb552c31915415fdb374172085f174f459727ea Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Sun, 30 Jul 2017 23:51:01 +0200
Subject: can: raw: raw_bind(): bail out if can_family is not AF_CAN

Until now, CAN raw's bind() didn't check whether the can_family in the
struct sockaddr_can is set to AF_CAN. This patch adds the missing check.

Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/raw.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/can/raw.c b/net/can/raw.c
index 864c80dbdb72..f2ecc43376a1 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -401,6 +401,8 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 
 	if (len < sizeof(*addr))
 		return -EINVAL;
+	if (addr->can_family != AF_CAN)
+		return -EINVAL;
 
 	lock_sock(sk);
 
-- 
cgit v1.2.3


From ff847ee47be27621f978921919f035fcd87d6d08 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Sat, 3 Jun 2017 20:10:03 +0200
Subject: can: af_can: give struct holding the CAN per device receive lists a
 sensible name

This patch adds a "can_" prefix to the "struct dev_rcv_lists" to better
reflect the meaning and improve code readability.

The conversion is done with:

	sed -i \
		-e "s/struct dev_rcv_lists/struct can_dev_rcv_lists/g" \
		net/can/*.[ch] include/net/netns/can.h

Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/net/netns/can.h |  4 ++--
 net/can/af_can.c        | 20 ++++++++++----------
 net/can/af_can.h        |  2 +-
 net/can/proc.c          |  8 ++++----
 4 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/can.h b/include/net/netns/can.h
index ecf238b8862c..ca9bd9fba5b5 100644
--- a/include/net/netns/can.h
+++ b/include/net/netns/can.h
@@ -8,7 +8,7 @@
 
 #include <linux/spinlock.h>
 
-struct dev_rcv_lists;
+struct can_dev_rcv_lists;
 struct s_stats;
 struct s_pstats;
 
@@ -28,7 +28,7 @@ struct netns_can {
 #endif
 
 	/* receive filters subscribed for 'all' CAN devices */
-	struct dev_rcv_lists *can_rx_alldev_list;
+	struct can_dev_rcv_lists *can_rx_alldev_list;
 	spinlock_t can_rcvlists_lock;
 	struct timer_list can_stattimer;/* timer for statistics update */
 	struct s_stats *can_stats;	/* packet statistics */
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 003b2d6d655f..f22b886ed081 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -321,13 +321,13 @@ EXPORT_SYMBOL(can_send);
  * af_can rx path
  */
 
-static struct dev_rcv_lists *find_dev_rcv_lists(struct net *net,
+static struct can_dev_rcv_lists *find_dev_rcv_lists(struct net *net,
 						struct net_device *dev)
 {
 	if (!dev)
 		return net->can.can_rx_alldev_list;
 	else
-		return (struct dev_rcv_lists *)dev->ml_priv;
+		return (struct can_dev_rcv_lists *)dev->ml_priv;
 }
 
 /**
@@ -381,7 +381,7 @@ static unsigned int effhash(canid_t can_id)
  *  Reduced can_id to have a preprocessed filter compare value.
  */
 static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
-					struct dev_rcv_lists *d)
+					struct can_dev_rcv_lists *d)
 {
 	canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
 
@@ -464,7 +464,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
 {
 	struct receiver *r;
 	struct hlist_head *rl;
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 	struct s_pstats *can_pstats = net->can.can_pstats;
 	int err = 0;
 
@@ -542,7 +542,7 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
 	struct receiver *r = NULL;
 	struct hlist_head *rl;
 	struct s_pstats *can_pstats = net->can.can_pstats;
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 
 	if (dev && dev->type != ARPHRD_CAN)
 		return;
@@ -615,7 +615,7 @@ static inline void deliver(struct sk_buff *skb, struct receiver *r)
 	r->matches++;
 }
 
-static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb)
+static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
 {
 	struct receiver *r;
 	int matches = 0;
@@ -682,7 +682,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb)
 
 static void can_receive(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 	struct net *net = dev_net(dev);
 	struct s_stats *can_stats = net->can.can_stats;
 	int matches;
@@ -829,7 +829,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
 			void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 
 	if (dev->type != ARPHRD_CAN)
 		return NOTIFY_DONE;
@@ -874,7 +874,7 @@ static int can_pernet_init(struct net *net)
 {
 	spin_lock_init(&net->can.can_rcvlists_lock);
 	net->can.can_rx_alldev_list =
-		kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL);
+		kzalloc(sizeof(struct can_dev_rcv_lists), GFP_KERNEL);
 	if (!net->can.can_rx_alldev_list)
 		goto out;
 	net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL);
@@ -920,7 +920,7 @@ static void can_pernet_exit(struct net *net)
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
 		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
-			struct dev_rcv_lists *d = dev->ml_priv;
+			struct can_dev_rcv_lists *d = dev->ml_priv;
 
 			BUG_ON(d->entries);
 			kfree(d);
diff --git a/net/can/af_can.h b/net/can/af_can.h
index eca6463c6213..9cb3719632bd 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -67,7 +67,7 @@ struct receiver {
 enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };
 
 /* per device receive filters linked at dev->ml_priv */
-struct dev_rcv_lists {
+struct can_dev_rcv_lists {
 	struct hlist_head rx[RX_MAX];
 	struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
 	struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
diff --git a/net/can/proc.c b/net/can/proc.c
index 0c59f876fe6f..45e38a3085bc 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -338,7 +338,7 @@ static const struct file_operations can_version_proc_fops = {
 
 static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
 					     struct net_device *dev,
-					     struct dev_rcv_lists *d)
+					     struct can_dev_rcv_lists *d)
 {
 	if (!hlist_empty(&d->rx[idx])) {
 		can_print_recv_banner(m);
@@ -353,7 +353,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
 	/* double cast to prevent GCC warning */
 	int idx = (int)(long)PDE_DATA(m->file->f_inode);
 	struct net_device *dev;
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 	struct net *net = m->private;
 
 	seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
@@ -417,7 +417,7 @@ static inline void can_rcvlist_proc_show_array(struct seq_file *m,
 static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
 {
 	struct net_device *dev;
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 	struct net *net = m->private;
 
 	/* RX_SFF */
@@ -461,7 +461,7 @@ static const struct file_operations can_rcvlist_sff_proc_fops = {
 static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
 {
 	struct net_device *dev;
-	struct dev_rcv_lists *d;
+	struct can_dev_rcv_lists *d;
 	struct net *net = m->private;
 
 	/* RX_EFF */
-- 
cgit v1.2.3


From 863def15b9755d9016df4d93addf3127f1dc67f4 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 3 Jan 2018 22:48:04 +0000
Subject: l2tp: revert "l2tp: add peer_offset parameter"

Revert commit f15bc54eeecd ("l2tp: add peer_offset parameter"). This
is removed because it is adding another configurable offset and
configurable offsets are being removed.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  1 -
 net/l2tp/l2tp_core.c      |  3 +--
 net/l2tp/l2tp_core.h      | 13 +++----------
 net/l2tp/l2tp_debugfs.c   |  8 +++-----
 net/l2tp/l2tp_netlink.c   | 21 +--------------------
 5 files changed, 8 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index d6fee55dbded..d84ce5c1c9aa 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -127,7 +127,6 @@ enum {
 	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* flag */
 	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* flag */
 	L2TP_ATTR_PAD,
-	L2TP_ATTR_PEER_OFFSET,		/* u16 */
 	__L2TP_ATTR_MAX,
 };
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 6ff64717da1e..115918ad8eca 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			ptr += 2 + offset;
 		}
 	} else
-		ptr += session->peer_offset;
+		ptr += session->offset;
 
 	offset = ptr - optr;
 	if (!pskb_may_pull(skb, offset))
@@ -1785,7 +1785,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
 			session->offset = cfg->offset;
-			session->peer_offset = cfg->peer_offset;
 			session->l2specific_type = cfg->l2specific_type;
 			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c6fe7cc42a05..9534e16965cc 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,8 +59,7 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			offset;		/* offset to tx payload */
-	u16			peer_offset;	/* offset to rx payload */
+	u16			offset;		/* offset to payload */
 	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
@@ -87,14 +86,8 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			offset;		/* offset from end of L2TP
-						 * header to beginning of
-						 * tx data
-						 */
-	u16			peer_offset;	/* offset from end of L2TP
-						 * header to beginning of
-						 * rx data
-						 */
+	u16			offset;		/* offset from end of L2TP header
+						   to beginning of data */
 	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 4cc30b38aba4..eb69411bcb47 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -180,9 +180,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
-	seq_printf(m, "   offset %hu peer_offset %hu l2specific %hu/%hu\n",
-		   session->offset, session->peer_offset,
-		   session->l2specific_type, session->l2specific_len);
+	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
+		   session->offset, session->l2specific_type, session->l2specific_len);
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
@@ -229,8 +228,7 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
 		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
 		seq_puts(m, "  SESSION ID, peer ID, PWTYPE\n");
 		seq_puts(m, "   refcnt cnt\n");
-		seq_puts(m, "   offset OFFSET peer_offset OFFSET");
-		seq_puts(m, " l2specific TYPE/LEN\n");
+		seq_puts(m, "   offset OFFSET l2specific TYPE/LEN\n");
 		seq_puts(m, "   [ cookie ]\n");
 		seq_puts(m, "   [ peer cookie ]\n");
 		seq_puts(m, "   config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index d7d4d7a7a54d..7e9c50125556 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -547,25 +547,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_PEER_OFFSET]) {
-			struct nlattr *peer_offset;
-
-			peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET];
-			cfg.peer_offset = nla_get_u16(peer_offset);
-		}
-
-		if (info->attrs[L2TP_ATTR_OFFSET]) {
+		if (info->attrs[L2TP_ATTR_OFFSET])
 			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
 
-			/* in order to maintain compatibility with older
-			 * versions where offset was used for both tx and
-			 * rx side, update rx side with offset if peer_offset
-			 * is not provided by userspace
-			 */
-			if (!info->attrs[L2TP_ATTR_PEER_OFFSET])
-				cfg.peer_offset = cfg.offset;
-		}
-
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
@@ -779,8 +763,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
 	    (session->offset &&
 	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
-	    (session->peer_offset &&
-	     nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
@@ -921,7 +903,6 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
 	[L2TP_ATTR_PW_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_ENCAP_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_OFFSET]		= { .type = NLA_U16, },
-	[L2TP_ATTR_PEER_OFFSET]		= { .type = NLA_U16, },
 	[L2TP_ATTR_DATA_SEQ]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_TYPE]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_LEN]		= { .type = NLA_U8, },
-- 
cgit v1.2.3


From de3b58bc359a861d5132300f53f95e83f71954b3 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 3 Jan 2018 22:48:05 +0000
Subject: l2tp: revert "l2tp: fix missing print session offset info"

Revert commit 820da5357572 ("l2tp: fix missing print session offset
info").  The peer_offset parameter is removed.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 7e9c50125556..a1f24fb2be98 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -761,8 +761,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 
 	if ((session->ifname[0] &&
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
-	    (session->offset &&
-	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
-- 
cgit v1.2.3


From 900631ee6a2651dc4fbaecb8ef9fa5f1e3378853 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 3 Jan 2018 22:48:06 +0000
Subject: l2tp: remove configurable payload offset

If L2TP_ATTR_OFFSET is set to a non-zero value in L2TPv3 tunnels, it
results in L2TPv3 packets being transmitted which might not be
compliant with the L2TPv3 RFC. This patch has l2tp ignore the offset
setting and send all packets with no offset.

In more detail:

L2TPv2 supports a variable offset from the L2TPv2 header to the
payload. The offset value is indicated by an optional field in the
L2TP header.  Our L2TP implementation already detects the presence of
the optional offset and skips that many bytes when handling data
received packets. All transmitted packets are always transmitted with
no offset.

L2TPv3 has no optional offset field in the L2TPv3 packet
header. Instead, L2TPv3 defines optional fields in a "Layer-2 Specific
Sublayer". At the time when the original L2TP code was written, there
was talk at IETF of offset being implemented in a new Layer-2 Specific
Sublayer. A L2TP_ATTR_OFFSET netlink attribute was added so that this
offset could be configured and the intention was to allow it to be
also used to set the tx offset for L2TPv2. However, no L2TPv3 offset
was ever specified and the L2TP_ATTR_OFFSET parameter was forgotten
about.

Setting L2TP_ATTR_OFFSET results in L2TPv3 packets being transmitted
with the specified number of bytes padding between L2TPv3 header and
payload. This is not compliant with L2TPv3 RFC3931. This change
removes the configurable offset altogether while retaining
L2TP_ATTR_OFFSET for backwards compatibility. Any L2TP_ATTR_OFFSET
value is ignored.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c    | 14 ++++----------
 net/l2tp/l2tp_core.h    |  3 ---
 net/l2tp/l2tp_debugfs.c |  4 ++--
 net/l2tp/l2tp_netlink.c |  3 ---
 4 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 115918ad8eca..786cd7f6a5e8 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -780,10 +780,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 		}
 	}
 
-	/* Session data offset is handled differently for L2TPv2 and
-	 * L2TPv3. For L2TPv2, there is an optional 16-bit value in
-	 * the header. For L2TPv3, the offset is negotiated using AVPs
-	 * in the session setup control protocol.
+	/* Session data offset is defined only for L2TPv2 and is
+	 * indicated by an optional 16-bit value in the header.
 	 */
 	if (tunnel->version == L2TP_HDR_VER_2) {
 		/* If offset bit set, skip it. */
@@ -791,8 +789,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			offset = ntohs(*(__be16 *)ptr);
 			ptr += 2 + offset;
 		}
-	} else
-		ptr += session->offset;
+	}
 
 	offset = ptr - optr;
 	if (!pskb_may_pull(skb, offset))
@@ -1068,8 +1065,6 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
 		}
 		bufp += session->l2specific_len;
 	}
-	if (session->offset)
-		bufp += session->offset;
 
 	return bufp - optr;
 }
@@ -1734,7 +1729,7 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version)
 		if (session->send_seq)
 			session->hdr_len += 4;
 	} else {
-		session->hdr_len = 4 + session->cookie_len + session->l2specific_len + session->offset;
+		session->hdr_len = 4 + session->cookie_len + session->l2specific_len;
 		if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP)
 			session->hdr_len += 4;
 	}
@@ -1784,7 +1779,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->recv_seq = cfg->recv_seq;
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
-			session->offset = cfg->offset;
 			session->l2specific_type = cfg->l2specific_type;
 			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 9534e16965cc..c2e9bbd79b35 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,7 +59,6 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			offset;		/* offset to payload */
 	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
@@ -86,8 +85,6 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			offset;		/* offset from end of L2TP header
-						   to beginning of data */
 	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index eb69411bcb47..2c30587d1a14 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -180,8 +180,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
-	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
-		   session->offset, session->l2specific_type, session->l2specific_len);
+	seq_printf(m, "   offset 0 l2specific %hu/%hu\n",
+		   session->l2specific_type, session->l2specific_len);
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index a1f24fb2be98..e1ca29f79821 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -547,9 +547,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_OFFSET])
-			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
-
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
-- 
cgit v1.2.3


From e3f2c4a3db1413bebfd502f7ac94fb55e3ba8c84 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Wed, 3 Jan 2018 21:47:10 -0500
Subject: ip: do not set RFS core on error queue reads

We should only record RPS on normal reads and writes.
In single threaded processes, all calls record the same state. In
multi-threaded processes where a separate thread processes
errors, the RFS table mispredicts.

Note that, when CONFIG_RPS is disabled, sock_rps_record_flow
is a noop and no branch is added as a result of this patch.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index bab98a4fedad..54cccdd8b1e3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -790,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 	int addr_len = 0;
 	int err;
 
-	sock_rps_record_flow(sk);
+	if (likely(!(flags & MSG_ERRQUEUE)))
+		sock_rps_record_flow(sk);
 
 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
 				   flags & ~MSG_DONTWAIT, &addr_len);
-- 
cgit v1.2.3


From 0a38806f31729c8931383d2ce944115312855931 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Wed, 3 Jan 2018 21:47:11 -0500
Subject: net: revert "Update RFS target at poll for tcp/udp"

On multi-threaded processes, one common architecture is to have
one (or a small number of) threads polling sockets, and a
considerably larger pool of threads reading from and writing to the
sockets. When we set RPS core on tcp_poll() or udp_poll() we essentially
steer all packets of all the polled FDs to one (or small number of)
cores, creating a bottleneck and/or RPS misprediction.

Another common architecture is to shard FDs among threads pinned
to cores. In such a setting, setting RPS core in tcp_poll() and
udp_poll() is redundant because the RFS core is correctly
set in recvmsg and sendmsg.

Thus, revert the following commit:
c3f1dbaf6e28 ("net: Update RFS target at poll for tcp/udp").

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 --
 net/ipv4/udp.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7ac583a2b9fe..f68cb33d50d1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -498,8 +498,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_rps_record_flow(sk);
-
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	state = inet_sk_state_load(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9c0d1e1772e..db72619e07e4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2490,8 +2490,6 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
 		mask |= POLLIN | POLLRDNORM;
 
-	sock_rps_record_flow(sk);
-
 	/* Check for false positives due to checksum errors */
 	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
-- 
cgit v1.2.3


From bf08c34086d159edde5c54902dfa2caa4d9fbd8c Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 3 Jan 2018 22:13:00 -0800
Subject: net: dsa: Move padding into Broadcom tagger

Instead of having the different master network device drivers
potentially used by DSA/Broadcom tags, move the padding necessary for
the switches to accept short packets where it makes most sense: within
tag_brcm.c. This avoids multiplying the number of similar commits to
e.g: bgmac, bcmsysport, etc.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_brcm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index e6e0b7b6025c..2b06bb91318b 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -70,6 +70,18 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 	if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
 		return NULL;
 
+	/* The Ethernet switch we are interfaced with needs packets to be at
+	 * least 64 bytes (including FCS) otherwise they will be discarded when
+	 * they enter the switch port logic. When Broadcom tags are enabled, we
+	 * need to make sure that packets are at least 68 bytes
+	 * (including FCS and tag) because the length verification is done after
+	 * the Broadcom tag is stripped off the ingress packet.
+	 *
+	 * Let dsa_slave_xmit() free the SKB
+	 */
+	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
+		return NULL;
+
 	skb_push(skb, BRCM_TAG_LEN);
 
 	if (offset)
-- 
cgit v1.2.3


From 38266ca17c5f142a25d4563335e143cbd62e65a7 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 4 Jan 2018 15:20:44 +0100
Subject: tipc: some clarifying name changes

We rename some functions and variables, to make their purpose clearer.

- tipc_group::congested -> tipc_group::small_win. Members in this list
  are not necessarily (and typically not) congested. Instead, they may
  *potentially* be subject to congestion because their send window is
  less than ADV_IDLE, and therefore need to be checked during message
  transmission.

- tipc_group_is_receiver() -> tipc_group_is_sender(). This socket will
  accept messages coming from members fulfilling this condition, i.e.,
  they are senders from this member's viewpoint.

- tipc_group_is_enabled() -> tipc_group_is_receiver(). Members
  fulfilling this condition will accept messages sent from the current
  socket, i.e., they are receivers from its viewpoint.

There are no functional changes in this commit.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 8e12ab55346b..0d743b98823f 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -64,7 +64,7 @@ enum mbr_state {
 struct tipc_member {
 	struct rb_node tree_node;
 	struct list_head list;
-	struct list_head congested;
+	struct list_head small_win;
 	struct sk_buff *event_msg;
 	struct sk_buff_head deferredq;
 	struct tipc_group *group;
@@ -82,7 +82,7 @@ struct tipc_member {
 
 struct tipc_group {
 	struct rb_root members;
-	struct list_head congested;
+	struct list_head small_win;
 	struct list_head pending;
 	struct list_head active;
 	struct list_head reclaiming;
@@ -136,12 +136,12 @@ u16 tipc_group_bc_snd_nxt(struct tipc_group *grp)
 	return grp->bc_snd_nxt;
 }
 
-static bool tipc_group_is_enabled(struct tipc_member *m)
+static bool tipc_group_is_receiver(struct tipc_member *m)
 {
 	return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING;
 }
 
-static bool tipc_group_is_receiver(struct tipc_member *m)
+static bool tipc_group_is_sender(struct tipc_member *m)
 {
 	return m && m->state >= MBR_JOINED;
 }
@@ -168,7 +168,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	if (!grp)
 		return NULL;
 	tipc_nlist_init(&grp->dests, tipc_own_addr(net));
-	INIT_LIST_HEAD(&grp->congested);
+	INIT_LIST_HEAD(&grp->small_win);
 	INIT_LIST_HEAD(&grp->active);
 	INIT_LIST_HEAD(&grp->pending);
 	INIT_LIST_HEAD(&grp->reclaiming);
@@ -232,7 +232,7 @@ static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp,
 	struct tipc_member *m;
 
 	m = tipc_group_find_member(grp, node, port);
-	if (m && tipc_group_is_enabled(m))
+	if (m && tipc_group_is_receiver(m))
 		return m;
 	return NULL;
 }
@@ -285,7 +285,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp,
 	if (!m)
 		return NULL;
 	INIT_LIST_HEAD(&m->list);
-	INIT_LIST_HEAD(&m->congested);
+	INIT_LIST_HEAD(&m->small_win);
 	__skb_queue_head_init(&m->deferredq);
 	m->group = grp;
 	m->node = node;
@@ -314,7 +314,7 @@ static void tipc_group_delete_member(struct tipc_group *grp,
 		grp->bc_ackers--;
 
 	list_del_init(&m->list);
-	list_del_init(&m->congested);
+	list_del_init(&m->small_win);
 	tipc_group_decr_active(grp, m);
 
 	/* If last member on a node, remove node from dest list */
@@ -343,7 +343,7 @@ void tipc_group_update_member(struct tipc_member *m, int len)
 	struct tipc_group *grp = m->group;
 	struct tipc_member *_m, *tmp;
 
-	if (!tipc_group_is_enabled(m))
+	if (!tipc_group_is_receiver(m))
 		return;
 
 	m->window -= len;
@@ -351,16 +351,16 @@ void tipc_group_update_member(struct tipc_member *m, int len)
 	if (m->window >= ADV_IDLE)
 		return;
 
-	list_del_init(&m->congested);
+	list_del_init(&m->small_win);
 
-	/* Sort member into congested members' list */
-	list_for_each_entry_safe(_m, tmp, &grp->congested, congested) {
+	/* Sort member into small_window members' list */
+	list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) {
 		if (m->window > _m->window)
 			continue;
-		list_add_tail(&m->congested, &_m->congested);
+		list_add_tail(&m->small_win, &_m->small_win);
 		return;
 	}
-	list_add_tail(&m->congested, &grp->congested);
+	list_add_tail(&m->small_win, &grp->small_win);
 }
 
 void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack)
@@ -372,7 +372,7 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack)
 
 	for (n = rb_first(&grp->members); n; n = rb_next(n)) {
 		m = container_of(n, struct tipc_member, tree_node);
-		if (tipc_group_is_enabled(m)) {
+		if (tipc_group_is_receiver(m)) {
 			tipc_group_update_member(m, len);
 			m->bc_acked = prev;
 			ackers++;
@@ -427,10 +427,10 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len)
 	if (grp->bc_ackers)
 		return true;
 
-	if (list_empty(&grp->congested))
+	if (list_empty(&grp->small_win))
 		return false;
 
-	m = list_first_entry(&grp->congested, struct tipc_member, congested);
+	m = list_first_entry(&grp->small_win, struct tipc_member, small_win);
 	if (m->window >= len)
 		return false;
 
@@ -485,7 +485,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq,
 		goto drop;
 
 	m = tipc_group_find_member(grp, node, port);
-	if (!tipc_group_is_receiver(m))
+	if (!tipc_group_is_sender(m))
 		goto drop;
 
 	if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt))
@@ -691,7 +691,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			msg_set_grp_bc_seqno(ehdr, m->bc_syncpt);
 			__skb_queue_tail(inputq, m->event_msg);
 		}
-		list_del_init(&m->congested);
+		list_del_init(&m->small_win);
 		tipc_group_update_member(m, 0);
 		return;
 	case GRP_LEAVE_MSG:
@@ -699,7 +699,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			return;
 		m->bc_syncpt = msg_grp_bc_syncpt(hdr);
 		list_del_init(&m->list);
-		list_del_init(&m->congested);
+		list_del_init(&m->small_win);
 		*usr_wakeup = true;
 
 		/* Wait until WITHDRAW event is received */
@@ -719,7 +719,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		m->window += msg_adv_win(hdr);
 		*usr_wakeup = m->usr_pending;
 		m->usr_pending = false;
-		list_del_init(&m->congested);
+		list_del_init(&m->small_win);
 		return;
 	case GRP_ACK_MSG:
 		if (!m)
@@ -840,7 +840,7 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		if (m->window < ADV_IDLE)
 			tipc_group_update_member(m, 0);
 		else
-			list_del_init(&m->congested);
+			list_del_init(&m->small_win);
 	} else if (event == TIPC_WITHDRAWN) {
 		if (!m)
 			goto drop;
@@ -873,7 +873,7 @@ void tipc_group_member_evt(struct tipc_group *grp,
 			__skb_queue_tail(inputq, skb);
 		}
 		list_del_init(&m->list);
-		list_del_init(&m->congested);
+		list_del_init(&m->small_win);
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
 	return;
-- 
cgit v1.2.3


From d84d1b3b6b8c296de8d394002bc3a0ca910d7460 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 4 Jan 2018 15:20:45 +0100
Subject: tipc: simplify small window members' sorting algorithm

We simplify the sorting algorithm in tipc_group_update_member(). We
also make the remaining conditional call to this function
unconditional, since the same condition is now tested for inside the
said function.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 0d743b98823f..fb7fe971e51b 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -355,12 +355,10 @@ void tipc_group_update_member(struct tipc_member *m, int len)
 
 	/* Sort member into small_window members' list */
 	list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) {
-		if (m->window > _m->window)
-			continue;
-		list_add_tail(&m->small_win, &_m->small_win);
-		return;
+		if (_m->window > m->window)
+			break;
 	}
-	list_add_tail(&m->small_win, &grp->small_win);
+	list_add_tail(&m->small_win, &_m->small_win);
 }
 
 void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack)
@@ -837,10 +835,7 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		m->instance = instance;
 		TIPC_SKB_CB(skb)->orig_member = m->instance;
 		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
-		if (m->window < ADV_IDLE)
-			tipc_group_update_member(m, 0);
-		else
-			list_del_init(&m->small_win);
+		tipc_group_update_member(m, 0);
 	} else if (event == TIPC_WITHDRAWN) {
 		if (!m)
 			goto drop;
-- 
cgit v1.2.3


From c90ecbfaf50d2d7db25c531d9169be7e47435f3f Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 4 Jan 2018 06:52:59 -0800
Subject: rds: Use atomic flag to track connections being destroyed

Replace c_destroy_in_prog by using a bit in cp_flags that
can be set/tested atomically.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c  | 7 ++++---
 net/rds/rds.h         | 4 ++--
 net/rds/tcp_connect.c | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 6492c0b608a4..1eed197e694f 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -366,7 +366,7 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
 	 * to the conn hash, so we never trigger a reconnect on this
 	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work_sync(&cp->cp_conn_w);
-	if (conn->c_destroy_in_prog)
+	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
 		return;
 	rcu_read_lock();
 	if (!hlist_unhashed(&conn->c_hash_node)) {
@@ -384,6 +384,8 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
 {
 	struct rds_message *rm, *rtmp;
 
+	set_bit(RDS_DESTROY_PENDING, &cp->cp_flags);
+
 	if (!cp->cp_transport_data)
 		return;
 
@@ -426,7 +428,6 @@ void rds_conn_destroy(struct rds_connection *conn)
 		 "%pI4\n", conn, &conn->c_laddr,
 		 &conn->c_faddr);
 
-	conn->c_destroy_in_prog = 1;
 	/* Ensure conn will not be scheduled for reconnect */
 	spin_lock_irq(&rds_conn_lock);
 	hlist_del_init_rcu(&conn->c_hash_node);
@@ -685,7 +686,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
 {
 	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
 
-	if (!destroy && cp->cp_conn->c_destroy_in_prog)
+	if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
 		return;
 
 	queue_work(rds_wq, &cp->cp_down_w);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index d09f6c1facb4..374ae83b60d4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -88,6 +88,7 @@ enum {
 #define RDS_RECONNECT_PENDING	1
 #define RDS_IN_XMIT		2
 #define RDS_RECV_REFILL		3
+#define	RDS_DESTROY_PENDING	4
 
 /* Max number of multipaths per RDS connection. Must be a power of 2 */
 #define	RDS_MPATH_WORKERS	8
@@ -139,8 +140,7 @@ struct rds_connection {
 	__be32			c_faddr;
 	unsigned int		c_loopback:1,
 				c_ping_triggered:1,
-				c_destroy_in_prog:1,
-				c_pad_to_32:29;
+				c_pad_to_32:30;
 	int			c_npaths;
 	struct rds_connection	*c_passive;
 	struct rds_transport	*c_trans;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 46f74dad0e16..534c67aeb20f 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
 		 cp->cp_conn, tc, sock);
 
 	if (sock) {
-		if (cp->cp_conn->c_destroy_in_prog)
+		if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
 			rds_tcp_set_linger(sock);
 		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
 		lock_sock(sock->sk);
-- 
cgit v1.2.3


From 3db6e0d172c94bd9953a1347c55ffb64b1d2e74f Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 4 Jan 2018 06:53:00 -0800
Subject: rds: use RCU to synchronize work-enqueue with connection teardown

rds_sendmsg() can enqueue work on cp_send_w from process context, but
it should not enqueue this work if connection teardown has commenced
(else we risk enqueuing work after rds_conn_path_destroy() has assumed that
all work has been cancelled/flushed).

Similarly some other functions like rds_cong_queue_updates
and rds_tcp_data_ready are called in softirq context, and may end
up enqueuing work on rds_wq after rds_conn_path_destroy() has assumed
that all workqs are quiesced.

Check the RDS_DESTROY_PENDING bit and use rcu synchronization to avoid
all these races.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/cong.c       | 10 +++++++---
 net/rds/connection.c | 21 +++++++++++++++++----
 net/rds/send.c       | 37 ++++++++++++++++++++++++++++++++-----
 net/rds/tcp_recv.c   |  8 ++++++--
 net/rds/tcp_send.c   |  5 ++++-
 net/rds/threads.c    | 20 +++++++++++++++-----
 6 files changed, 81 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/rds/cong.c b/net/rds/cong.c
index 8398fee7c866..8d19fd25dce3 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -219,7 +219,11 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 	spin_lock_irqsave(&rds_cong_lock, flags);
 
 	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
-		if (!test_and_set_bit(0, &conn->c_map_queued)) {
+		struct rds_conn_path *cp = &conn->c_path[0];
+
+		rcu_read_lock();
+		if (!test_and_set_bit(0, &conn->c_map_queued) &&
+		    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
 			rds_stats_inc(s_cong_update_queued);
 			/* We cannot inline the call to rds_send_xmit() here
 			 * for two reasons (both pertaining to a TCP transport):
@@ -235,9 +239,9 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 			 *    therefore trigger warnings.
 			 * Defer the xmit to rds_send_worker() instead.
 			 */
-			queue_delayed_work(rds_wq,
-					   &conn->c_path[0].cp_send_w, 0);
+			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
 		}
+		rcu_read_unlock();
 	}
 
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 1eed197e694f..b10c0ef36d8d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -366,8 +366,6 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
 	 * to the conn hash, so we never trigger a reconnect on this
 	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work_sync(&cp->cp_conn_w);
-	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
-		return;
 	rcu_read_lock();
 	if (!hlist_unhashed(&conn->c_hash_node)) {
 		rcu_read_unlock();
@@ -390,6 +388,7 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
 		return;
 
 	/* make sure lingering queued work won't try to ref the conn */
+	synchronize_rcu();
 	cancel_delayed_work_sync(&cp->cp_send_w);
 	cancel_delayed_work_sync(&cp->cp_recv_w);
 
@@ -407,6 +406,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
 	if (cp->cp_xmit_rm)
 		rds_message_put(cp->cp_xmit_rm);
 
+	WARN_ON(delayed_work_pending(&cp->cp_send_w));
+	WARN_ON(delayed_work_pending(&cp->cp_recv_w));
+	WARN_ON(delayed_work_pending(&cp->cp_conn_w));
+	WARN_ON(work_pending(&cp->cp_down_w));
+
 	cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
 }
 
@@ -686,10 +690,13 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
 {
 	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
 
-	if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	rcu_read_lock();
+	if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		rcu_read_unlock();
 		return;
-
+	}
 	queue_work(rds_wq, &cp->cp_down_w);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_conn_path_drop);
 
@@ -706,9 +713,15 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
  */
 void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
 {
+	rcu_read_lock();
+	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		rcu_read_unlock();
+		return;
+	}
 	if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
 	    !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
 		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
 
diff --git a/net/rds/send.c b/net/rds/send.c
index f72466c63f0c..d3e32d1f3c7d 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -162,6 +162,12 @@ restart:
 		goto out;
 	}
 
+	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		release_in_xmit(cp);
+		ret = -ENETUNREACH; /* dont requeue send work */
+		goto out;
+	}
+
 	/*
 	 * we record the send generation after doing the xmit acquire.
 	 * if someone else manages to jump in and do some work, we'll use
@@ -437,7 +443,12 @@ over_batch:
 		    !list_empty(&cp->cp_send_queue)) && !raced) {
 			if (batch_count < send_batch_count)
 				goto restart;
-			queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+			rcu_read_lock();
+			if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+				ret = -ENETUNREACH;
+			else
+				queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+			rcu_read_unlock();
 		} else if (raced) {
 			rds_stats_inc(s_send_lock_queue_raced);
 		}
@@ -1151,6 +1162,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	else
 		cpath = &conn->c_path[0];
 
+	if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
 	rds_conn_path_connect_if_down(cpath);
 
 	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
@@ -1190,9 +1206,17 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	rds_stats_inc(s_send_queued);
 
 	ret = rds_send_xmit(cpath);
-	if (ret == -ENOMEM || ret == -EAGAIN)
-		queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
-
+	if (ret == -ENOMEM || ret == -EAGAIN) {
+		ret = 0;
+		rcu_read_lock();
+		if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags))
+			ret = -ENETUNREACH;
+		else
+			queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+		rcu_read_unlock();
+	}
+	if (ret)
+		goto out;
 	rds_message_put(rm);
 	return payload_len;
 
@@ -1270,7 +1294,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 	rds_stats_inc(s_send_pong);
 
 	/* schedule the send work on rds_wq */
-	queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+	rcu_read_lock();
+	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+	rcu_read_unlock();
 
 	rds_message_put(rm);
 	return 0;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index e006ef8e6d40..dd707b9e73e5 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -321,8 +321,12 @@ void rds_tcp_data_ready(struct sock *sk)
 	ready = tc->t_orig_data_ready;
 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
 
-	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM)
-		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
+		rcu_read_lock();
+		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+		rcu_read_unlock();
+	}
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
 	ready(sk);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index dc860d1bb608..73c74763ca72 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -202,8 +202,11 @@ void rds_tcp_write_space(struct sock *sk)
 	tc->t_last_seen_una = rds_tcp_snd_una(tc);
 	rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked);
 
-	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+	rcu_read_lock();
+	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
+	    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+	rcu_read_unlock();
 
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index f121daa402c8..eb76db1360b0 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -87,8 +87,12 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
 
 	cp->cp_reconnect_jiffies = 0;
 	set_bit(0, &cp->cp_conn->c_map_queued);
-	queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
-	queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+	rcu_read_lock();
+	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_connect_path_complete);
 
@@ -133,7 +137,10 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
 	if (cp->cp_reconnect_jiffies == 0) {
 		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
-		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+		rcu_read_lock();
+		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+			queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+		rcu_read_unlock();
 		return;
 	}
 
@@ -141,8 +148,11 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
 		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
 		 conn, &conn->c_laddr, &conn->c_faddr);
-	queue_delayed_work(rds_wq, &cp->cp_conn_w,
-			   rand % cp->cp_reconnect_jiffies);
+	rcu_read_lock();
+	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		queue_delayed_work(rds_wq, &cp->cp_conn_w,
+				   rand % cp->cp_reconnect_jiffies);
+	rcu_read_unlock();
 
 	cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
 					rds_sysctl_reconnect_max_jiffies);
-- 
cgit v1.2.3


From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:13 +0100
Subject: xdp: base API for new XDP rx-queue info concept

This patch only introduces the core data structures and API functions.
All XDP enabled drivers must use the API before this info can be used.

There is a need for XDP to know more about the RX-queue a given XDP
frame has arrived on, for both the XDP bpf-prog and the kernel side.

Instead of extending xdp_buff each time new info is needed, the patch
creates a separate read-mostly struct xdp_rxq_info, that contains this
info.  We stress this data/cache-line is for read-only info.  This is
NOT for dynamic per packet info, use the data_meta for such use-cases.

The performance advantage is this info can be setup at RX-ring init
time, instead of updating N-members in xdp_buff.  A possible (driver
level) micro optimization is that xdp_buff->rxq assignment could be
done once per XDP/NAPI loop.  The extra pointer deref only happens for
program needing access to this info (thus, no slowdown to existing
use-cases).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  2 ++
 include/net/xdp.h      | 47 +++++++++++++++++++++++++++++++++++
 net/core/Makefile      |  2 +-
 net/core/xdp.c         | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 include/net/xdp.h
 create mode 100644 net/core/xdp.c

(limited to 'net')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b0df2703671..425056c7f96c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,6 +20,7 @@
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
 
+#include <net/xdp.h>
 #include <net/sch_generic.h>
 
 #include <uapi/linux/filter.h>
@@ -503,6 +504,7 @@ struct xdp_buff {
 	void *data_end;
 	void *data_meta;
 	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
 };
 
 /* Compute the linear packet data range [data, data_end) which
diff --git a/include/net/xdp.h b/include/net/xdp.h
new file mode 100644
index 000000000000..86c41631a908
--- /dev/null
+++ b/include/net/xdp.h
@@ -0,0 +1,47 @@
+/* include/net/xdp.h
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+#ifndef __LINUX_NET_XDP_H__
+#define __LINUX_NET_XDP_H__
+
+/**
+ * DOC: XDP RX-queue information
+ *
+ * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
+ * level RX-ring queues.  It is information that is specific to how
+ * the driver have configured a given RX-ring queue.
+ *
+ * Each xdp_buff frame received in the driver carry a (pointer)
+ * reference to this xdp_rxq_info structure.  This provides the XDP
+ * data-path read-access to RX-info for both kernel and bpf-side
+ * (limited subset).
+ *
+ * For now, direct access is only safe while running in NAPI/softirq
+ * context.  Contents is read-mostly and must not be updated during
+ * driver NAPI/softirq poll.
+ *
+ * The driver usage API is a register and unregister API.
+ *
+ * The struct is not directly tied to the XDP prog.  A new XDP prog
+ * can be attached as long as it doesn't change the underlying
+ * RX-ring.  If the RX-ring does change significantly, the NIC driver
+ * naturally need to stop the RX-ring before purging and reallocating
+ * memory.  In that process the driver MUST call unregistor (which
+ * also apply for driver shutdown and unload).  The register API is
+ * also mandatory during RX-ring setup.
+ */
+
+struct xdp_rxq_info {
+	struct net_device *dev;
+	u32 queue_index;
+	u32 reg_state;
+} ____cacheline_aligned; /* perf critical, avoid false-sharing */
+
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index);
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+
+#endif /* __LINUX_NET_XDP_H__ */
diff --git a/net/core/Makefile b/net/core/Makefile
index 1fd0a9c88b1b..6dbbba8c57ae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
-			fib_notifier.o
+			fib_notifier.o xdp.o
 
 obj-y += net-sysfs.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000000..229bc5a0ee04
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,67 @@
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include <net/xdp.h>
+
+#define REG_STATE_NEW		0x0
+#define REG_STATE_REGISTERED	0x1
+#define REG_STATE_UNREGISTERED	0x2
+#define REG_STATE_UNUSED	0x3
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+	/* Simplify driver cleanup code paths, allow unreg "unused" */
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+		return;
+
+	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+
+	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+	xdp_rxq->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+	memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index)
+{
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+		WARN(1, "Driver promised not to register this");
+		return -EINVAL;
+	}
+
+	if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+		WARN(1, "Missing unregister, handled but fix driver");
+		xdp_rxq_info_unreg(xdp_rxq);
+	}
+
+	if (!dev) {
+		WARN(1, "Missing net_device from driver");
+		return -ENODEV;
+	}
+
+	/* State either UNREGISTERED or NEW */
+	xdp_rxq_info_init(xdp_rxq);
+	xdp_rxq->dev = dev;
+	xdp_rxq->queue_index = queue_index;
+
+	xdp_rxq->reg_state = REG_STATE_REGISTERED;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+	xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
-- 
cgit v1.2.3


From c0124f327e5cabd844a10d7e1fc5aa2a81e796a9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:34 +0100
Subject: xdp/qede: setup xdp_rxq_info and intro xdp_rxq_info_is_reg

The driver code qede_free_fp_array() depends on the fact that kfree() can
be called with a NULL pointer. This stems from the qede_alloc_fp_array()
function which (kz)allocs memory for either fp->txq or fp->rxq.
This also simplifies error handling code in case of memory allocation
failures, but xdp_rxq_info_unreg needs to know the difference.

Introduce xdp_rxq_info_is_reg() to handle the case where a memory
allocation fails, and detect that failure path by seeing that xdp_rxq_info
was not registered yet; registration first happens after successful
allocation in qede_init_fp().

Driver hook points for xdp_rxq_info:
 * reg  : qede_init_fp
 * unreg: qede_free_fp_array

Tested on actual hardware with samples/bpf program.

V2: Driver has no proper error path for failed XDP RX-queue info reg, as
qede_init_fp() is a void function.

Cc: everest-linux-l2@cavium.com
Cc: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/qlogic/qede/qede.h      |  2 ++
 drivers/net/ethernet/qlogic/qede/qede_fp.c   |  1 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 10 ++++++++++
 include/net/xdp.h                            |  1 +
 net/core/xdp.c                               |  6 ++++++
 5 files changed, 20 insertions(+)

(limited to 'net')

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 8a336517baac..8116cfd30fad 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -40,6 +40,7 @@
 #include <linux/kernel.h>
 #include <linux/mutex.h>
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/qed/qede_rdma.h>
 #include <linux/io.h>
 #ifdef CONFIG_RFS_ACCEL
@@ -345,6 +346,7 @@ struct qede_rx_queue {
 	u64 xdp_no_pass;
 
 	void *handle;
+	struct xdp_rxq_info xdp_rxq;
 };
 
 union db_prod {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 48ec4c56cddf..dafc079ab6b9 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1006,6 +1006,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 	xdp.data = xdp.data_hard_start + *data_offset;
 	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
+	xdp.rxq = &rxq->xdp_rxq;
 
 	/* Queues always have a full reset currently, so for the time
 	 * being until there's atomic program replace just mark read
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 90d79ae2a48f..9929b4370ce6 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -765,6 +765,12 @@ static void qede_free_fp_array(struct qede_dev *edev)
 			fp = &edev->fp_array[i];
 
 			kfree(fp->sb_info);
+			/* Handle mem alloc failure case where qede_init_fp
+			 * didn't register xdp_rxq_info yet.
+			 * Implicit only (fp->type & QEDE_FASTPATH_RX)
+			 */
+			if (fp->rxq && xdp_rxq_info_is_reg(&fp->rxq->xdp_rxq))
+				xdp_rxq_info_unreg(&fp->rxq->xdp_rxq);
 			kfree(fp->rxq);
 			kfree(fp->xdp_tx);
 			kfree(fp->txq);
@@ -1493,6 +1499,10 @@ static void qede_init_fp(struct qede_dev *edev)
 			else
 				fp->rxq->data_direction = DMA_FROM_DEVICE;
 			fp->rxq->dev = &edev->pdev->dev;
+
+			/* Driver have no error path from here */
+			WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
+						 fp->rxq->rxq_id) < 0);
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 86c41631a908..b2362ddfa694 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -43,5 +43,6 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 229bc5a0ee04..097a0f74e004 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -65,3 +65,9 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
 	xdp_rxq->reg_state = REG_STATE_UNUSED;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
-- 
cgit v1.2.3


From e817f85652c14d78f170b18797e4c477c78949e0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:09 +0100
Subject: xdp: generic XDP handling of xdp_rxq_info

Hook points for xdp_rxq_info:
 * reg  : netif_alloc_rx_queues
 * unreg: netif_free_rx_queues

The net_device has some members (num_rx_queues + real_num_rx_queues)
and data-area (dev->_rx with struct netdev_rx_queue's) that were
primarily used for exporting information about RPS (CONFIG_RPS) queues
to sysfs (CONFIG_SYSFS).

For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info,
and remove some of the CONFIG_SYSFS ifdefs.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 69 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 61 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49bfc6eec74c..440b000f07f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -44,6 +44,7 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
+#include <net/xdp.h>
 
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
@@ -686,6 +687,7 @@ struct netdev_rx_queue {
 #endif
 	struct kobject			kobj;
 	struct net_device		*dev;
+	struct xdp_rxq_info		xdp_rxq;
 } ____cacheline_aligned_in_smp;
 
 /*
diff --git a/net/core/dev.c b/net/core/dev.c
index 2eb66c0d9cdb..d7925ef8743d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3906,9 +3906,33 @@ drop:
 	return NET_RX_DROP;
 }
 
+static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct netdev_rx_queue *rxqueue;
+
+	rxqueue = dev->_rx;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+
+			return rxqueue; /* Return first rxqueue */
+		}
+		rxqueue += index;
+	}
+	return rxqueue;
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	struct netdev_rx_queue *rxqueue;
 	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
 	void *orig_data;
@@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
 
+	rxqueue = netif_get_rxqueue(skb);
+	xdp.rxq = &rxqueue->xdp_rxq;
+
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
 	off = xdp.data - orig_data;
@@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
 	struct netdev_rx_queue *rx;
 	size_t sz = count * sizeof(*rx);
+	int err = 0;
 
 	BUG_ON(count < 1);
 
@@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 
 	dev->_rx = rx;
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		rx[i].dev = dev;
+
+		/* XDP RX-queue setup */
+		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+		if (err < 0)
+			goto err_rxq_info;
+	}
 	return 0;
+
+err_rxq_info:
+	/* Rollback successful reg's and free other resources */
+	while (i--)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+	kfree(dev->_rx);
+	dev->_rx = NULL;
+	return err;
+}
+
+static void netif_free_rx_queues(struct net_device *dev)
+{
+	unsigned int i, count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx;
+
+	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
+	if (!dev->_rx)
+		return;
+
+	rx = dev->_rx;
+
+	for (i = 0; i < count; i++)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 }
-#endif
 
 static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue, void *_unused)
@@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
 	}
-#endif
 
 	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
@@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
 		goto free_all;
-#endif
 
 	strcpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
@@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev)
 
 	might_sleep();
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_SYSFS
-	kvfree(dev->_rx);
-#endif
+	netif_free_rx_queues(dev);
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 
-- 
cgit v1.2.3


From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:14 +0100
Subject: bpf: finally expose xdp_rxq_info to XDP bpf-programs

Now all XDP drivers have been updated to set up xdp_rxq_info and assign
this to xdp_buff->rxq.  Thus, it is now safe to enable access to some
of the xdp_rxq_info struct members.

This patch extends xdp_md and exposes UAPI to userspace for
ingress_ifindex and rx_queue_index.  Access happens via bpf
instruction rewrite, which loads data directly from struct xdp_rxq_info.

* ingress_ifindex maps to xdp_rxq_info->dev->ifindex
* rx_queue_index  maps to xdp_rxq_info->queue_index

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36e2ad4..405317f9c064 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -899,6 +899,9 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
+	/* Below access go though struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
 enum sk_action {
diff --git a/net/core/filter.c b/net/core/filter.c
index 130b842c3a15..acdb94c0e97f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
+	case offsetof(struct xdp_md, ingress_ifindex):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct xdp_rxq_info, dev));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct net_device,
+						     ifindex, 4, target_size));
+		break;
+	case offsetof(struct xdp_md, rx_queue_index):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct xdp_rxq_info,
+						queue_index, 4, target_size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From 9fcb0714dc38250d3b56d28c0b5a0bde0d2a59f2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:01 +0200
Subject: ipv6: Remove redundant route flushing during namespace dismantle

By the time fib6_net_exit() is executed all the netdevs in the namespace
have been either unregistered or pushed back to the default namespace.
That is because pernet subsys operations are always ordered before
pernet device operations and therefore invoked after them during
namespace dismantle.

Thus, all the routing tables in the namespace are empty by the time
fib6_net_exit() is invoked and the call to rt6_ifdown() can be removed.

This allows us to simplify the condition in fib6_ifdown() as it's only
ever called with an actual netdev.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 1 -
 net/ipv6/route.c   | 8 +++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index a64d559fa513..3bbb89d8187d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2103,7 +2103,6 @@ static void fib6_net_exit(struct net *net)
 {
 	unsigned int i;
 
-	rt6_ifdown(net, NULL);
 	del_timer_sync(&net->ipv6.ip6_fib_timer);
 
 	for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2490280b3394..c557362daa23 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3470,10 +3470,9 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
 	const struct arg_dev_net *adn = arg;
 	const struct net_device *dev = adn->dev;
 
-	if ((rt->dst.dev == dev || !dev) &&
+	if (rt->dst.dev == dev &&
 	    rt != adn->net->ipv6.ip6_null_entry &&
-	    (rt->rt6i_nsiblings == 0 ||
-	     (dev && netdev_unregistering(dev)) ||
+	    (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) ||
 	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
 		return -1;
 
@@ -3488,8 +3487,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
 	};
 
 	fib6_clean_all(net, fib6_ifdown, &adn);
-	if (dev)
-		rt6_uncached_list_flush_dev(net, dev);
+	rt6_uncached_list_flush_dev(net, dev);
 }
 
 struct rt6_mtu_change_arg {
-- 
cgit v1.2.3


From 2b2413610ef6fc9df9ad003b02fecc3a8057231e Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:02 +0200
Subject: ipv6: Mark dead nexthops with appropriate flags

When a netdev is put administratively down or unregistered all the
nexthops using it as their nexthop device should be marked with the
'dead' and 'linkdown' flags.

Currently, when a route is dumped its nexthop device is tested and the
flags are set accordingly. A similar check is performed during route
lookup.

Instead, we can simply mark the nexthops based on netdev events and
avoid checking the netdev's state during route dump and lookup.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c557362daa23..f5eda0aeab55 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3473,8 +3473,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
 	if (rt->dst.dev == dev &&
 	    rt != adn->net->ipv6.ip6_null_entry &&
 	    (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) ||
-	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) {
+		rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN);
 		return -1;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2127d95aef6c795c3bd8b805722c5c46e8fe45dd Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:03 +0200
Subject: ipv6: Clear nexthop flags upon netdev up

Previous patch marked nexthops with the 'dead' and 'linkdown' flags.
Clear these flags when the netdev comes back up.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/addrconf.c     |  3 +++
 net/ipv6/route.c        | 29 +++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

(limited to 'net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 18e442ea93d8..caad39198c2a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -169,6 +169,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
+void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ed06b1190f05..b6405568ed7b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			if (run_pending)
 				addrconf_dad_run(idev);
 
+			/* Device has an address by now */
+			rt6_sync_up(dev, RTNH_F_DEAD);
+
 			/*
 			 * If the MTU changed during the interface down,
 			 * when the interface up, the changed MTU must be
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f5eda0aeab55..4796d87e0b93 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3459,6 +3459,35 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
 	fib6_clean_all(net, fib6_clean_tohost, gateway);
 }
 
+struct arg_netdev_event {
+	const struct net_device *dev;
+	unsigned int nh_flags;
+};
+
+static int fib6_ifup(struct rt6_info *rt, void *p_arg)
+{
+	const struct arg_netdev_event *arg = p_arg;
+	const struct net *net = dev_net(arg->dev);
+
+	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev)
+		rt->rt6i_nh_flags &= ~arg->nh_flags;
+
+	return 0;
+}
+
+void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
+{
+	struct arg_netdev_event arg = {
+		.dev = dev,
+		.nh_flags = nh_flags,
+	};
+
+	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
+		arg.nh_flags |= RTNH_F_LINKDOWN;
+
+	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
+}
+
 struct arg_dev_net {
 	struct net_device *dev;
 	struct net *net;
-- 
cgit v1.2.3


From 4c981e28d373e391b76577635e7e216976b71c57 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:04 +0200
Subject: ipv6: Prepare to handle multiple netdev events

To make IPv6 more in line with IPv4 we need to be able to respond
differently to different netdev events. For example, when a netdev is
unregistered all the routes using it as their nexthop device should be
flushed, whereas when the netdev's carrier changes only the 'linkdown'
flag should be toggled.

Currently, this is not possible, as the function that traverses the
routing tables is not aware of the triggering event.

Propagate the triggering event down, so that it could be used in later
patches.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  2 +-
 net/ipv6/addrconf.c     |  4 ++--
 net/ipv6/route.c        | 37 +++++++++++++++++++++----------------
 3 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index caad39198c2a..6a2f80cbdf65 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -165,11 +165,11 @@ struct rt6_rtnl_dump_arg {
 };
 
 int rt6_dump_route(struct rt6_info *rt, void *p_arg);
-void rt6_ifdown(struct net *net, struct net_device *dev);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
+void rt6_disable_ip(struct net_device *dev, unsigned long event);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b6405568ed7b..a13e1ffe87ec 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3580,6 +3580,7 @@ static bool addr_is_local(const struct in6_addr *addr)
 
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
+	unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
 	struct inet6_ifaddr *ifa, *tmp;
@@ -3589,8 +3590,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	ASSERT_RTNL();
 
-	rt6_ifdown(net, dev);
-	neigh_ifdown(&nd_tbl, dev);
+	rt6_disable_ip(dev, event);
 
 	idev = __in6_dev_get(dev);
 	if (!idev)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4796d87e0b93..194fe9d9cd85 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2344,7 +2344,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	rt->rt6i_idev     = idev;
 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
 
-	/* Add this dst into uncached_list so that rt6_ifdown() can
+	/* Add this dst into uncached_list so that rt6_disable_ip() can
 	 * do proper release of the net_device
 	 */
 	rt6_uncached_list_add(rt);
@@ -3461,7 +3461,10 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
 
 struct arg_netdev_event {
 	const struct net_device *dev;
-	unsigned int nh_flags;
+	union {
+		unsigned int nh_flags;
+		unsigned long event;
+	};
 };
 
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
@@ -3488,19 +3491,15 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
 }
 
-struct arg_dev_net {
-	struct net_device *dev;
-	struct net *net;
-};
-
 /* called with write lock held for table with rt */
-static int fib6_ifdown(struct rt6_info *rt, void *arg)
+static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 {
-	const struct arg_dev_net *adn = arg;
-	const struct net_device *dev = adn->dev;
+	const struct arg_netdev_event *arg = p_arg;
+	const struct net_device *dev = arg->dev;
+	const struct net *net = dev_net(dev);
 
 	if (rt->dst.dev == dev &&
-	    rt != adn->net->ipv6.ip6_null_entry &&
+	    rt != net->ipv6.ip6_null_entry &&
 	    (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) ||
 	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) {
 		rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN);
@@ -3510,15 +3509,21 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
 	return 0;
 }
 
-void rt6_ifdown(struct net *net, struct net_device *dev)
+static void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 {
-	struct arg_dev_net adn = {
+	struct arg_netdev_event arg = {
 		.dev = dev,
-		.net = net,
+		.event = event,
 	};
 
-	fib6_clean_all(net, fib6_ifdown, &adn);
-	rt6_uncached_list_flush_dev(net, dev);
+	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+}
+
+void rt6_disable_ip(struct net_device *dev, unsigned long event)
+{
+	rt6_sync_down_dev(dev, event);
+	rt6_uncached_list_flush_dev(dev_net(dev), dev);
+	neigh_ifdown(&nd_tbl, dev);
 }
 
 struct rt6_mtu_change_arg {
-- 
cgit v1.2.3


From 27c6fa73f93b81671a77bdaa242473c3bda0ac4a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:05 +0200
Subject: ipv6: Set nexthop flags upon carrier change

Similar to IPv4, when the carrier of a netdev changes we should toggle
the 'linkdown' flag on all the nexthops using it as their nexthop
device.

This will later allow us to test for the presence of this flag during
route lookup and dump.

Up until commit 4832c30d5458 ("net: ipv6: put host and anycast routes on
device with address") host and anycast routes used the loopback netdev
as their nexthop device and thus were not marked with the 'linkdown'
flag. The patch preserves this behavior and allows one to ping the local
address even when the nexthop device does not have a carrier and the
'ignore_routes_with_linkdown' sysctl is set.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/addrconf.c     |  2 ++
 net/ipv6/route.c        | 23 +++++++++++++++++------
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 6a2f80cbdf65..34cd3b0c6ded 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -170,6 +170,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
+void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a13e1ffe87ec..2435f7ab070b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3438,6 +3438,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 		} else if (event == NETDEV_CHANGE) {
 			if (!addrconf_link_ready(dev)) {
 				/* device is still not ready. */
+				rt6_sync_down_dev(dev, event);
 				break;
 			}
 
@@ -3449,6 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 					 * multicast snooping switches
 					 */
 					ipv6_mc_up(idev);
+					rt6_sync_up(dev, RTNH_F_LINKDOWN);
 					break;
 				}
 				idev->if_flags |= IF_READY;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 194fe9d9cd85..2fd36c7dd143 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3498,18 +3498,29 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 	const struct net_device *dev = arg->dev;
 	const struct net *net = dev_net(dev);
 
-	if (rt->dst.dev == dev &&
-	    rt != net->ipv6.ip6_null_entry &&
-	    (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) ||
-	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) {
-		rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN);
+	if (rt->dst.dev != dev || rt == net->ipv6.ip6_null_entry)
+		return 0;
+
+	switch (arg->event) {
+	case NETDEV_UNREGISTER:
 		return -1;
+	case NETDEV_DOWN:
+		if (rt->rt6i_nsiblings == 0 ||
+		    !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+			return -1;
+		rt->rt6i_nh_flags |= RTNH_F_DEAD;
+		/* fall through */
+	case NETDEV_CHANGE:
+		if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+			break;
+		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+		break;
 	}
 
 	return 0;
 }
 
-static void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
+void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 {
 	struct arg_netdev_event arg = {
 		.dev = dev,
-- 
cgit v1.2.3


From 5609b80a37f69f796548339e675256188b29c17d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:06 +0200
Subject: ipv6: Set nexthop flags during route creation

It is valid to install routes with a nexthop device that does not have a
carrier, so we need to make sure they're marked accordingly.

As explained in the previous patch, host and anycast routes are never
marked with the 'linkdown' flag.

Note that reject routes are unaffected, as these use the loopback device
which always has a carrier.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2fd36c7dd143..314e3bf41f6f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2746,6 +2746,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	rt->rt6i_flags = cfg->fc_flags;
 
 install_route:
+	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+	    !netif_carrier_ok(dev))
+		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
 	rt->dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
-- 
cgit v1.2.3


From 14c5206c2d02495de995210d104cb7f084d82e2b Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:07 +0200
Subject: ipv6: Check nexthop flags during route lookup instead of carrier

Now that the RTNH_F_LINKDOWN flag is set in nexthops, we can avoid the
need to dereference the nexthop device and check its carrier and instead
check for the presence of the flag.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 314e3bf41f6f..ab0eed43ed97 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -474,7 +474,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 			if (route_choosen == 0) {
 				struct inet6_dev *idev = sibling->rt6i_idev;
 
-				if (!netif_carrier_ok(sibling->dst.dev) &&
+				if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
 				    idev->cnf.ignore_routes_with_linkdown)
 					break;
 				if (rt6_score_route(sibling, oif, strict) < 0)
@@ -679,10 +679,9 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	int m;
 	bool match_do_rr = false;
 	struct inet6_dev *idev = rt->rt6i_idev;
-	struct net_device *dev = rt->dst.dev;
 
-	if (dev && !netif_carrier_ok(dev) &&
-	    idev->cnf.ignore_routes_with_linkdown &&
+	if (idev->cnf.ignore_routes_with_linkdown &&
+	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
-- 
cgit v1.2.3


From 44c9f2f206f880c959fa4b43618e3f7fe2cd6157 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:08 +0200
Subject: ipv6: Check nexthop flags in route dump instead of carrier

Similar to the previous patch, there is no need to check the carrier of
the nexthop device when dumping the route; we can instead check for the
presence of the RTNH_F_LINKDOWN flag.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ab0eed43ed97..f980f904d6ea 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4039,7 +4039,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
-	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
+	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
 			*flags |= RTNH_F_DEAD;
-- 
cgit v1.2.3


From 8067bb8c1d3599e137dee445d65b64db90ebc6f5 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:09 +0200
Subject: ipv6: Ignore dead routes during lookup

Currently, dead routes are only present in the routing tables in case
the 'ignore_routes_with_linkdown' sysctl is set. Otherwise, they are
flushed.

Subsequent patches are going to remove the reliance on this sysctl and
make IPv6 more consistent with IPv4.

Before this is done, we need to make sure dead routes are skipped during
route lookup, so as to not cause packet loss.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f980f904d6ea..c00156805bf0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -474,6 +474,8 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 			if (route_choosen == 0) {
 				struct inet6_dev *idev = sibling->rt6i_idev;
 
+				if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
+					break;
 				if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
 				    idev->cnf.ignore_routes_with_linkdown)
 					break;
@@ -499,12 +501,15 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 	struct rt6_info *local = NULL;
 	struct rt6_info *sprt;
 
-	if (!oif && ipv6_addr_any(saddr))
-		goto out;
+	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
+		return rt;
 
 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
 		struct net_device *dev = sprt->dst.dev;
 
+		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
+			continue;
+
 		if (oif) {
 			if (dev->ifindex == oif)
 				return sprt;
@@ -533,8 +538,8 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 		if (flags & RT6_LOOKUP_F_IFACE)
 			return net->ipv6.ip6_null_entry;
 	}
-out:
-	return rt;
+
+	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -680,6 +685,9 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	bool match_do_rr = false;
 	struct inet6_dev *idev = rt->rt6i_idev;
 
+	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+		goto out;
+
 	if (idev->cnf.ignore_routes_with_linkdown &&
 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
@@ -2153,6 +2161,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
+		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+			continue;
 		if (rt6_check_expired(rt))
 			continue;
 		if (rt->dst.error)
-- 
cgit v1.2.3


From f9d882ea5705cf03f547e75ac515f1457c89f526 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:10 +0200
Subject: ipv6: Report dead flag during route dump

Up until now the RTNH_F_DEAD flag was only reported in route dump when
the 'ignore_routes_with_linkdown' sysctl was set. This is expected as
dead routes were flushed otherwise.

The reliance on this sysctl is going to be removed, so we need to report
the flag regardless of the sysctl's value.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c00156805bf0..f62d24948aa2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4049,6 +4049,9 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
+	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+		*flags |= RTNH_F_DEAD;
+
 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
-- 
cgit v1.2.3


From b5cb5a755bd43047a9bd18d950fdb22fc828947a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:12 +0200
Subject: ipv6: Teach tree walker to skip multipath routes

As explained in the previous patch, fib6_ifdown() needs to consider the
state of all the sibling routes when a multipath route is traversed.

This is done by evaluating all the siblings when the first sibling in a
multipath route is traversed. If the multipath route does not need to be
flushed (e.g., not all siblings are dead), then we should just skip the
multipath route as our work is done.

Have the tree walker jump to the last sibling when it is determined that
the multipath route needs to be skipped.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 3bbb89d8187d..5e4b5eef0ddd 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1887,7 +1887,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 
 	for_each_fib6_walker_rt(w) {
 		res = c->func(rt, c->arg);
-		if (res < 0) {
+		if (res == -1) {
 			w->leaf = rt;
 			res = fib6_del(rt, &info);
 			if (res) {
@@ -1900,6 +1900,12 @@ static int fib6_clean_node(struct fib6_walker *w)
 				continue;
 			}
 			return 0;
+		} else if (res == -2) {
+			if (WARN_ON(!rt->rt6i_nsiblings))
+				continue;
+			rt = list_last_entry(&rt->rt6i_siblings,
+					     struct rt6_info, rt6i_siblings);
+			continue;
 		}
 		WARN_ON(res != 0);
 	}
@@ -1911,7 +1917,8 @@ static int fib6_clean_node(struct fib6_walker *w)
  *	Convenient frontend to tree walker.
  *
  *	func is called on each route.
- *		It may return -1 -> delete this route.
+ *		It may return -2 -> skip multipath route.
+ *			      -1 -> delete this route.
  *		              0  -> continue walking
  */
 
-- 
cgit v1.2.3


From 4a8e56ee2c8551e674f69ba007aabede8f0b88d9 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:13 +0200
Subject: ipv6: Export sernum update function

We are going to allow dead routes to stay in the FIB tree (e.g., when
they are part of a multipath route, or are directly connected routes
with no carrier) and revive them when their nexthop device gains carrier
or is put administratively up.

This is equivalent to the addition of the route to the FIB tree and we
should therefore take care of updating the sernum of all the parent
nodes of the node where the route is stored. Otherwise, we risk sockets
caching and using sub-optimal dst entries.

Export the function that performs the above, so that it could be invoked
from fib6_ifup() later on.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/ip6_fib.c    | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index affea1aa6ae4..ddf53dd1e948 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -405,6 +405,7 @@ unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
 void fib6_update_sernum(struct rt6_info *rt);
+void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 int fib6_rules_init(void);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5e4b5eef0ddd..c1bbe7bf9fdd 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1102,8 +1102,8 @@ void fib6_force_start_gc(struct net *net)
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
 
-static void fib6_update_sernum_upto_root(struct rt6_info *rt,
-					 int sernum)
+static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
+					   int sernum)
 {
 	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
@@ -1117,6 +1117,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt,
 	}
 }
 
+void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
+{
+	__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
+}
+
 /*
  *	Add routing information to the routing tree.
  *	<destination addr>/<source addr>
@@ -1230,7 +1235,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 	err = fib6_add_rt2node(fn, rt, info, mxc, extack);
 	if (!err) {
-		fib6_update_sernum_upto_root(rt, sernum);
+		__fib6_update_sernum_upto_root(rt, sernum);
 		fib6_start_gc(info->nl_net, rt);
 	}
 
-- 
cgit v1.2.3


From 922c2ac82e37523d1f0efaac8978ef573071d889 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:14 +0200
Subject: ipv6: Take table lock outside of sernum update function

The next patch is going to allow dead routes to remain in the FIB tree
in certain situations.

When this happens we need to be sure to bump the sernum of the nodes
where these are stored so that potential copies cached in sockets are
invalidated.

The function that performs this update assumes the table lock is not
taken when it is invoked, but that will not be the case when it is
invoked by the tree walker.

Have the function assume the lock is taken and make the single caller
take the lock itself.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 5 +----
 net/ipv6/route.c   | 2 ++
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c1bbe7bf9fdd..edda5ad3b405 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -107,16 +107,13 @@ enum {
 
 void fib6_update_sernum(struct rt6_info *rt)
 {
-	struct fib6_table *table = rt->rt6i_table;
 	struct net *net = dev_net(rt->dst.dev);
 	struct fib6_node *fn;
 
-	spin_lock_bh(&table->tb6_lock);
 	fn = rcu_dereference_protected(rt->rt6i_node,
-			lockdep_is_held(&table->tb6_lock));
+			lockdep_is_held(&rt->rt6i_table->tb6_lock));
 	if (fn)
 		fn->fn_sernum = fib6_new_sernum(net);
-	spin_unlock_bh(&table->tb6_lock);
 }
 
 /*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f62d24948aa2..a3bfce71c861 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1353,7 +1353,9 @@ out:
 
 	/* Update fn->fn_sernum to invalidate all cached dst */
 	if (!err) {
+		spin_lock_bh(&ort->rt6i_table->tb6_lock);
 		fib6_update_sernum(ort);
+		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
 		fib6_force_start_gc(net);
 	}
 
-- 
cgit v1.2.3


From 1de178edc76e4ad87d02fcadf73af97f52e87caa Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sun, 7 Jan 2018 12:45:15 +0200
Subject: ipv6: Flush multipath routes when all siblings are dead

By default, IPv6 deletes nexthops from a multipath route when the
nexthop device is put administratively down. This differs from IPv4
where the nexthops are kept, but marked with the RTNH_F_DEAD flag. A
multipath route is flushed when all of its nexthops become dead.

Align IPv6 with IPv4 and have it conform to the same guidelines.

In case the multipath route needs to be flushed, its siblings are
flushed one by one. Otherwise, the nexthops are marked with the
appropriate flags and the tree walker is instructed to skip all the
siblings.

As explained in previous patches, care is taken to update the sernum of
the affected tree nodes, so as to prevent the use of wrong dst entries.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 75 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a3bfce71c861..1054b059747f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3486,8 +3486,10 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	const struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev)
+	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
 		rt->rt6i_nh_flags &= ~arg->nh_flags;
+		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+	}
 
 	return 0;
 }
@@ -3505,6 +3507,58 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
 }
 
+static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
+				   const struct net_device *dev)
+{
+	struct rt6_info *iter;
+
+	if (rt->dst.dev == dev)
+		return true;
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		if (iter->dst.dev == dev)
+			return true;
+
+	return false;
+}
+
+static void rt6_multipath_flush(struct rt6_info *rt)
+{
+	struct rt6_info *iter;
+
+	rt->should_flush = 1;
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		iter->should_flush = 1;
+}
+
+static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
+					     const struct net_device *down_dev)
+{
+	struct rt6_info *iter;
+	unsigned int dead = 0;
+
+	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
+		dead++;
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		if (iter->dst.dev == down_dev ||
+		    iter->rt6i_nh_flags & RTNH_F_DEAD)
+			dead++;
+
+	return dead;
+}
+
+static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
+				       const struct net_device *dev,
+				       unsigned int nh_flags)
+{
+	struct rt6_info *iter;
+
+	if (rt->dst.dev == dev)
+		rt->rt6i_nh_flags |= nh_flags;
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		if (iter->dst.dev == dev)
+			iter->rt6i_nh_flags |= nh_flags;
+}
+
 /* called with write lock held for table with rt */
 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 {
@@ -3512,20 +3566,33 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 	const struct net_device *dev = arg->dev;
 	const struct net *net = dev_net(dev);
 
-	if (rt->dst.dev != dev || rt == net->ipv6.ip6_null_entry)
+	if (rt == net->ipv6.ip6_null_entry)
 		return 0;
 
 	switch (arg->event) {
 	case NETDEV_UNREGISTER:
-		return -1;
+		return rt->dst.dev == dev ? -1 : 0;
 	case NETDEV_DOWN:
-		if (rt->rt6i_nsiblings == 0 ||
-		    !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+		if (rt->should_flush)
 			return -1;
-		rt->rt6i_nh_flags |= RTNH_F_DEAD;
-		/* fall through */
+		if (!rt->rt6i_nsiblings)
+			return rt->dst.dev == dev ? -1 : 0;
+		if (rt6_multipath_uses_dev(rt, dev)) {
+			unsigned int count;
+
+			count = rt6_multipath_dead_count(rt, dev);
+			if (rt->rt6i_nsiblings + 1 == count) {
+				rt6_multipath_flush(rt);
+				return -1;
+			}
+			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
+						   RTNH_F_LINKDOWN);
+			fib6_update_sernum(rt);
+		}
+		return -2;
 	case NETDEV_CHANGE:
-		if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+		if (rt->dst.dev != dev ||
+		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
 		break;
-- 
cgit v1.2.3


From 39215846740a9f29ac7dac276f9df98135f39bb0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 7 Nov 2017 07:20:07 +0100
Subject: netfilter: conntrack: remove nlattr_size pointer from l4proto
 trackers

Similar to the previous commit, but compute the size at compile time
instead and turn nlattr_size into a u16.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h |  7 +++----
 net/netfilter/nf_conntrack_netlink.c         |  2 +-
 net/netfilter/nf_conntrack_proto.c           |  6 +-----
 net/netfilter/nf_conntrack_proto_dccp.c      | 17 ++++++++---------
 net/netfilter/nf_conntrack_proto_sctp.c      | 15 +++++++--------
 net/netfilter/nf_conntrack_proto_tcp.c       | 16 ++++++++--------
 6 files changed, 28 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 7ef56c13698a..0e5618ec8b9d 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -27,6 +27,9 @@ struct nf_conntrack_l4proto {
 	/* Resolve clashes on insertion races. */
 	bool allow_clash;
 
+	/* protoinfo nlattr size, closes a hole */
+	u16 nlattr_size;
+
 	/* Try to fill in the third arg: dataoff is offset past network protocol
            hdr.  Return true if possible. */
 	bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
@@ -66,8 +69,6 @@ struct nf_conntrack_l4proto {
 	/* convert protoinfo to nfnetink attributes */
 	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
 			 struct nf_conn *ct);
-	/* Calculate protoinfo nlattr size */
-	int (*nlattr_size)(void);
 
 	/* convert nfnetlink attributes to protoinfo */
 	int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct);
@@ -80,8 +81,6 @@ struct nf_conntrack_l4proto {
 			       struct nf_conntrack_tuple *t);
 	const struct nla_policy *nla_policy;
 
-	size_t nla_size;
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
 	struct {
 		int (*nlattr_to_obj)(struct nlattr *tb[],
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 382d49792f42..316bbdc4a158 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -544,7 +544,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
 	len *= 3u; /* ORIG, REPLY, MASTER */
 
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
-	len += l4proto->nla_size;
+	len += l4proto->nlattr_size;
 	if (l4proto->nlattr_tuple_size) {
 		len4 = l4proto->nlattr_tuple_size();
 		len4 *= 3u; /* ORIG, REPLY, MASTER */
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index c8e9c9503a08..19c3b1b84544 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -392,7 +392,7 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
 	if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos))
 		return -EBUSY;
 
-	if ((l4proto->to_nlattr && !l4proto->nlattr_size) ||
+	if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) ||
 	    (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
 		return -EINVAL;
 
@@ -428,10 +428,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
 		goto out_unlock;
 	}
 
-	l4proto->nla_size = 0;
-	if (l4proto->nlattr_size)
-		l4proto->nla_size += l4proto->nlattr_size();
-
 	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
 			   l4proto);
 out_unlock:
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 2a446f4a554c..2fee7c96ec09 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -654,6 +654,12 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
 	[CTA_PROTOINFO_DCCP_PAD]	= { .type = NLA_UNSPEC },
 };
 
+#define DCCP_NLATTR_SIZE ( \
+	NLA_ALIGN(NLA_HDRLEN + 1) + \
+	NLA_ALIGN(NLA_HDRLEN + 1) + \
+	NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \
+	NLA_ALIGN(NLA_HDRLEN + 0))
+
 static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 {
 	struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
@@ -691,13 +697,6 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 	spin_unlock_bh(&ct->lock);
 	return 0;
 }
-
-static int dccp_nlattr_size(void)
-{
-	return nla_total_size(0)	/* CTA_PROTOINFO_DCCP */
-		+ nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1);
-}
-
 #endif
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
@@ -876,8 +875,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
 	.print_conntrack	= dccp_print_conntrack,
 #endif
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_size		= DCCP_NLATTR_SIZE,
 	.to_nlattr		= dccp_to_nlattr,
-	.nlattr_size		= dccp_nlattr_size,
 	.from_nlattr		= nlattr_to_dccp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
@@ -912,8 +911,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
 	.print_conntrack	= dccp_print_conntrack,
 #endif
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_size		= DCCP_NLATTR_SIZE,
 	.to_nlattr		= dccp_to_nlattr,
-	.nlattr_size		= dccp_nlattr_size,
 	.from_nlattr		= nlattr_to_dccp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 80faf04ddf15..f5bff4de0386 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -578,6 +578,11 @@ static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = {
 	[CTA_PROTOINFO_SCTP_VTAG_REPLY]     = { .type = NLA_U32 },
 };
 
+#define SCTP_NLATTR_SIZE ( \
+		NLA_ALIGN(NLA_HDRLEN + 1) + \
+		NLA_ALIGN(NLA_HDRLEN + 4) + \
+		NLA_ALIGN(NLA_HDRLEN + 4))
+
 static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
 {
 	struct nlattr *attr = cda[CTA_PROTOINFO_SCTP];
@@ -608,12 +613,6 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
 
 	return 0;
 }
-
-static int sctp_nlattr_size(void)
-{
-	return nla_total_size(0)	/* CTA_PROTOINFO_SCTP */
-		+ nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1);
-}
 #endif
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
@@ -793,8 +792,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 	.can_early_drop		= sctp_can_early_drop,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_size		= SCTP_NLATTR_SIZE,
 	.to_nlattr		= sctp_to_nlattr,
-	.nlattr_size		= sctp_nlattr_size,
 	.from_nlattr		= nlattr_to_sctp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
@@ -830,8 +829,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
 	.can_early_drop		= sctp_can_early_drop,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_size		= SCTP_NLATTR_SIZE,
 	.to_nlattr		= sctp_to_nlattr,
-	.nlattr_size		= sctp_nlattr_size,
 	.from_nlattr		= nlattr_to_sctp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 37ef35b861f2..9875a3623676 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1222,6 +1222,12 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
 	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len =  sizeof(struct nf_ct_tcp_flags) },
 };
 
+#define TCP_NLATTR_SIZE	( \
+	NLA_ALIGN(NLA_HDRLEN + 1) + \
+	NLA_ALIGN(NLA_HDRLEN + 1) + \
+	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
+	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
+
 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 {
 	struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
@@ -1274,12 +1280,6 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 	return 0;
 }
 
-static int tcp_nlattr_size(void)
-{
-	return nla_total_size(0)	   /* CTA_PROTOINFO_TCP */
-		+ nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
-}
-
 static unsigned int tcp_nlattr_tuple_size(void)
 {
 	static unsigned int size __read_mostly;
@@ -1557,11 +1557,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 	.can_early_drop		= tcp_can_early_drop,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.to_nlattr		= tcp_to_nlattr,
-	.nlattr_size		= tcp_nlattr_size,
 	.from_nlattr		= nlattr_to_tcp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
+	.nlattr_size		= TCP_NLATTR_SIZE,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
@@ -1594,8 +1594,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
 	.error			= tcp_error,
 	.can_early_drop		= tcp_can_early_drop,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_size		= TCP_NLATTR_SIZE,
 	.to_nlattr		= tcp_to_nlattr,
-	.nlattr_size		= tcp_nlattr_size,
 	.from_nlattr		= nlattr_to_tcp,
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-- 
cgit v1.2.3


From cd9ceafc0a761a15b4cbfe6c0024edf88f861d66 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 7 Nov 2017 07:20:08 +0100
Subject: netfilter: conntrack: constify list of builtin trackers

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h   | 10 +++++-----
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  2 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
 net/netfilter/nf_conntrack_proto.c             | 12 ++++++------
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 0e5618ec8b9d..7fbb8f64a96e 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -125,18 +125,18 @@ int nf_ct_l4proto_pernet_register_one(struct net *net,
 void nf_ct_l4proto_pernet_unregister_one(struct net *net,
 				const struct nf_conntrack_l4proto *proto);
 int nf_ct_l4proto_pernet_register(struct net *net,
-				  struct nf_conntrack_l4proto *const proto[],
+				  const struct nf_conntrack_l4proto *const proto[],
 				  unsigned int num_proto);
 void nf_ct_l4proto_pernet_unregister(struct net *net,
-				struct nf_conntrack_l4proto *const proto[],
+				const struct nf_conntrack_l4proto *const proto[],
 				unsigned int num_proto);
 
 /* Protocol global registration. */
-int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto);
+int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto);
 void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto);
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[],
+int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[],
 			   unsigned int num_proto);
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[],
+void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[],
 			      unsigned int num_proto);
 
 /* Generic netlink helpers */
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 89af9d88ca21..bb2c868a5621 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -368,7 +368,7 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
 MODULE_ALIAS("ip_conntrack");
 MODULE_LICENSE("GPL");
 
-static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
+static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
 	&nf_conntrack_l4proto_tcp4,
 	&nf_conntrack_l4proto_udp4,
 	&nf_conntrack_l4proto_icmp,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 3b80a38f62b8..7340ca7cc362 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -368,7 +368,7 @@ static struct nf_sockopt_ops so_getorigdst6 = {
 	.owner		= THIS_MODULE,
 };
 
-static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
+static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = {
 	&nf_conntrack_l4proto_tcp6,
 	&nf_conntrack_l4proto_udp6,
 	&nf_conntrack_l4proto_icmpv6,
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 19c3b1b84544..afdeca53e88b 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -385,7 +385,7 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
 
 /* FIXME: Allow NULL functions and sub in pointers to generic for
    them. --RR */
-int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto)
 {
 	int ret = 0;
 
@@ -498,7 +498,7 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
 
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[],
+int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
 			   unsigned int num_proto)
 {
 	int ret = -EINVAL, ver;
@@ -520,7 +520,7 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[],
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
 
 int nf_ct_l4proto_pernet_register(struct net *net,
-				  struct nf_conntrack_l4proto *const l4proto[],
+				  const struct nf_conntrack_l4proto *const l4proto[],
 				  unsigned int num_proto)
 {
 	int ret = -EINVAL;
@@ -541,7 +541,7 @@ int nf_ct_l4proto_pernet_register(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
 
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[],
+void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
 			      unsigned int num_proto)
 {
 	mutex_lock(&nf_ct_proto_mutex);
@@ -551,12 +551,12 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[],
 
 	synchronize_net();
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_destroy(kill_l4proto, l4proto);
+	nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
 
 void nf_ct_l4proto_pernet_unregister(struct net *net,
-				struct nf_conntrack_l4proto *const l4proto[],
+				const struct nf_conntrack_l4proto *const l4proto[],
 				unsigned int num_proto)
 {
 	while (num_proto-- != 0)
-- 
cgit v1.2.3


From 9dae47aba0a055f761176d9297371d5bb24289ec Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 7 Nov 2017 07:20:09 +0100
Subject: netfilter: conntrack: l4 protocol trackers can be const

Previous patches removed all writes to these structs, so we can
now mark them as const.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 12 ++++++------
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 12 ++++++------
 include/net/netfilter/nf_conntrack_l4proto.h   |  2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_proto_dccp.c        |  4 ++--
 net/netfilter/nf_conntrack_proto_generic.c     |  2 +-
 net/netfilter/nf_conntrack_proto_gre.c         |  2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |  4 ++--
 net/netfilter/nf_conntrack_proto_tcp.c         |  4 ++--
 net/netfilter/nf_conntrack_proto_udp.c         |  8 ++++----
 11 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 4ed1040bbe4a..73f825732326 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -13,17 +13,17 @@
 
 const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
 #endif
 
 int nf_conntrack_ipv4_compat_init(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index 9cd55be95853..effa8dfba68c 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -4,17 +4,17 @@
 
 extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
 
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
 #endif
 
 #include <linux/sysctl.h>
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 7fbb8f64a96e..a7220eef9aee 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -108,7 +108,7 @@ struct nf_conntrack_l4proto {
 };
 
 /* Existing built-in generic protocol */
-extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
 
 #define MAX_NF_CT_PROTO 256
 
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 1849fedd9b81..669e586b6b8f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -351,7 +351,7 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.icmp.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 {
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_ICMP,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 3ac0d826afc4..75a85e35a16b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -352,7 +352,7 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.icmpv6.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 {
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_ICMPV6,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 2fee7c96ec09..abe647d5b8c6 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -861,7 +861,7 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.dccp.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.l3proto		= AF_INET,
 	.l4proto		= IPPROTO_DCCP,
 	.pkt_to_tuple		= dccp_pkt_to_tuple,
@@ -897,7 +897,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.l3proto		= AF_INET6,
 	.l4proto		= IPPROTO_DCCP,
 	.pkt_to_tuple		= dccp_pkt_to_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 1f86ddf6649a..f2d22442c89e 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -163,7 +163,7 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.generic.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 {
 	.l3proto		= PF_UNSPEC,
 	.l4proto		= 255,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a2503005d80b..a881c074a43a 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -352,7 +352,7 @@ static int gre_init_net(struct net *net, u_int16_t proto)
 }
 
 /* protocol helper struct */
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
+static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 	.l3proto	 = AF_INET,
 	.l4proto	 = IPPROTO_GRE,
 	.pkt_to_tuple	 = gre_pkt_to_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index f5bff4de0386..69eaaca6f933 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -777,7 +777,7 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.sctp.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_SCTP,
 	.pkt_to_tuple 		= sctp_pkt_to_tuple,
@@ -814,7 +814,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_SCTP,
 	.pkt_to_tuple 		= sctp_pkt_to_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 9875a3623676..44a6038f99bc 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1541,7 +1541,7 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.tcp.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_TCP,
@@ -1579,7 +1579,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 {
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_TCP,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3a5f727103af..59a20f61c364 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -296,7 +296,7 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.udp.pn;
 }
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 {
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDP,
@@ -328,7 +328,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 {
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDPLITE,
@@ -360,7 +360,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
 #endif
 
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 {
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_UDP,
@@ -392,7 +392,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
 
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 {
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_UDPLITE,
-- 
cgit v1.2.3


From e8542dcec002b31339f7771441fd5dffb42223ae Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Tue, 7 Nov 2017 08:19:29 -0600
Subject: netfilter: mark expected switch fall-throughs

In preparation for enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c      | 2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c   | 1 +
 net/netfilter/ipvs/ip_vs_proto_udp.c   | 1 +
 net/netfilter/nf_conntrack_h323_asn1.c | 3 +++
 net/netfilter/nft_cmp.c                | 2 ++
 net/netfilter/x_tables.c               | 2 +-
 6 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index cf84f7b37cd9..d5a43cad90f0 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1386,7 +1386,7 @@ dump_last:
 				goto next_set;
 			if (set->variant->uref)
 				set->variant->uref(set, cb, true);
-			/* Fall through and add elements */
+			/* fall through */
 		default:
 			rcu_read_lock_bh();
 			ret = set->variant->list(set, skb, cb);
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 121a321b91be..bcd9b7bde4ee 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -315,6 +315,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 	switch (skb->ip_summed) {
 	case CHECKSUM_NONE:
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+		/* fall through */
 	case CHECKSUM_COMPLETE:
 #ifdef CONFIG_IP_VS_IPV6
 		if (af == AF_INET6) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 30e11cd6aa8a..c15ef7c2a1fa 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -319,6 +319,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 		case CHECKSUM_NONE:
 			skb->csum = skb_checksum(skb, udphoff,
 						 skb->len - udphoff, 0);
+			/* fall through */
 		case CHECKSUM_COMPLETE:
 #ifdef CONFIG_IP_VS_IPV6
 			if (af == AF_INET6) {
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index dc6347342e34..38cf5e4f30bd 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -262,12 +262,15 @@ static unsigned int get_uint(struct bitstr *bs, int b)
 	case 4:
 		v |= *bs->cur++;
 		v <<= 8;
+		/* fall through */
 	case 3:
 		v |= *bs->cur++;
 		v <<= 8;
+		/* fall through */
 	case 2:
 		v |= *bs->cur++;
 		v <<= 8;
+		/* fall through */
 	case 1:
 		v |= *bs->cur++;
 		break;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index c2945eb3397c..fa90a8402845 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -44,6 +44,7 @@ static void nft_cmp_eval(const struct nft_expr *expr,
 	case NFT_CMP_LT:
 		if (d == 0)
 			goto mismatch;
+		/* fall through */
 	case NFT_CMP_LTE:
 		if (d > 0)
 			goto mismatch;
@@ -51,6 +52,7 @@ static void nft_cmp_eval(const struct nft_expr *expr,
 	case NFT_CMP_GT:
 		if (d == 0)
 			goto mismatch;
+		/* fall through */
 	case NFT_CMP_GTE:
 		if (d < 0)
 			goto mismatch;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 55802e97f906..0d9efc3cb451 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1397,7 +1397,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
 		trav->curr = trav->curr->next;
 		if (trav->curr != trav->head)
 			break;
-		/* fallthru, _stop will unlock */
+		/* fall through */
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 2c9e8637ea15cf7f060b48f73b79e5055ffa93ad Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 12 Nov 2017 20:42:01 +0100
Subject: netfilter: conntrack: timeouts can be const

Nowadays this is just the default template that is used when setting up
the net namespace, so nothing writes to these locations.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +-
 net/netfilter/nf_conntrack_proto_generic.c     | 2 +-
 net/netfilter/nf_conntrack_proto_gre.c         | 2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         | 2 +-
 net/netfilter/nf_conntrack_proto_udp.c         | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 669e586b6b8f..5c15beafa711 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -22,7 +22,7 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_log.h>
 
-static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+static const unsigned int nf_ct_icmp_timeout = 30*HZ;
 
 static inline struct nf_icmp_net *icmp_pernet(struct net *net)
 {
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 75a85e35a16b..2548e2c8aedd 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -27,7 +27,7 @@
 #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
 #include <net/netfilter/nf_log.h>
 
-static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ;
+static const unsigned int nf_ct_icmpv6_timeout = 30*HZ;
 
 static inline struct nf_icmp_net *icmpv6_pernet(struct net *net)
 {
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index f2d22442c89e..6c6896d21cd7 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -12,7 +12,7 @@
 #include <linux/netfilter.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 
-static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ;
+static const unsigned int nf_ct_generic_timeout = 600*HZ;
 
 static bool nf_generic_should_process(u8 proto)
 {
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a881c074a43a..d049ea5a3770 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -48,7 +48,7 @@ enum grep_conntrack {
 	GRE_CT_MAX
 };
 
-static unsigned int gre_timeouts[GRE_CT_MAX] = {
+static const unsigned int gre_timeouts[GRE_CT_MAX] = {
 	[GRE_CT_UNREPLIED]	= 30*HZ,
 	[GRE_CT_REPLIED]	= 180*HZ,
 };
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 69eaaca6f933..fb9a35d16069 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -52,7 +52,7 @@ static const char *const sctp_conntrack_names[] = {
 #define HOURS * 60 MINS
 #define DAYS  * 24 HOURS
 
-static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
+static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
 	[SCTP_CONNTRACK_CLOSED]			= 10 SECS,
 	[SCTP_CONNTRACK_COOKIE_WAIT]		= 3 SECS,
 	[SCTP_CONNTRACK_COOKIE_ECHOED]		= 3 SECS,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 44a6038f99bc..684cc29010a0 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -68,7 +68,7 @@ static const char *const tcp_conntrack_names[] = {
 #define HOURS * 60 MINS
 #define DAYS * 24 HOURS
 
-static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
+static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
 	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
 	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
 	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 59a20f61c364..fe7243970aa4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -26,7 +26,7 @@
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
-static unsigned int udp_timeouts[UDP_CT_MAX] = {
+static const unsigned int udp_timeouts[UDP_CT_MAX] = {
 	[UDP_CT_UNREPLIED]	= 30*HZ,
 	[UDP_CT_REPLIED]	= 180*HZ,
 };
-- 
cgit v1.2.3


From 6b3d933000cbe539e5b234d639b083da60bb275c Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Mon, 13 Nov 2017 22:58:18 +0800
Subject: netfilter: ipvs: Remove useless ipvsh param of frag_safe_skb_hp

The ipvsh parameter of frag_safe_skb_hp() is no longer used, so remove
it and update the callers accordingly.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             |  3 +--
 net/netfilter/ipvs/ip_vs_conn.c |  2 +-
 net/netfilter/ipvs/ip_vs_core.c | 12 ++++++------
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ff68cf288f9b..eb0bec043c96 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -69,8 +69,7 @@ struct ip_vs_iphdr {
 };
 
 static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
-				      int len, void *buffer,
-				      const struct ip_vs_iphdr *ipvsh)
+				      int len, void *buffer)
 {
 	return skb_header_pointer(skb, offset, len, buffer);
 }
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 3e053cb30070..f489b8db2406 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -322,7 +322,7 @@ ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
 {
 	__be16 _ports[2], *pptr;
 
-	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return 1;
 
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5cb7cac9177d..5f6f73cf2174 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -433,7 +433,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	/*
 	 * IPv6 frags, only the first hit here.
 	 */
-	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
@@ -566,7 +566,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	struct netns_ipvs *ipvs = svc->ipvs;
 	struct net *net = ipvs->net;
 
-	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 	if (!pptr)
 		return NF_DROP;
 	dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
@@ -982,7 +982,7 @@ static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
 	unsigned int offset;
 
 	*related = 1;
-	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
+	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph);
 	if (ic == NULL)
 		return NF_DROP;
 
@@ -1214,7 +1214,7 @@ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
 		return NULL;
 
 	pptr = frag_safe_skb_hp(skb, iph->len,
-				sizeof(_ports), _ports, iph);
+				sizeof(_ports), _ports);
 	if (!pptr)
 		return NULL;
 
@@ -1407,7 +1407,7 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
 		__be16 _ports[2], *pptr;
 
 		pptr = frag_safe_skb_hp(skb, iph.len,
-					 sizeof(_ports), _ports, &iph);
+					 sizeof(_ports), _ports);
 		if (pptr == NULL)
 			return NF_ACCEPT;	/* Not for me */
 		if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
@@ -1741,7 +1741,7 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
 
 	*related = 1;
 
-	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
+	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph);
 	if (ic == NULL)
 		return NF_DROP;
 
-- 
cgit v1.2.3


From 49971b8853f576f4252cbc8fc3df8173d7a56de3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 30 Nov 2017 21:07:32 +0100
Subject: netfilter: ipset: use nfnl_mutex_is_locked

Check that we really hold nfnl mutex here instead of relying on correct
usage alone.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index d5a43cad90f0..1f3c03b3bebf 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -57,7 +57,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
 /* When the nfnl mutex is held: */
 #define ip_set_dereference(p)		\
-	rcu_dereference_protected(p, 1)
+	rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
 #define ip_set(inst, id)		\
 	ip_set_dereference((inst)->ip_set_list)[id]
 
-- 
cgit v1.2.3


From a778a15fa5cf5f632cd55845f548189a29e9b57b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 30 Nov 2017 21:08:05 +0100
Subject: netfilter: ipset: add resched points during set listing

When sets are extremely large we can get softlockup during ipset -L.
We could fix this by adding cond_resched_rcu() at the right location
during iteration, but this only works if RCU nesting depth is 1.

At this time the entire variant->list() is called under rcu_read_lock_bh.
This used to be a read_lock_bh() but as rcu doesn't really lock anything,
it does not appear to be needed, so remove it (ipset increments set
reference count before this, so a set deletion should not be possible).

Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_bitmap_gen.h | 1 +
 net/netfilter/ipset/ip_set_core.c       | 2 --
 net/netfilter/ipset/ip_set_hash_gen.h   | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 5ca18f07683b..8afe882f846d 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -227,6 +227,7 @@ mtype_list(const struct ip_set *set,
 	rcu_read_lock();
 	for (; cb->args[IPSET_CB_ARG0] < map->elements;
 	     cb->args[IPSET_CB_ARG0]++) {
+		cond_resched_rcu();
 		id = cb->args[IPSET_CB_ARG0];
 		x = get_ext(set, map, id);
 		if (!test_bit(id, map->members) ||
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 1f3c03b3bebf..89b44458a761 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1388,9 +1388,7 @@ dump_last:
 				set->variant->uref(set, cb, true);
 			/* fall through */
 		default:
-			rcu_read_lock_bh();
 			ret = set->variant->list(set, skb, cb);
-			rcu_read_unlock_bh();
 			if (!cb->args[IPSET_CB_ARG0])
 				/* Set is done, proceed with next one */
 				goto next_set;
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index efffc8eabafe..8ef079db7d34 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1143,6 +1143,7 @@ mtype_list(const struct ip_set *set,
 	rcu_read_lock();
 	for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
 	     cb->args[IPSET_CB_ARG0]++) {
+		cond_resched_rcu();
 		incomplete = skb_tail_pointer(skb);
 		n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0]));
 		pr_debug("cb->arg bucket: %lu, t %p n %p\n",
-- 
cgit v1.2.3


From ca9b01473a474a45b7a8a419a897a2aaf3304249 Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Thu, 30 Nov 2017 19:34:36 +0530
Subject: netfilter: nf_conntrack_h323: Remove unwanted comments.

Change old multi-line comment style to kernel comment style and
remove unwanted comments.

Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_h323_asn1.c | 37 +++++++-------------------
 net/netfilter/nf_conntrack_h323_main.c | 47 +++-------------------------------
 2 files changed, 13 insertions(+), 71 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 38cf5e4f30bd..1601275efe2d 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -1,4 +1,4 @@
-/****************************************************************************
+/*
  * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
  * 			      	     conntrack/NAT module.
  *
@@ -8,7 +8,7 @@
  *
  * See ip_conntrack_helper_h323_asn1.h for details.
  *
- ****************************************************************************/
+ */
 
 #ifdef __KERNEL__
 #include <linux/kernel.h>
@@ -140,14 +140,15 @@ static const decoder_t Decoders[] = {
 	decode_choice,
 };
 
-/****************************************************************************
+/*
  * H.323 Types
- ****************************************************************************/
+ */
 #include "nf_conntrack_h323_types.c"
 
-/****************************************************************************
+/*
  * Functions
- ****************************************************************************/
+ */
+
 /* Assume bs is aligned && v < 16384 */
 static unsigned int get_len(struct bitstr *bs)
 {
@@ -177,7 +178,6 @@ static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits)
 	return 0;
 }
 
-/****************************************************************************/
 static unsigned int get_bit(struct bitstr *bs)
 {
 	unsigned int b = (*bs->cur) & (0x80 >> bs->bit);
@@ -187,7 +187,6 @@ static unsigned int get_bit(struct bitstr *bs)
 	return b;
 }
 
-/****************************************************************************/
 /* Assume b <= 8 */
 static unsigned int get_bits(struct bitstr *bs, unsigned int b)
 {
@@ -213,7 +212,6 @@ static unsigned int get_bits(struct bitstr *bs, unsigned int b)
 	return v;
 }
 
-/****************************************************************************/
 /* Assume b <= 32 */
 static unsigned int get_bitmap(struct bitstr *bs, unsigned int b)
 {
@@ -251,9 +249,9 @@ static unsigned int get_bitmap(struct bitstr *bs, unsigned int b)
 	return v;
 }
 
-/****************************************************************************
+/*
  * Assume bs is aligned and sizeof(unsigned int) == 4
- ****************************************************************************/
+ */
 static unsigned int get_uint(struct bitstr *bs, int b)
 {
 	unsigned int v = 0;
@@ -278,7 +276,6 @@ static unsigned int get_uint(struct bitstr *bs, int b)
 	return v;
 }
 
-/****************************************************************************/
 static int decode_nul(struct bitstr *bs, const struct field_t *f,
                       char *base, int level)
 {
@@ -287,7 +284,6 @@ static int decode_nul(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_bool(struct bitstr *bs, const struct field_t *f,
                        char *base, int level)
 {
@@ -299,7 +295,6 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_oid(struct bitstr *bs, const struct field_t *f,
                       char *base, int level)
 {
@@ -319,7 +314,6 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_int(struct bitstr *bs, const struct field_t *f,
                       char *base, int level)
 {
@@ -367,7 +361,6 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_enum(struct bitstr *bs, const struct field_t *f,
                        char *base, int level)
 {
@@ -384,7 +377,6 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
                          char *base, int level)
 {
@@ -421,7 +413,6 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_numstr(struct bitstr *bs, const struct field_t *f,
                          char *base, int level)
 {
@@ -442,7 +433,6 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_octstr(struct bitstr *bs, const struct field_t *f,
                          char *base, int level)
 {
@@ -496,7 +486,6 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
                          char *base, int level)
 {
@@ -526,7 +515,6 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_seq(struct bitstr *bs, const struct field_t *f,
                       char *base, int level)
 {
@@ -656,7 +644,6 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 static int decode_seqof(struct bitstr *bs, const struct field_t *f,
                         char *base, int level)
 {
@@ -753,8 +740,6 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-
-/****************************************************************************/
 static int decode_choice(struct bitstr *bs, const struct field_t *f,
                          char *base, int level)
 {
@@ -836,7 +821,6 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 	return H323_ERROR_NONE;
 }
 
-/****************************************************************************/
 int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras)
 {
 	static const struct field_t ras_message = {
@@ -852,7 +836,6 @@ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras)
 	return decode_choice(&bs, &ras_message, (char *) ras, 0);
 }
 
-/****************************************************************************/
 static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
 				      size_t sz, H323_UserInformation *uuie)
 {
@@ -870,7 +853,6 @@ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
 	return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0);
 }
 
-/****************************************************************************/
 int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
 					 MultimediaSystemControlMessage *
 					 mscm)
@@ -889,7 +871,6 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
 			     (char *) mscm, 0);
 }
 
-/****************************************************************************/
 int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931)
 {
 	unsigned char *p = buf;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index f71f0d2558fd..7f0e0f66e488 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -115,7 +115,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_h245;
 static struct nf_conntrack_helper nf_conntrack_helper_q931[];
 static struct nf_conntrack_helper nf_conntrack_helper_ras[];
 
-/****************************************************************************/
 static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
 			 struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			 unsigned char **data, int *datalen, int *dataoff)
@@ -219,7 +218,6 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
 	return 0;
 }
 
-/****************************************************************************/
 static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
 			 H245_TransportAddress *taddr,
 			 union nf_inet_addr *addr, __be16 *port)
@@ -254,7 +252,6 @@ static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
 	return 1;
 }
 
-/****************************************************************************/
 static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 			   enum ip_conntrack_info ctinfo,
 			   unsigned int protoff,
@@ -328,7 +325,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	return ret;
 }
 
-/****************************************************************************/
 static int expect_t120(struct sk_buff *skb,
 		       struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
@@ -380,7 +376,6 @@ static int expect_t120(struct sk_buff *skb,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_h245_channel(struct sk_buff *skb,
 				struct nf_conn *ct,
 				enum ip_conntrack_info ctinfo,
@@ -410,7 +405,6 @@ static int process_h245_channel(struct sk_buff *skb,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -472,7 +466,6 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
 			unsigned int protoff, unsigned char **data, int dataoff,
@@ -542,7 +535,6 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
 			unsigned int protoff, unsigned char **data, int dataoff,
@@ -578,7 +570,6 @@ static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int h245_help(struct sk_buff *skb, unsigned int protoff,
 		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
 {
@@ -628,7 +619,6 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,
 	return NF_DROP;
 }
 
-/****************************************************************************/
 static const struct nf_conntrack_expect_policy h245_exp_policy = {
 	.max_expected	= H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */,
 	.timeout	= 240,
@@ -643,7 +633,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
 	.expect_policy		= &h245_exp_policy,
 };
 
-/****************************************************************************/
 int get_h225_addr(struct nf_conn *ct, unsigned char *data,
 		  TransportAddress *taddr,
 		  union nf_inet_addr *addr, __be16 *port)
@@ -675,7 +664,6 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
 	return 1;
 }
 
-/****************************************************************************/
 static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff, unsigned char **data, int dataoff,
@@ -726,7 +714,8 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
 }
 
 /* If the calling party is on the same side of the forward-to party,
- * we don't need to track the second call */
+ * we don't need to track the second call
+ */
 static int callforward_do_filter(struct net *net,
 				 const union nf_inet_addr *src,
 				 const union nf_inet_addr *dst,
@@ -794,7 +783,6 @@ static int callforward_do_filter(struct net *net,
 
 }
 
-/****************************************************************************/
 static int expect_callforwarding(struct sk_buff *skb,
 				 struct nf_conn *ct,
 				 enum ip_conntrack_info ctinfo,
@@ -815,7 +803,8 @@ static int expect_callforwarding(struct sk_buff *skb,
 		return 0;
 
 	/* If the calling party is on the same side of the forward-to party,
-	 * we don't need to track the second call */
+	 * we don't need to track the second call
+	 */
 	if (callforward_filter &&
 	    callforward_do_filter(net, &addr, &ct->tuplehash[!dir].tuple.src.u3,
 				  nf_ct_l3num(ct))) {
@@ -854,7 +843,6 @@ static int expect_callforwarding(struct sk_buff *skb,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo,
 			 unsigned int protoff,
@@ -925,7 +913,6 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_callproceeding(struct sk_buff *skb,
 				  struct nf_conn *ct,
 				  enum ip_conntrack_info ctinfo,
@@ -958,7 +945,6 @@ static int process_callproceeding(struct sk_buff *skb,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
 			   enum ip_conntrack_info ctinfo,
 			   unsigned int protoff,
@@ -990,7 +976,6 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
 			    enum ip_conntrack_info ctinfo,
 			    unsigned int protoff,
@@ -1022,7 +1007,6 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
 			    enum ip_conntrack_info ctinfo,
 			    unsigned int protoff,
@@ -1063,7 +1047,6 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
 			    enum ip_conntrack_info ctinfo,
 			    unsigned int protoff,
@@ -1095,7 +1078,6 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
 			unsigned int protoff, unsigned char **data, int dataoff,
@@ -1154,7 +1136,6 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int q931_help(struct sk_buff *skb, unsigned int protoff,
 		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
 {
@@ -1203,7 +1184,6 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,
 	return NF_DROP;
 }
 
-/****************************************************************************/
 static const struct nf_conntrack_expect_policy q931_exp_policy = {
 	/* T.120 and H.245 */
 	.max_expected		= H323_RTP_CHANNEL_MAX * 4 + 4,
@@ -1231,7 +1211,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
 	},
 };
 
-/****************************************************************************/
 static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
 				   int *datalen)
 {
@@ -1249,7 +1228,6 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
 	return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
 }
 
-/****************************************************************************/
 static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
 					       union nf_inet_addr *addr,
 					       __be16 port)
@@ -1270,7 +1248,6 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
 	return NULL;
 }
 
-/****************************************************************************/
 static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff, unsigned char **data,
@@ -1328,7 +1305,6 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1346,7 +1322,6 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1391,7 +1366,6 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1428,7 +1402,6 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1480,7 +1453,6 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1514,7 +1486,6 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1559,7 +1530,6 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1608,7 +1578,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1626,7 +1595,6 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1666,7 +1634,6 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
 	return ret;
 }
 
-/****************************************************************************/
 static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1700,7 +1667,6 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff,
@@ -1745,7 +1711,6 @@ static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
 	return 0;
 }
 
-/****************************************************************************/
 static int ras_help(struct sk_buff *skb, unsigned int protoff,
 		    struct nf_conn *ct, enum ip_conntrack_info ctinfo)
 {
@@ -1788,7 +1753,6 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff,
 	return NF_DROP;
 }
 
-/****************************************************************************/
 static const struct nf_conntrack_expect_policy ras_exp_policy = {
 	.max_expected		= 32,
 	.timeout		= 240,
@@ -1849,7 +1813,6 @@ static void __exit h323_helper_exit(void)
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
 }
 
-/****************************************************************************/
 static void __exit nf_conntrack_h323_fini(void)
 {
 	h323_helper_exit();
@@ -1857,7 +1820,6 @@ static void __exit nf_conntrack_h323_fini(void)
 	pr_debug("nf_ct_h323: fini\n");
 }
 
-/****************************************************************************/
 static int __init nf_conntrack_h323_init(void)
 {
 	int ret;
@@ -1877,7 +1839,6 @@ err1:
 	return ret;
 }
 
-/****************************************************************************/
 module_init(nf_conntrack_h323_init);
 module_exit(nf_conntrack_h323_fini);
 
-- 
cgit v1.2.3


From 4e645b47c4f000a503b9c90163ad905786b9bc1d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Dec 2017 00:21:02 +0100
Subject: netfilter: core: make nf_unregister_net_hooks simple wrapper again

This reverts commit d3ad2c17b4047
("netfilter: core: batch nf_unregister_net_hooks synchronize_net calls").

Nothing wrong with it.  However, a followup patch will delay freeing of
hooks with call_rcu, so all synchronize_net() calls become obsolete and
there is no longer any need for this batching.

This revert causes a temporary performance degradation when destroying
a network namespace, but it's resolved with the upcoming call_rcu conversion.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 59 +++-------------------------------------------------
 1 file changed, 3 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 52cd2901a097..d39bb2c583dc 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -395,63 +395,10 @@ EXPORT_SYMBOL(nf_register_net_hooks);
 void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
 			     unsigned int hookcount)
 {
-	struct nf_hook_entries *to_free[16], *p;
-	struct nf_hook_entries __rcu **pp;
-	unsigned int i, j, n;
-
-	mutex_lock(&nf_hook_mutex);
-	for (i = 0; i < hookcount; i++) {
-		pp = nf_hook_entry_head(net, &reg[i]);
-		if (!pp)
-			continue;
-
-		p = nf_entry_dereference(*pp);
-		if (WARN_ON_ONCE(!p))
-			continue;
-		__nf_unregister_net_hook(p, &reg[i]);
-	}
-	mutex_unlock(&nf_hook_mutex);
-
-	do {
-		n = min_t(unsigned int, hookcount, ARRAY_SIZE(to_free));
-
-		mutex_lock(&nf_hook_mutex);
-
-		for (i = 0, j = 0; i < hookcount && j < n; i++) {
-			pp = nf_hook_entry_head(net, &reg[i]);
-			if (!pp)
-				continue;
-
-			p = nf_entry_dereference(*pp);
-			if (!p)
-				continue;
-
-			to_free[j] = __nf_hook_entries_try_shrink(pp);
-			if (to_free[j])
-				++j;
-		}
-
-		mutex_unlock(&nf_hook_mutex);
-
-		if (j) {
-			unsigned int nfq;
-
-			synchronize_net();
-
-			/* need 2nd synchronize_net() if nfqueue is used, skb
-			 * can get reinjected right before nf_queue_hook_drop()
-			 */
-			nfq = nf_queue_nf_hook_drop(net);
-			if (nfq)
-				synchronize_net();
-
-			for (i = 0; i < j; i++)
-				kvfree(to_free[i]);
-		}
+	unsigned int i;
 
-		reg += n;
-		hookcount -= n;
-	} while (hookcount > 0);
+	for (i = 0; i < hookcount; i++)
+		nf_unregister_net_hook(net, &reg[i]);
 }
 EXPORT_SYMBOL(nf_unregister_net_hooks);
 
-- 
cgit v1.2.3


From 26888dfd7e7454686b8d3ea9ba5045d5f236e4d7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Dec 2017 00:21:03 +0100
Subject: netfilter: core: remove synchronize_net call if nfqueue is used

Since commit 960632ece6949b ("netfilter: convert hook list to an array")
nfqueue no longer stores a pointer to the hook that caused the packet
to be queued.  Therefore no extra synchronize_net() call is needed after
dropping the packets enqueued by the old rule blob.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 2 +-
 net/netfilter/core.c             | 6 +-----
 net/netfilter/nf_internals.h     | 2 +-
 net/netfilter/nf_queue.c         | 7 ++-----
 net/netfilter/nfnetlink_queue.c  | 9 ++-------
 5 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 814058d0f167..a50a69f5334c 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -25,7 +25,7 @@ struct nf_queue_entry {
 struct nf_queue_handler {
 	int		(*outfn)(struct nf_queue_entry *entry,
 				 unsigned int queuenum);
-	unsigned int	(*nf_hook_drop)(struct net *net);
+	void		(*nf_hook_drop)(struct net *net);
 };
 
 void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index d39bb2c583dc..9a84b6cb99e6 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -341,7 +341,6 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
 	struct nf_hook_entries __rcu **pp;
 	struct nf_hook_entries *p;
-	unsigned int nfq;
 
 	pp = nf_hook_entry_head(net, reg);
 	if (!pp)
@@ -364,10 +363,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 
 	synchronize_net();
 
-	/* other cpu might still process nfqueue verdict that used reg */
-	nfq = nf_queue_nf_hook_drop(net);
-	if (nfq)
-		synchronize_net();
+	nf_queue_nf_hook_drop(net);
 	kvfree(p);
 }
 EXPORT_SYMBOL(nf_unregister_net_hook);
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 44284cd2528d..18f6d7ae995b 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -10,7 +10,7 @@
 int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 	     const struct nf_hook_entries *entries, unsigned int index,
 	     unsigned int verdict);
-unsigned int nf_queue_nf_hook_drop(struct net *net);
+void nf_queue_nf_hook_drop(struct net *net);
 
 /* nf_log.c */
 int __init netfilter_log_init(void);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index f7e21953b1de..4e42a4a68a0b 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,18 +96,15 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
 
-unsigned int nf_queue_nf_hook_drop(struct net *net)
+void nf_queue_nf_hook_drop(struct net *net)
 {
 	const struct nf_queue_handler *qh;
-	unsigned int count = 0;
 
 	rcu_read_lock();
 	qh = rcu_dereference(net->nf.queue_handler);
 	if (qh)
-		count = qh->nf_hook_drop(net);
+		qh->nf_hook_drop(net);
 	rcu_read_unlock();
-
-	return count;
 }
 EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop);
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index c09b36755ed7..2db35f2d553d 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -941,23 +941,18 @@ static struct notifier_block nfqnl_dev_notifier = {
 	.notifier_call	= nfqnl_rcv_dev_event,
 };
 
-static unsigned int nfqnl_nf_hook_drop(struct net *net)
+static void nfqnl_nf_hook_drop(struct net *net)
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
-	unsigned int instances = 0;
 	int i;
 
 	for (i = 0; i < INSTANCE_BUCKETS; i++) {
 		struct nfqnl_instance *inst;
 		struct hlist_head *head = &q->instance_table[i];
 
-		hlist_for_each_entry_rcu(inst, head, hlist) {
+		hlist_for_each_entry_rcu(inst, head, hlist)
 			nfqnl_flush(inst, NULL, 0);
-			instances++;
-		}
 	}
-
-	return instances;
 }
 
 static int
-- 
cgit v1.2.3


From 8c873e2199700c2de7dbd5eedb9d90d5f109462b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Dec 2017 00:21:04 +0100
Subject: netfilter: core: free hooks with call_rcu

Giuseppe Scrivano says:
  "SELinux, if enabled, registers for each new network namespace 6
    netfilter hooks."

Cost for this is high.  With synchronize_net() removed:
   "The net benefit on an SMP machine with two cores is that creating a
   new network namespace takes -40% of the original time."

This patch replaces synchronize_net+kvfree with call_rcu().
We store the rcu_head at the tail of a structure that has no fixed
layout, i.e. we cannot use offsetof() to compute the start of the
original allocation.  Thus store a pointer to the start of the
allocation right after the rcu head.

We could simplify this by just placing the rcu_head at the start
of struct nf_hook_entries.  However, this structure is used in the
packet processing hot path, so only place what is needed for that
at the beginning of the struct.

Reported-by: Giuseppe Scrivano <gscrivan@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 19 +++++++++++++++----
 net/netfilter/core.c      | 34 ++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index b24e9b101651..792f6d535707 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -77,17 +77,28 @@ struct nf_hook_entry {
 	void				*priv;
 };
 
+struct nf_hook_entries_rcu_head {
+	struct rcu_head head;
+	void	*allocation;
+};
+
 struct nf_hook_entries {
 	u16				num_hook_entries;
 	/* padding */
 	struct nf_hook_entry		hooks[];
 
-	/* trailer: pointers to original orig_ops of each hook.
-	 *
-	 * This is not part of struct nf_hook_entry since its only
-	 * needed in slow path (hook register/unregister).
+	/* trailer: pointers to original orig_ops of each hook,
+	 * followed by rcu_head and scratch space used for freeing
+	 * the structure via call_rcu.
 	 *
+	 *   This is not part of struct nf_hook_entry since its only
+	 *   needed in slow path (hook register/unregister):
 	 * const struct nf_hook_ops     *orig_ops[]
+	 *
+	 *   For the same reason, we store this at end -- its
+	 *   only needed when a hook is deleted, not during
+	 *   packet path processing:
+	 * struct nf_hook_entries_rcu_head     head
 	 */
 };
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 9a84b6cb99e6..6921f9f1cc81 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -74,7 +74,8 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 	struct nf_hook_entries *e;
 	size_t alloc = sizeof(*e) +
 		       sizeof(struct nf_hook_entry) * num +
-		       sizeof(struct nf_hook_ops *) * num;
+		       sizeof(struct nf_hook_ops *) * num +
+		       sizeof(struct nf_hook_entries_rcu_head);
 
 	if (num == 0)
 		return NULL;
@@ -85,6 +86,30 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 	return e;
 }
 
+static void __nf_hook_entries_free(struct rcu_head *h)
+{
+	struct nf_hook_entries_rcu_head *head;
+
+	head = container_of(h, struct nf_hook_entries_rcu_head, head);
+	kvfree(head->allocation);
+}
+
+static void nf_hook_entries_free(struct nf_hook_entries *e)
+{
+	struct nf_hook_entries_rcu_head *head;
+	struct nf_hook_ops **ops;
+	unsigned int num;
+
+	if (!e)
+		return;
+
+	num = e->num_hook_entries;
+	ops = nf_hook_entries_get_hook_ops(e);
+	head = (void *)&ops[num];
+	head->allocation = e;
+	call_rcu(&head->head, __nf_hook_entries_free);
+}
+
 static unsigned int accept_all(void *priv,
 			       struct sk_buff *skb,
 			       const struct nf_hook_state *state)
@@ -291,9 +316,8 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 #ifdef HAVE_JUMP_LABEL
 	static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
-	synchronize_net();
 	BUG_ON(p == new_hooks);
-	kvfree(p);
+	nf_hook_entries_free(p);
 	return 0;
 }
 EXPORT_SYMBOL(nf_register_net_hook);
@@ -361,10 +385,8 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	if (!p)
 		return;
 
-	synchronize_net();
-
 	nf_queue_nf_hook_drop(net);
-	kvfree(p);
+	nf_hook_entries_free(p);
 }
 EXPORT_SYMBOL(nf_unregister_net_hook);
 
-- 
cgit v1.2.3


From b0f38338aef2dae5ade3c16acf713737e3b15a73 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 3 Dec 2017 00:58:47 +0100
Subject: netfilter: reduce size of hook entry point locations

struct net contains:

struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];

which store the hook entry point locations for the various protocol
families and the hooks.

Using an array results in compact C code when doing accesses, i.e.
  x = rcu_dereference(net->nf.hooks[pf][hook]);

but it's also wasting a lot of memory, as most families are
not used.

So split the array into those families that are used, which
are only 5 (instead of 13).  In most cases, the 'pf' argument is
constant, i.e. gcc removes the switch statement.

struct net before:
 /* size: 5184, cachelines: 81, members: 46 */
after:
 /* size: 4672, cachelines: 73, members: 46 */

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h       | 24 ++++++++++++++++++++++--
 include/net/netns/netfilter.h   |  6 +++++-
 net/bridge/br_netfilter_hooks.c |  2 +-
 net/netfilter/core.c            | 38 ++++++++++++++++++++++++++++++--------
 net/netfilter/nf_queue.c        | 21 +++++++++++++++++++--
 5 files changed, 77 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 792f6d535707..9dcbcdfa3b82 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -195,7 +195,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 			  struct net_device *indev, struct net_device *outdev,
 			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
-	struct nf_hook_entries *hook_head;
+	struct nf_hook_entries *hook_head = NULL;
 	int ret = 1;
 
 #ifdef HAVE_JUMP_LABEL
@@ -206,7 +206,27 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 #endif
 
 	rcu_read_lock();
-	hook_head = rcu_dereference(net->nf.hooks[pf][hook]);
+	switch (pf) {
+	case NFPROTO_IPV4:
+		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
+		break;
+	case NFPROTO_IPV6:
+		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
+		break;
+	case NFPROTO_ARP:
+		hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
+		break;
+	case NFPROTO_BRIDGE:
+		hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+		break;
+	case NFPROTO_DECNET:
+		hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
 	if (hook_head) {
 		struct nf_hook_state state;
 
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index cc00af2ac2d7..b39c563c2fce 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -17,7 +17,11 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
-	struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_ipv4[NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_ipv6[NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_arp[NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_bridge[NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_decnet[NF_MAX_HOOKS];
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	bool			defrag_ipv4;
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c2eea1b8737a..27f1d4f2114a 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -991,7 +991,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
 	unsigned int i;
 	int ret;
 
-	e = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+	e = rcu_dereference(net->nf.hooks_bridge[hook]);
 	if (!e)
 		return okfn(net, sk, skb);
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 6921f9f1cc81..a6eaaf303be8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -264,8 +264,23 @@ out_assign:
 
 static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg)
 {
-	if (reg->pf != NFPROTO_NETDEV)
-		return net->nf.hooks[reg->pf]+reg->hooknum;
+	switch (reg->pf) {
+	case NFPROTO_NETDEV:
+		break;
+	case NFPROTO_ARP:
+		return net->nf.hooks_arp + reg->hooknum;
+	case NFPROTO_BRIDGE:
+		return net->nf.hooks_bridge + reg->hooknum;
+	case NFPROTO_IPV4:
+		return net->nf.hooks_ipv4 + reg->hooknum;
+	case NFPROTO_IPV6:
+		return net->nf.hooks_ipv6 + reg->hooknum;
+	case NFPROTO_DECNET:
+		return net->nf.hooks_decnet + reg->hooknum;
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
 
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->hooknum == NF_NETDEV_INGRESS) {
@@ -534,14 +549,21 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
-static int __net_init netfilter_net_init(struct net *net)
+static void __net_init __netfilter_net_init(struct nf_hook_entries *e[NF_MAX_HOOKS])
 {
-	int i, h;
+	int h;
 
-	for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
-		for (h = 0; h < NF_MAX_HOOKS; h++)
-			RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
-	}
+	for (h = 0; h < NF_MAX_HOOKS; h++)
+		RCU_INIT_POINTER(e[h], NULL);
+}
+
+static int __net_init netfilter_net_init(struct net *net)
+{
+	__netfilter_net_init(net->nf.hooks_ipv4);
+	__netfilter_net_init(net->nf.hooks_ipv6);
+	__netfilter_net_init(net->nf.hooks_arp);
+	__netfilter_net_init(net->nf.hooks_bridge);
+	__netfilter_net_init(net->nf.hooks_decnet);
 
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 4e42a4a68a0b..836aeb08686e 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -201,6 +201,23 @@ repeat:
 	return NF_ACCEPT;
 }
 
+static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
+{
+	switch (pf) {
+	case NFPROTO_BRIDGE:
+		return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+	case NFPROTO_IPV4:
+		return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
+	case NFPROTO_IPV6:
+		return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	return NULL;
+}
+
 /* Caller must hold rcu read-side lock */
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
@@ -216,12 +233,12 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 	net = entry->state.net;
 	pf = entry->state.pf;
 
-	hooks = rcu_dereference(net->nf.hooks[pf][entry->state.hook]);
+	hooks = nf_hook_entries_head(net, pf, entry->state.hook);
 
 	nf_queue_entry_release_refs(entry);
 
 	i = entry->hook_index;
-	if (WARN_ON_ONCE(i >= hooks->num_hook_entries)) {
+	if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
 		kfree_skb(skb);
 		kfree(entry);
 		return;
-- 
cgit v1.2.3


From ef57170bbfdd6958281011332b1fd237712f69f0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 Dec 2017 16:28:24 +0100
Subject: netfilter: reduce hook array sizes to what is needed

Not all families share the same hook count; adjust sizes to what is
needed.

struct net before:
/* size: 6592, cachelines: 103, members: 46 */
after:
/* size: 5952, cachelines: 93, members: 46 */

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/netfilter.h | 10 +++++-----
 net/netfilter/core.c          | 24 +++++++++++++++++-------
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index b39c563c2fce..8f756a4b9205 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -17,11 +17,11 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
-	struct nf_hook_entries __rcu *hooks_ipv4[NF_MAX_HOOKS];
-	struct nf_hook_entries __rcu *hooks_ipv6[NF_MAX_HOOKS];
-	struct nf_hook_entries __rcu *hooks_arp[NF_MAX_HOOKS];
-	struct nf_hook_entries __rcu *hooks_bridge[NF_MAX_HOOKS];
-	struct nf_hook_entries __rcu *hooks_decnet[NF_MAX_HOOKS];
+	struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS];
+	struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS];
+	struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS];
+	struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
+	struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS];
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	bool			defrag_ipv4;
 #endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a6eaaf303be8..43643427b560 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -268,14 +268,24 @@ static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const
 	case NFPROTO_NETDEV:
 		break;
 	case NFPROTO_ARP:
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= reg->hooknum))
+			return NULL;
 		return net->nf.hooks_arp + reg->hooknum;
 	case NFPROTO_BRIDGE:
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= reg->hooknum))
+			return NULL;
 		return net->nf.hooks_bridge + reg->hooknum;
 	case NFPROTO_IPV4:
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= reg->hooknum))
+			return NULL;
 		return net->nf.hooks_ipv4 + reg->hooknum;
 	case NFPROTO_IPV6:
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= reg->hooknum))
+			return NULL;
 		return net->nf.hooks_ipv6 + reg->hooknum;
 	case NFPROTO_DECNET:
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= reg->hooknum))
+			return NULL;
 		return net->nf.hooks_decnet + reg->hooknum;
 	default:
 		WARN_ON_ONCE(1);
@@ -549,21 +559,21 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
-static void __net_init __netfilter_net_init(struct nf_hook_entries *e[NF_MAX_HOOKS])
+static void __net_init __netfilter_net_init(struct nf_hook_entries **e, int max)
 {
 	int h;
 
-	for (h = 0; h < NF_MAX_HOOKS; h++)
+	for (h = 0; h < max; h++)
 		RCU_INIT_POINTER(e[h], NULL);
 }
 
 static int __net_init netfilter_net_init(struct net *net)
 {
-	__netfilter_net_init(net->nf.hooks_ipv4);
-	__netfilter_net_init(net->nf.hooks_ipv6);
-	__netfilter_net_init(net->nf.hooks_arp);
-	__netfilter_net_init(net->nf.hooks_bridge);
-	__netfilter_net_init(net->nf.hooks_decnet);
+	__netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4));
+	__netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6));
+	__netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp));
+	__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
+	__netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
 
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
-- 
cgit v1.2.3


From bb4badf3a3dc81190f7c1c1fa063cdefb18df45f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 Dec 2017 16:28:25 +0100
Subject: netfilter: don't allocate space for decnet hooks unless needed

No need to define hook points if the family isn't supported.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h     | 2 ++
 include/net/netns/netfilter.h | 2 ++
 net/netfilter/core.c          | 4 ++++
 3 files changed, 8 insertions(+)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9dcbcdfa3b82..ce4e91df8b56 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -219,9 +219,11 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 	case NFPROTO_BRIDGE:
 		hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
 		break;
+#if IS_ENABLED(CONFIG_DECNET)
 	case NFPROTO_DECNET:
 		hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
 		break;
+#endif
 	default:
 		WARN_ON_ONCE(1);
 		break;
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 8f756a4b9205..432609fd9899 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -21,7 +21,9 @@ struct netns_nf {
 	struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS];
 	struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS];
 	struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
+#if IS_ENABLED(CONFIG_DECNET)
 	struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS];
+#endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	bool			defrag_ipv4;
 #endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 43643427b560..4738d0d0ebac 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -283,10 +283,12 @@ static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const
 		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= reg->hooknum))
 			return NULL;
 		return net->nf.hooks_ipv6 + reg->hooknum;
+#if IS_ENABLED(CONFIG_DECNET)
 	case NFPROTO_DECNET:
 		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= reg->hooknum))
 			return NULL;
 		return net->nf.hooks_decnet + reg->hooknum;
+#endif
 	default:
 		WARN_ON_ONCE(1);
 		return NULL;
@@ -573,7 +575,9 @@ static int __net_init netfilter_net_init(struct net *net)
 	__netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6));
 	__netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp));
 	__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
+#if IS_ENABLED(CONFIG_DECNET)
 	__netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
+#endif
 
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
-- 
cgit v1.2.3


From 2a95183a5e0375df756efb2ca37602d71e8455f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 Dec 2017 16:28:26 +0100
Subject: netfilter: don't allocate space for arp/bridge hooks unless needed

No need to define hook points if the family isn't supported.
Because we need these hooks for either nftables, arp/ebtables
or the 'call-iptables' hack we have in the bridge layer, add two
new dependencies, NETFILTER_FAMILY_{ARP,BRIDGE}, and have the
users select them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h     | 4 ++++
 include/net/netns/netfilter.h | 4 ++++
 net/Kconfig                   | 1 +
 net/bridge/netfilter/Kconfig  | 2 ++
 net/ipv4/netfilter/Kconfig    | 2 ++
 net/netfilter/Kconfig         | 6 ++++++
 net/netfilter/core.c          | 8 ++++++++
 net/netfilter/nf_queue.c      | 2 ++
 8 files changed, 29 insertions(+)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index ce4e91df8b56..ee7a9cbd8d81 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -214,10 +214,14 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
 		break;
 	case NFPROTO_ARP:
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
 		hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
+#endif
 		break;
 	case NFPROTO_BRIDGE:
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 		hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+#endif
 		break;
 #if IS_ENABLED(CONFIG_DECNET)
 	case NFPROTO_DECNET:
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 432609fd9899..ca043342c0eb 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -19,8 +19,12 @@ struct netns_nf {
 #endif
 	struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS];
 	struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS];
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
 	struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS];
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 	struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
+#endif
 #if IS_ENABLED(CONFIG_DECNET)
 	struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS];
 #endif
diff --git a/net/Kconfig b/net/Kconfig
index efe930db3c08..37ec8e67af57 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -182,6 +182,7 @@ config BRIDGE_NETFILTER
 	depends on BRIDGE
 	depends on NETFILTER && INET
 	depends on NETFILTER_ADVANCED
+	select NETFILTER_FAMILY_BRIDGE
 	default m
 	---help---
 	  Enabling this option will let arptables resp. iptables see bridged
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index e7ef1a1ef3a6..225d1668dfdd 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -4,6 +4,7 @@
 #
 menuconfig NF_TABLES_BRIDGE
 	depends on BRIDGE && NETFILTER && NF_TABLES
+	select NETFILTER_FAMILY_BRIDGE
 	tristate "Ethernet Bridge nf_tables support"
 
 if NF_TABLES_BRIDGE
@@ -29,6 +30,7 @@ endif # NF_TABLES_BRIDGE
 menuconfig BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
 	depends on BRIDGE && NETFILTER && NETFILTER_XTABLES
+	select NETFILTER_FAMILY_BRIDGE
 	help
 	  ebtables is a general, extensible frame/packet identification
 	  framework. Say 'Y' or 'M' here if you want to do Ethernet
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c11eb1744ab1..cee51045e2f7 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -72,6 +72,7 @@ endif # NF_TABLES_IPV4
 
 config NF_TABLES_ARP
 	tristate "ARP nf_tables support"
+	select NETFILTER_FAMILY_ARP
 	help
 	  This option enables the ARP support for nf_tables.
 
@@ -392,6 +393,7 @@ endif # IP_NF_IPTABLES
 config IP_NF_ARPTABLES
 	tristate "ARP tables support"
 	select NETFILTER_XTABLES
+	select NETFILTER_FAMILY_ARP
 	depends on NETFILTER_ADVANCED
 	help
 	  arptables is a general, extensible packet identification framework.
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e4a13cc8a2e7..263609a7e010 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -12,6 +12,12 @@ config NETFILTER_INGRESS
 config NETFILTER_NETLINK
 	tristate
 
+config NETFILTER_FAMILY_BRIDGE
+	bool
+
+config NETFILTER_FAMILY_ARP
+	bool
+
 config NETFILTER_NETLINK_ACCT
 tristate "Netfilter NFACCT over NFNETLINK interface"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 4738d0d0ebac..ed8618f4efd7 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -267,14 +267,18 @@ static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const
 	switch (reg->pf) {
 	case NFPROTO_NETDEV:
 		break;
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
 	case NFPROTO_ARP:
 		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= reg->hooknum))
 			return NULL;
 		return net->nf.hooks_arp + reg->hooknum;
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 	case NFPROTO_BRIDGE:
 		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= reg->hooknum))
 			return NULL;
 		return net->nf.hooks_bridge + reg->hooknum;
+#endif
 	case NFPROTO_IPV4:
 		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= reg->hooknum))
 			return NULL;
@@ -573,8 +577,12 @@ static int __net_init netfilter_net_init(struct net *net)
 {
 	__netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4));
 	__netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6));
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
 	__netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp));
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 	__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
+#endif
 #if IS_ENABLED(CONFIG_DECNET)
 	__netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
 #endif
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 836aeb08686e..0c02fdb7efc9 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -204,8 +204,10 @@ repeat:
 static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
 {
 	switch (pf) {
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 	case NFPROTO_BRIDGE:
 		return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+#endif
 	case NFPROTO_IPV4:
 		return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
 	case NFPROTO_IPV6:
-- 
cgit v1.2.3


From 03d13b6868a261f24fbc82b6a2d5823df8d075d3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 8 Dec 2017 17:01:53 +0100
Subject: netfilter: xtables: add and use xt_request_find_table_lock

Currently we always return -ENOENT to userspace if we can't find
a particular table, or if the table initialization fails.

A followup patch will make nat table init fail in case nftables
already registered a nat hook, so this change makes
xt_find_table_lock return an ERR_PTR that carries the errno value
reported by the table init function.

Add xt_request_find_table_lock as try_then_request_module replacement
and use it where needed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  2 ++
 net/ipv4/netfilter/arp_tables.c    | 26 ++++++++++++--------------
 net/ipv4/netfilter/ip_tables.c     | 26 ++++++++++++--------------
 net/ipv6/netfilter/ip6_tables.c    | 26 ++++++++++++--------------
 net/netfilter/x_tables.c           | 36 +++++++++++++++++++++++++++---------
 5 files changed, 65 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 33f7530f96b9..1313b35c3ab7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -320,6 +320,8 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name);
+struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
+					    const char *name);
 void xt_table_unlock(struct xt_table *t);
 
 int xt_proto_init(struct net *net, u_int8_t af);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 0c3c944a7b72..bf8a5340f15e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -810,9 +810,8 @@ static int get_info(struct net *net, void __user *user,
 	if (compat)
 		xt_compat_lock(NFPROTO_ARP);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
-				    "arptable_%s", name);
-	if (t) {
+	t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+	if (!IS_ERR(t)) {
 		struct arpt_getinfo info;
 		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
@@ -841,7 +840,7 @@ static int get_info(struct net *net, void __user *user,
 		xt_table_unlock(t);
 		module_put(t->me);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_unlock(NFPROTO_ARP);
@@ -866,7 +865,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 	get.name[sizeof(get.name) - 1] = '\0';
 
 	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 
 		if (get.size == private->size)
@@ -878,7 +877,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	return ret;
 }
@@ -903,10 +902,9 @@ static int __do_replace(struct net *net, const char *name,
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
-				    "arptable_%s", name);
-	if (!t) {
-		ret = -ENOENT;
+	t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free_newinfo_counters_untrans;
 	}
 
@@ -1020,8 +1018,8 @@ static int do_add_counters(struct net *net, const void __user *user,
 		return PTR_ERR(paddc);
 
 	t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
-	if (!t) {
-		ret = -ENOENT;
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free;
 	}
 
@@ -1408,7 +1406,7 @@ static int compat_get_entries(struct net *net,
 
 	xt_compat_lock(NFPROTO_ARP);
 	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
 
@@ -1423,7 +1421,7 @@ static int compat_get_entries(struct net *net,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	xt_compat_unlock(NFPROTO_ARP);
 	return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2e0d339028bb..0b975aa2d363 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -973,9 +973,8 @@ static int get_info(struct net *net, void __user *user,
 	if (compat)
 		xt_compat_lock(AF_INET);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
-				    "iptable_%s", name);
-	if (t) {
+	t = xt_request_find_table_lock(net, AF_INET, name);
+	if (!IS_ERR(t)) {
 		struct ipt_getinfo info;
 		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
@@ -1005,7 +1004,7 @@ static int get_info(struct net *net, void __user *user,
 		xt_table_unlock(t);
 		module_put(t->me);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_unlock(AF_INET);
@@ -1030,7 +1029,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 	get.name[sizeof(get.name) - 1] = '\0';
 
 	t = xt_find_table_lock(net, AF_INET, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
@@ -1041,7 +1040,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	return ret;
 }
@@ -1064,10 +1063,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
-				    "iptable_%s", name);
-	if (!t) {
-		ret = -ENOENT;
+	t = xt_request_find_table_lock(net, AF_INET, name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free_newinfo_counters_untrans;
 	}
 
@@ -1181,8 +1179,8 @@ do_add_counters(struct net *net, const void __user *user,
 		return PTR_ERR(paddc);
 
 	t = xt_find_table_lock(net, AF_INET, tmp.name);
-	if (!t) {
-		ret = -ENOENT;
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free;
 	}
 
@@ -1625,7 +1623,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
 
 	xt_compat_lock(AF_INET);
 	t = xt_find_table_lock(net, AF_INET, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
 		ret = compat_table_info(private, &info);
@@ -1639,7 +1637,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	xt_compat_unlock(AF_INET);
 	return ret;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1d7ae9366335..6ebbef2dfb60 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -991,9 +991,8 @@ static int get_info(struct net *net, void __user *user,
 	if (compat)
 		xt_compat_lock(AF_INET6);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
-				    "ip6table_%s", name);
-	if (t) {
+	t = xt_request_find_table_lock(net, AF_INET6, name);
+	if (!IS_ERR(t)) {
 		struct ip6t_getinfo info;
 		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
@@ -1023,7 +1022,7 @@ static int get_info(struct net *net, void __user *user,
 		xt_table_unlock(t);
 		module_put(t->me);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_unlock(AF_INET6);
@@ -1049,7 +1048,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
 	get.name[sizeof(get.name) - 1] = '\0';
 
 	t = xt_find_table_lock(net, AF_INET6, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		struct xt_table_info *private = t->private;
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
@@ -1060,7 +1059,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	return ret;
 }
@@ -1083,10 +1082,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
-				    "ip6table_%s", name);
-	if (!t) {
-		ret = -ENOENT;
+	t = xt_request_find_table_lock(net, AF_INET6, name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free_newinfo_counters_untrans;
 	}
 
@@ -1199,8 +1197,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	if (IS_ERR(paddc))
 		return PTR_ERR(paddc);
 	t = xt_find_table_lock(net, AF_INET6, tmp.name);
-	if (!t) {
-		ret = -ENOENT;
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
 		goto free;
 	}
 
@@ -1636,7 +1634,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
 
 	xt_compat_lock(AF_INET6);
 	t = xt_find_table_lock(net, AF_INET6, get.name);
-	if (t) {
+	if (!IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
 		ret = compat_table_info(private, &info);
@@ -1650,7 +1648,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = -ENOENT;
+		ret = PTR_ERR(t);
 
 	xt_compat_unlock(AF_INET6);
 	return ret;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0d9efc3cb451..10c19a3f4cbd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1027,7 +1027,7 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-/* Find table by name, grabs mutex & ref.  Returns NULL on error. */
+/* Find table by name, grabs mutex & ref.  Returns ERR_PTR on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
 {
@@ -1043,17 +1043,17 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 
 	/* Table doesn't exist in this netns, re-try init */
 	list_for_each_entry(t, &init_net.xt.tables[af], list) {
+		int err;
+
 		if (strcmp(t->name, name))
 			continue;
-		if (!try_module_get(t->me)) {
-			mutex_unlock(&xt[af].mutex);
-			return NULL;
-		}
-
+		if (!try_module_get(t->me))
+			goto out;
 		mutex_unlock(&xt[af].mutex);
-		if (t->table_init(net) != 0) {
+		err = t->table_init(net);
+		if (err < 0) {
 			module_put(t->me);
-			return NULL;
+			return ERR_PTR(err);
 		}
 
 		found = t;
@@ -1073,10 +1073,36 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 	module_put(found->me);
  out:
 	mutex_unlock(&xt[af].mutex);
-	return NULL;
+	return ERR_PTR(-ENOENT);
 }
 EXPORT_SYMBOL_GPL(xt_find_table_lock);
 
+/* Find table by name, trying to load its module if the lookup fails.
+ * Returns the result of xt_find_table_lock(), or ERR_PTR() on error.
+ */
+struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
+					    const char *name)
+{
+	struct xt_table *t = xt_find_table_lock(net, af, name);
+
+#ifdef CONFIG_MODULES
+	if (IS_ERR(t)) {
+		int err = request_module("%stable_%s", xt_prefix[af], name);
+
+		/* Only a negative value is an errno; a positive one is the
+		 * modprobe exit status and must not be fed to ERR_PTR(),
+		 * since IS_ERR() would not recognise the result.
+		 */
+		if (err < 0)
+			return ERR_PTR(err);
+		t = xt_find_table_lock(net, af, name);
+	}
+#endif
+
+	return t;
+}
+EXPORT_SYMBOL_GPL(xt_request_find_table_lock);
+
 void xt_table_unlock(struct xt_table *table)
 {
 	mutex_unlock(&xt[table->af].mutex);
-- 
cgit v1.2.3


From f92b40a8b2645af38bd6814651c59c1e690db53d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 8 Dec 2017 17:01:54 +0100
Subject: netfilter: core: only allow one nat hook per hook point

The netfilter NAT core cannot deal with more than one NAT hook per hook
location (prerouting, input ...), because the NAT hooks install a NAT null
binding in case the iptables nat table (iptable_nat hooks) or the
corresponding nftables chain (nft nat hooks) doesn't specify a nat
transformation.

Null bindings are needed to detect port collisions between NAT-ed and
non-NAT-ed connections.

This causes nftables NAT rules to not work when iptable_nat module is
loaded, and vice versa because nat binding has already been attached
when the second nat hook is consulted.

The netfilter core is not really the correct location to handle this
(hooks are just hooks, the core has no notion of what kinds of side
 effects a hook implements), but it's the only place where we can check
for conflicts between both iptables hooks and nftables hooks without
adding dependencies.

So add nat annotation to hook_ops to describe those hooks that will
add NAT bindings and then make core reject if such a hook already exists.
The annotation fills a padding hole; in case further restrictions appear
we might change this to a 'u8 type' instead of bool.

iptables error if nft nat hook active:
iptables -t nat -A POSTROUTING -j MASQUERADE
iptables v1.4.21: can't initialize iptables table `nat': File exists
Perhaps iptables or your kernel needs to be upgraded.

nftables error if iptables nat table present:
nft -f /etc/nftables/ipv4-nat
/usr/etc/nftables/ipv4-nat:3:1-2: Error: Could not process rule: File exists
table nat {
^^

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 1 +
 net/ipv4/netfilter/iptable_nat.c  | 4 ++++
 net/ipv6/netfilter/ip6table_nat.c | 4 ++++
 net/netfilter/core.c              | 6 ++++++
 net/netfilter/nf_tables_api.c     | 2 ++
 5 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index ee7a9cbd8d81..85a0b0d599e6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -67,6 +67,7 @@ struct nf_hook_ops {
 	struct net_device	*dev;
 	void			*priv;
 	u_int8_t		pf;
+	bool			nat_hook;
 	unsigned int		hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int			priority;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a1a07b338ccf..0f7255cc65ee 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -72,6 +72,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	{
 		.hook		= iptable_nat_ipv4_in,
 		.pf		= NFPROTO_IPV4,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_NAT_DST,
 	},
@@ -79,6 +80,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	{
 		.hook		= iptable_nat_ipv4_out,
 		.pf		= NFPROTO_IPV4,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP_PRI_NAT_SRC,
 	},
@@ -86,6 +88,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	{
 		.hook		= iptable_nat_ipv4_local_fn,
 		.pf		= NFPROTO_IPV4,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP_PRI_NAT_DST,
 	},
@@ -93,6 +96,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	{
 		.hook		= iptable_nat_ipv4_fn,
 		.pf		= NFPROTO_IPV4,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP_PRI_NAT_SRC,
 	},
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 991512576c8c..47306e45a80a 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -74,6 +74,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
 	{
 		.hook		= ip6table_nat_in,
 		.pf		= NFPROTO_IPV6,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP6_PRI_NAT_DST,
 	},
@@ -81,6 +82,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
 	{
 		.hook		= ip6table_nat_out,
 		.pf		= NFPROTO_IPV6,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP6_PRI_NAT_SRC,
 	},
@@ -88,12 +90,14 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
 	{
 		.hook		= ip6table_nat_local_fn,
 		.pf		= NFPROTO_IPV6,
+		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP6_PRI_NAT_DST,
 	},
 	/* After packet filtering, change source */
 	{
 		.hook		= ip6table_nat_fn,
+		.nat_hook	= true,
 		.pf		= NFPROTO_IPV6,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP6_PRI_NAT_SRC,
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ed8618f4efd7..3508a5c8edbb 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -160,6 +160,12 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
 			++i;
 			continue;
 		}
+
+		if (reg->nat_hook && orig_ops[i]->nat_hook) {
+			kvfree(new);
+			return ERR_PTR(-EEXIST);
+		}
+
 		if (inserted || reg->priority > orig_ops[i]->priority) {
 			new_ops[nhooks] = (void *)orig_ops[i];
 			new->hooks[nhooks] = old->hooks[i];
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 10798b357481..838eb581b5ab 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1400,6 +1400,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 				ops->hook = hookfn;
 			if (afi->hook_ops_init)
 				afi->hook_ops_init(ops, i);
+			if (basechain->type->type == NFT_CHAIN_T_NAT)
+				ops->nat_hook = true;
 		}
 
 		chain->flags |= NFT_BASE_CHAIN;
-- 
cgit v1.2.3


From 84ba7dd71add05b52e55c60b4a3af9bb6194c73d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 8 Dec 2017 17:01:55 +0100
Subject: netfilter: nf_tables: reject nat hook registration if prio is before
 conntrack

No problem for iptables as priorities are fixed values defined in the
nat modules, but in nftables the priority is coming from userspace.

Reject in case we see that such a hook would not work.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 838eb581b5ab..36d38f8b0284 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1264,7 +1264,7 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
 
 struct nft_chain_hook {
 	u32				num;
-	u32				priority;
+	s32				priority;
 	const struct nf_chain_type	*type;
 	struct net_device		*dev;
 };
@@ -1303,6 +1303,11 @@ static int nft_chain_parse_hook(struct net *net,
 	}
 	if (!(type->hook_mask & (1 << hook->num)))
 		return -EOPNOTSUPP;
+
+	if (type->type == NFT_CHAIN_T_NAT &&
+	    hook->priority <= NF_IP_PRI_CONNTRACK)
+		return -EOPNOTSUPP;
+
 	if (!try_module_get(type->owner))
 		return -ENOENT;
 
-- 
cgit v1.2.3


From fa45a7602166e9a0998b2228b7398b18b58c5579 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 10 Dec 2017 01:42:58 +0100
Subject: netfilter: nf_tables_arp: don't set forward chain

46928a0b49f3 ("netfilter: nf_tables: remove multihook chains and
families") already removed this, this is a leftover.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_tables_arp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 4bbc273b45e8..ec47c12cd137 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -34,7 +34,6 @@ static struct nft_af_info nft_af_arp __read_mostly = {
 	.hooks		= {
 		[NF_ARP_IN]		= nft_do_chain_arp,
 		[NF_ARP_OUT]		= nft_do_chain_arp,
-		[NF_ARP_FORWARD]	= nft_do_chain_arp,
 	},
 };
 
-- 
cgit v1.2.3


From 7a4473a31a6974c0fbf9afe80ef16ac5bc67cf79 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 10 Dec 2017 01:43:14 +0100
Subject: netfilter: nf_tables: explicit nft_set_pktinfo() call from hook path

Instead of calling this function from the family specific variant, this
reduces the code size in the fast path for the netdev, bridge and inet
families. After this change, we must call nft_set_pktinfo() upfront from
the chain hook indirection.

Before:

   text    data     bss     dec     hex filename
   2145     208       0    2353     931 net/netfilter/nf_tables_netdev.o

After:

   text    data     bss     dec     hex filename
   2125     208       0    2333     91d net/netfilter/nf_tables_netdev.o

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h         | 12 ++----------
 include/net/netfilter/nf_tables_ipv4.h    | 25 ++++++++-----------------
 include/net/netfilter/nf_tables_ipv6.h    | 27 +++++++++------------------
 net/bridge/netfilter/nf_tables_bridge.c   |  8 +++++---
 net/ipv4/netfilter/nf_tables_arp.c        |  3 ++-
 net/ipv4/netfilter/nf_tables_ipv4.c       |  3 ++-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   |  3 ++-
 net/ipv4/netfilter/nft_chain_route_ipv4.c |  3 ++-
 net/ipv6/netfilter/nf_tables_ipv6.c       |  3 ++-
 net/ipv6/netfilter/nft_chain_nat_ipv6.c   |  3 ++-
 net/ipv6/netfilter/nft_chain_route_ipv6.c |  3 ++-
 net/netfilter/nf_tables_netdev.c          |  8 +++++---
 12 files changed, 43 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index fecc6112c768..f6e4325b3306 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -54,8 +54,8 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 	pkt->xt.state = state;
 }
 
-static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
-						struct sk_buff *skb)
+static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt,
+					  struct sk_buff *skb)
 {
 	pkt->tprot_set = false;
 	pkt->tprot = 0;
@@ -63,14 +63,6 @@ static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
 	pkt->xt.fragoff = 0;
 }
 
-static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt,
-					  struct sk_buff *skb,
-					  const struct nf_hook_state *state)
-{
-	nft_set_pktinfo(pkt, skb, state);
-	nft_set_pktinfo_proto_unspec(pkt, skb);
-}
-
 /**
  * 	struct nft_verdict - nf_tables verdict
  *
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
index f0896ba456c4..b2deeb2755a4 100644
--- a/include/net/netfilter/nf_tables_ipv4.h
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -5,15 +5,11 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/ip.h>
 
-static inline void
-nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
-		     struct sk_buff *skb,
-		     const struct nf_hook_state *state)
+static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+					struct sk_buff *skb)
 {
 	struct iphdr *ip;
 
-	nft_set_pktinfo(pkt, skb, state);
-
 	ip = ip_hdr(pkt->skb);
 	pkt->tprot_set = true;
 	pkt->tprot = ip->protocol;
@@ -21,10 +17,8 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
 	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
 }
 
-static inline int
-__nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
-				struct sk_buff *skb,
-				const struct nf_hook_state *state)
+static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
+						  struct sk_buff *skb)
 {
 	struct iphdr *iph, _iph;
 	u32 len, thoff;
@@ -52,14 +46,11 @@ __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
 	return 0;
 }
 
-static inline void
-nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
-			      struct sk_buff *skb,
-			      const struct nf_hook_state *state)
+static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
+						 struct sk_buff *skb)
 {
-	nft_set_pktinfo(pkt, skb, state);
-	if (__nft_set_pktinfo_ipv4_validate(pkt, skb, state) < 0)
-		nft_set_pktinfo_proto_unspec(pkt, skb);
+	if (__nft_set_pktinfo_ipv4_validate(pkt, skb) < 0)
+		nft_set_pktinfo_unspec(pkt, skb);
 }
 
 extern struct nft_af_info nft_af_ipv4;
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index b8065b72f56e..1890c5bc3c3c 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -5,20 +5,16 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/ipv6.h>
 
-static inline void
-nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
-		     struct sk_buff *skb,
-		     const struct nf_hook_state *state)
+static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+					struct sk_buff *skb)
 {
 	unsigned int flags = IP6_FH_F_AUTH;
 	int protohdr, thoff = 0;
 	unsigned short frag_off;
 
-	nft_set_pktinfo(pkt, skb, state);
-
 	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags);
 	if (protohdr < 0) {
-		nft_set_pktinfo_proto_unspec(pkt, skb);
+		nft_set_pktinfo_unspec(pkt, skb);
 		return;
 	}
 
@@ -28,10 +24,8 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	pkt->xt.fragoff = frag_off;
 }
 
-static inline int
-__nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
-				struct sk_buff *skb,
-				const struct nf_hook_state *state)
+static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
+						  struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_IPV6)
 	unsigned int flags = IP6_FH_F_AUTH;
@@ -68,14 +62,11 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
 #endif
 }
 
-static inline void
-nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
-			      struct sk_buff *skb,
-			      const struct nf_hook_state *state)
+static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
+						 struct sk_buff *skb)
 {
-	nft_set_pktinfo(pkt, skb, state);
-	if (__nft_set_pktinfo_ipv6_validate(pkt, skb, state) < 0)
-		nft_set_pktinfo_proto_unspec(pkt, skb);
+	if (__nft_set_pktinfo_ipv6_validate(pkt, skb) < 0)
+		nft_set_pktinfo_unspec(pkt, skb);
 }
 
 extern struct nft_af_info nft_af_ipv6;
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 97afdc0744e6..612bfd0737d5 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -25,15 +25,17 @@ nft_do_chain_bridge(void *priv,
 {
 	struct nft_pktinfo pkt;
 
+	nft_set_pktinfo(&pkt, skb, state);
+
 	switch (eth_hdr(skb)->h_proto) {
 	case htons(ETH_P_IP):
-		nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
+		nft_set_pktinfo_ipv4_validate(&pkt, skb);
 		break;
 	case htons(ETH_P_IPV6):
-		nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
+		nft_set_pktinfo_ipv6_validate(&pkt, skb);
 		break;
 	default:
-		nft_set_pktinfo_unspec(&pkt, skb, state);
+		nft_set_pktinfo_unspec(&pkt, skb);
 		break;
 	}
 
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index ec47c12cd137..3fa7e1b22bdd 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,8 @@ nft_do_chain_arp(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo_unspec(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_unspec(&pkt, skb);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 2840a29b2e04..35fa265d1ce3 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -24,7 +24,8 @@ static unsigned int nft_do_chain_ipv4(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo_ipv4(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv4(&pkt, skb);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index f5c66a7a4bf2..f2a490981594 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -33,7 +33,8 @@ static unsigned int nft_nat_do_chain(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo_ipv4(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv4(&pkt, skb);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 30493beb611a..fb3d49fb62fe 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -38,7 +38,8 @@ static unsigned int nf_route_table_hook(void *priv,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	nft_set_pktinfo_ipv4(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv4(&pkt, skb);
 
 	mark = skb->mark;
 	iph = ip_hdr(skb);
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index d6e4ba5de916..71bac94770dd 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -22,7 +22,8 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo_ipv6(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv6(&pkt, skb);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 443cd306c0b0..73fe2bd13fcf 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -31,7 +31,8 @@ static unsigned int nft_nat_do_chain(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo_ipv6(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv6(&pkt, skb);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index f2727475895e..11d3c3b9aa18 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -33,7 +33,8 @@ static unsigned int nf_route_table_hook(void *priv,
 	u32 mark, flowlabel;
 	int err;
 
-	nft_set_pktinfo_ipv6(&pkt, skb, state);
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv6(&pkt, skb);
 
 	/* save source/dest address, mark, hoplimit, flowlabel, priority */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 403432988313..3cd127dd2895 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -21,15 +21,17 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 {
 	struct nft_pktinfo pkt;
 
+	nft_set_pktinfo(&pkt, skb, state);
+
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
+		nft_set_pktinfo_ipv4_validate(&pkt, skb);
 		break;
 	case htons(ETH_P_IPV6):
-		nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
+		nft_set_pktinfo_ipv6_validate(&pkt, skb);
 		break;
 	default:
-		nft_set_pktinfo_unspec(&pkt, skb, state);
+		nft_set_pktinfo_unspec(&pkt, skb);
 		break;
 	}
 
-- 
cgit v1.2.3


From 408070d6ee3490da63430bc8ce13348cf2eb47ea Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 24 Nov 2017 13:39:57 +0100
Subject: netfilter: nf_tables: add nft_set_is_anonymous() helper

Add helper function to test for the NFT_SET_ANONYMOUS flag.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 5 +++++
 net/netfilter/nf_tables_api.c     | 8 ++++----
 net/netfilter/nft_dynset.c        | 2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index f6e4325b3306..169b562df226 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -416,6 +416,11 @@ struct nft_set {
 		__attribute__((aligned(__alignof__(u64))));
 };
 
+static inline bool nft_set_is_anonymous(const struct nft_set *set)
+{
+	return set->flags & NFT_SET_ANONYMOUS;
+}
+
 static inline void *nft_set_priv(const struct nft_set *set)
 {
 	return (void *)set->data;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 36d38f8b0284..7bc1b0c92a7f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -774,7 +774,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
 		if (!nft_is_active_next(ctx->net, set))
 			continue;
 
-		if (set->flags & NFT_SET_ANONYMOUS &&
+		if (nft_set_is_anonymous(set) &&
 		    !list_empty(&set->bindings))
 			continue;
 
@@ -3284,7 +3284,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_binding *i;
 	struct nft_set_iter iter;
 
-	if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+	if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
 		return -EBUSY;
 
 	if (binding->flags & NFT_SET_MAP) {
@@ -3319,7 +3319,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 {
 	list_del_rcu(&binding->list);
 
-	if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
+	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
 	    nft_is_active(ctx->net, set))
 		nf_tables_set_destroy(ctx, set);
 }
@@ -5157,7 +5157,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			/* This avoids hitting -EBUSY when deleting the table
 			 * from the transaction.
 			 */
-			if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+			if (nft_set_is_anonymous(nft_trans_set(trans)) &&
 			    !list_empty(&nft_trans_set(trans)->bindings))
 				trans->ctx.table->use--;
 
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 66221ad891a9..ec0fd78231d8 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -184,7 +184,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_DYNSET_EXPR] != NULL) {
 		if (!(set->flags & NFT_SET_EVAL))
 			return -EINVAL;
-		if (!(set->flags & NFT_SET_ANONYMOUS))
+		if (!nft_set_is_anonymous(set))
 			return -EOPNOTSUPP;
 
 		priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
-- 
cgit v1.2.3


From 3d3cdc38e8c265a9f9d3825e823e772872bca1b8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:19:14 +0100
Subject: netfilter: core: add nf_remove_net_hook

Just a cleanup, __nf_unregister_net_hook() is used by a follow up patch
when handling NFPROTO_INET as a real family from the core.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 3508a5c8edbb..9153b6e03f6b 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -360,7 +360,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 EXPORT_SYMBOL(nf_register_net_hook);
 
 /*
- * __nf_unregister_net_hook - remove a hook from blob
+ * nf_remove_net_hook - remove a hook from blob
  *
  * @oldp: current address of hook blob
  * @unreg: hook to unregister
@@ -368,8 +368,8 @@ EXPORT_SYMBOL(nf_register_net_hook);
  * This cannot fail, hook unregistration must always succeed.
  * Therefore replace the to-be-removed hook with a dummy hook.
  */
-static void __nf_unregister_net_hook(struct nf_hook_entries *old,
-				     const struct nf_hook_ops *unreg)
+static void nf_remove_net_hook(struct nf_hook_entries *old,
+			       const struct nf_hook_ops *unreg)
 {
 	struct nf_hook_ops **orig_ops;
 	bool found = false;
@@ -415,7 +415,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 		return;
 	}
 
-	__nf_unregister_net_hook(p, reg);
+	nf_remove_net_hook(p, reg);
 
 	p = __nf_hook_entries_try_shrink(pp);
 	mutex_unlock(&nf_hook_mutex);
-- 
cgit v1.2.3


From 62a0fe46e2aaba1812d3cbcae014a41539f9eb09 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:23:51 +0100
Subject: netfilter: core: pass hook number, family and device to
 nf_find_hook_list()

Instead of passing struct nf_hook_ops, this is needed by follow up
patches to handle NFPROTO_INET from the core.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 9153b6e03f6b..bcbaa78ec374 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -268,36 +268,38 @@ out_assign:
 	return old;
 }
 
-static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg)
+static struct nf_hook_entries __rcu **
+nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
+		   struct net_device *dev)
 {
-	switch (reg->pf) {
+	switch (pf) {
 	case NFPROTO_NETDEV:
 		break;
 #ifdef CONFIG_NETFILTER_FAMILY_ARP
 	case NFPROTO_ARP:
-		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= reg->hooknum))
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum))
 			return NULL;
-		return net->nf.hooks_arp + reg->hooknum;
+		return net->nf.hooks_arp + hooknum;
 #endif
 #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
 	case NFPROTO_BRIDGE:
-		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= reg->hooknum))
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum))
 			return NULL;
-		return net->nf.hooks_bridge + reg->hooknum;
+		return net->nf.hooks_bridge + hooknum;
 #endif
 	case NFPROTO_IPV4:
-		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= reg->hooknum))
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
 			return NULL;
-		return net->nf.hooks_ipv4 + reg->hooknum;
+		return net->nf.hooks_ipv4 + hooknum;
 	case NFPROTO_IPV6:
-		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= reg->hooknum))
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
 			return NULL;
-		return net->nf.hooks_ipv6 + reg->hooknum;
+		return net->nf.hooks_ipv6 + hooknum;
 #if IS_ENABLED(CONFIG_DECNET)
 	case NFPROTO_DECNET:
-		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= reg->hooknum))
+		if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
 			return NULL;
-		return net->nf.hooks_decnet + reg->hooknum;
+		return net->nf.hooks_decnet + hooknum;
 #endif
 	default:
 		WARN_ON_ONCE(1);
@@ -305,9 +307,9 @@ static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const
 	}
 
 #ifdef CONFIG_NETFILTER_INGRESS
-	if (reg->hooknum == NF_NETDEV_INGRESS) {
-		if (reg->dev && dev_net(reg->dev) == net)
-			return &reg->dev->nf_hooks_ingress;
+	if (hooknum == NF_NETDEV_INGRESS) {
+		if (dev && dev_net(dev) == net)
+			return &dev->nf_hooks_ingress;
 	}
 #endif
 	WARN_ON_ONCE(1);
@@ -329,7 +331,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 			return -EINVAL;
 	}
 
-	pp = nf_hook_entry_head(net, reg);
+	pp = nf_hook_entry_head(net, reg->pf, reg->hooknum, reg->dev);
 	if (!pp)
 		return -EINVAL;
 
@@ -403,7 +405,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	struct nf_hook_entries __rcu **pp;
 	struct nf_hook_entries *p;
 
-	pp = nf_hook_entry_head(net, reg);
+	pp = nf_hook_entry_head(net, reg->pf, reg->hooknum, reg->dev);
 	if (!pp)
 		return;
 
-- 
cgit v1.2.3


From 30259408118f550f5969fda19c0d67020d21eda8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:26:37 +0100
Subject: netfilter: core: pass family as parameter to nf_remove_net_hook()

So static_key_slow_dec applies to the family behind NFPROTO_INET.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index bcbaa78ec374..6c9874c8b10a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL(nf_register_net_hook);
  * Therefore replace the to-be-removed hook with a dummy hook.
  */
 static void nf_remove_net_hook(struct nf_hook_entries *old,
-			       const struct nf_hook_ops *unreg)
+			       const struct nf_hook_ops *unreg, int pf)
 {
 	struct nf_hook_ops **orig_ops;
 	bool found = false;
@@ -389,14 +389,14 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
 
 	if (found) {
 #ifdef CONFIG_NETFILTER_INGRESS
-		if (unreg->pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
+		if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
 			net_dec_ingress_queue();
 #endif
 #ifdef HAVE_JUMP_LABEL
-		static_key_slow_dec(&nf_hooks_needed[unreg->pf][unreg->hooknum]);
+		static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]);
 #endif
 	} else {
-		WARN_ONCE(1, "hook not found, pf %d num %d", unreg->pf, unreg->hooknum);
+		WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum);
 	}
 }
 
@@ -417,7 +417,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 		return;
 	}
 
-	nf_remove_net_hook(p, reg);
+	nf_remove_net_hook(p, reg, reg->pf);
 
 	p = __nf_hook_entries_try_shrink(pp);
 	mutex_unlock(&nf_hook_mutex);
-- 
cgit v1.2.3


From cb7ccd835ebb333669e400f99c650e4f3abf11c0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:30:26 +0100
Subject: netfilter: core: support for NFPROTO_INET hook registration

Expand NFPROTO_INET in two hook registrations, one for NFPROTO_IPV4 and
another for NFPROTO_IPV6. Hence, we handle NFPROTO_INET from the core.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 6c9874c8b10a..606efc9b14e1 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -316,12 +316,13 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
 	return NULL;
 }
 
-int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
+static int __nf_register_net_hook(struct net *net, int pf,
+				  const struct nf_hook_ops *reg)
 {
 	struct nf_hook_entries *p, *new_hooks;
 	struct nf_hook_entries __rcu **pp;
 
-	if (reg->pf == NFPROTO_NETDEV) {
+	if (pf == NFPROTO_NETDEV) {
 #ifndef CONFIG_NETFILTER_INGRESS
 		if (reg->hooknum == NF_NETDEV_INGRESS)
 			return -EOPNOTSUPP;
@@ -331,7 +332,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 			return -EINVAL;
 	}
 
-	pp = nf_hook_entry_head(net, reg->pf, reg->hooknum, reg->dev);
+	pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
 	if (!pp)
 		return -EINVAL;
 
@@ -349,17 +350,16 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 
 	hooks_validate(new_hooks);
 #ifdef CONFIG_NETFILTER_INGRESS
-	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+	if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_inc_ingress_queue();
 #endif
 #ifdef HAVE_JUMP_LABEL
-	static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
+	static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
 	BUG_ON(p == new_hooks);
 	nf_hook_entries_free(p);
 	return 0;
 }
-EXPORT_SYMBOL(nf_register_net_hook);
 
 /*
  * nf_remove_net_hook - remove a hook from blob
@@ -400,12 +400,13 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
 	}
 }
 
-void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
+void __nf_unregister_net_hook(struct net *net, int pf,
+			      const struct nf_hook_ops *reg)
 {
 	struct nf_hook_entries __rcu **pp;
 	struct nf_hook_entries *p;
 
-	pp = nf_hook_entry_head(net, reg->pf, reg->hooknum, reg->dev);
+	pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
 	if (!pp)
 		return;
 
@@ -417,7 +418,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 		return;
 	}
 
-	nf_remove_net_hook(p, reg, reg->pf);
+	nf_remove_net_hook(p, reg, pf);
 
 	p = __nf_hook_entries_try_shrink(pp);
 	mutex_unlock(&nf_hook_mutex);
@@ -427,8 +428,42 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	nf_queue_nf_hook_drop(net);
 	nf_hook_entries_free(p);
 }
+
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
+{
+	if (reg->pf == NFPROTO_INET) {
+		__nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+		__nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+	} else {
+		__nf_unregister_net_hook(net, reg->pf, reg);
+	}
+}
 EXPORT_SYMBOL(nf_unregister_net_hook);
 
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
+{
+	int err;
+
+	if (reg->pf == NFPROTO_INET) {
+		err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
+		if (err < 0)
+			return err;
+
+		err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
+		if (err < 0) {
+			__nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+			return err;
+		}
+	} else {
+		err = __nf_register_net_hook(net, reg->pf, reg);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(nf_register_net_hook);
+
 int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
 			  unsigned int n)
 {
-- 
cgit v1.2.3


From 12355d3670dac0dde5aae3deefb59f8cc0a9ed2a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:36:24 +0100
Subject: netfilter: nf_tables_inet: don't use multihook infrastructure anymore

Use the new native NFPROTO_INET support in the netfilter core; this gets rid
of ad-hoc code in the nf_tables API codebase.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_ipv4.h |  2 -
 include/net/netfilter/nf_tables_ipv6.h |  2 -
 net/ipv4/netfilter/nf_tables_ipv4.c    |  3 +-
 net/ipv6/netfilter/nf_tables_ipv6.c    |  3 +-
 net/netfilter/nf_tables_inet.c         | 70 ++++++++++++++++++++++++++++------
 5 files changed, 60 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
index b2deeb2755a4..ed7b511f0a59 100644
--- a/include/net/netfilter/nf_tables_ipv4.h
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -53,6 +53,4 @@ static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
 		nft_set_pktinfo_unspec(pkt, skb);
 }
 
-extern struct nft_af_info nft_af_ipv4;
-
 #endif
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index 1890c5bc3c3c..dabe6fdb553a 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -69,6 +69,4 @@ static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
 		nft_set_pktinfo_unspec(pkt, skb);
 }
 
-extern struct nft_af_info nft_af_ipv6;
-
 #endif
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 35fa265d1ce3..b6223f4b1315 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -45,7 +45,7 @@ static unsigned int nft_ipv4_output(void *priv,
 	return nft_do_chain_ipv4(priv, skb, state);
 }
 
-struct nft_af_info nft_af_ipv4 __read_mostly = {
+static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.family		= NFPROTO_IPV4,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
@@ -58,7 +58,6 @@ struct nft_af_info nft_af_ipv4 __read_mostly = {
 		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
 	},
 };
-EXPORT_SYMBOL_GPL(nft_af_ipv4);
 
 static int nf_tables_ipv4_init_net(struct net *net)
 {
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 71bac94770dd..b1b5d3824fc1 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -42,7 +42,7 @@ static unsigned int nft_ipv6_output(void *priv,
 	return nft_do_chain_ipv6(priv, skb, state);
 }
 
-struct nft_af_info nft_af_ipv6 __read_mostly = {
+static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.family		= NFPROTO_IPV6,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
@@ -55,7 +55,6 @@ struct nft_af_info nft_af_ipv6 __read_mostly = {
 		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
 	},
 };
-EXPORT_SYMBOL_GPL(nft_af_ipv6);
 
 static int nf_tables_ipv6_init_net(struct net *net)
 {
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index f713cc205669..c6194b3509aa 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_tables.h>
@@ -16,26 +17,71 @@
 #include <net/netfilter/nf_tables_ipv6.h>
 #include <net/ip.h>
 
-static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n)
+static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
+				      const struct nf_hook_state *state)
 {
-	struct nft_af_info *afi;
-
-	if (n == 1)
-		afi = &nft_af_ipv4;
-	else
-		afi = &nft_af_ipv6;
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+
+	switch (state->pf) {
+	case NFPROTO_IPV4:
+		nft_set_pktinfo_ipv4(&pkt, skb);
+		break;
+	case NFPROTO_IPV6:
+		nft_set_pktinfo_ipv6(&pkt, skb);
+		break;
+	default:
+		break;
+	}
+
+	return nft_do_chain(&pkt, priv);
+}
 
-	ops->pf = afi->family;
-	if (afi->hooks[ops->hooknum])
-		ops->hook = afi->hooks[ops->hooknum];
+static unsigned int nft_inet_output(void *priv, struct sk_buff *skb,
+				    const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+
+	switch (state->pf) {
+	case NFPROTO_IPV4:
+		if (unlikely(skb->len < sizeof(struct iphdr) ||
+			     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+			if (net_ratelimit())
+				pr_info("ignoring short SOCK_RAW packet\n");
+			return NF_ACCEPT;
+		}
+		nft_set_pktinfo_ipv4(&pkt, skb);
+		break;
+	case NFPROTO_IPV6:
+	        if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
+			if (net_ratelimit())
+				pr_info("ignoring short SOCK_RAW packet\n");
+			return NF_ACCEPT;
+		}
+		nft_set_pktinfo_ipv6(&pkt, skb);
+		break;
+	default:
+		break;
+	}
+
+	return nft_do_chain(&pkt, priv);
 }
 
 static struct nft_af_info nft_af_inet __read_mostly = {
 	.family		= NFPROTO_INET,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 2,
-	.hook_ops_init	= nft_inet_hook_ops_init,
+	.nops		= 1,
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
+		[NF_INET_LOCAL_OUT]	= nft_inet_output,
+		[NF_INET_FORWARD]	= nft_do_chain_inet,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
+        },
 };
 
 static int __net_init nf_tables_inet_init_net(struct net *net)
-- 
cgit v1.2.3


From c974a3a36468d1947c96f0c694c8a1b2e7810043 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:40:25 +0100
Subject: netfilter: nf_tables: remove multihook chains and families

Since NFPROTO_INET is handled from the core, we don't need to maintain
extra infrastructure in nf_tables to handle the double hook
registration, one for IPv4 and another for IPv6.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       |   9 +--
 net/bridge/netfilter/nf_tables_bridge.c |   1 -
 net/ipv4/netfilter/nf_tables_arp.c      |   1 -
 net/ipv4/netfilter/nf_tables_ipv4.c     |   1 -
 net/ipv6/netfilter/nf_tables_ipv6.c     |   1 -
 net/netfilter/nf_tables_api.c           | 102 ++++++++++++++------------------
 net/netfilter/nf_tables_inet.c          |   1 -
 net/netfilter/nf_tables_netdev.c        |   3 +-
 net/netfilter/nft_compat.c              |   8 +--
 9 files changed, 49 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 169b562df226..a3560fd55f99 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -902,8 +902,6 @@ struct nft_stats {
 	struct u64_stats_sync	syncp;
 };
 
-#define NFT_HOOK_OPS_MAX		2
-
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
@@ -915,7 +913,7 @@ struct nft_stats {
  *	@dev_name: device name that this base chain is attached to (if any)
  */
 struct nft_base_chain {
-	struct nf_hook_ops		ops[NFT_HOOK_OPS_MAX];
+	struct nf_hook_ops		ops;
 	const struct nf_chain_type	*type;
 	u8				policy;
 	u8				flags;
@@ -976,8 +974,6 @@ enum nft_af_flags {
  *	@owner: module owner
  *	@tables: used internally
  *	@flags: family flags
- *	@nops: number of hook ops in this family
- *	@hook_ops_init: initialization function for chain hook ops
  *	@hooks: hookfn overrides for packet validation
  */
 struct nft_af_info {
@@ -987,9 +983,6 @@ struct nft_af_info {
 	struct module			*owner;
 	struct list_head		tables;
 	u32				flags;
-	unsigned int			nops;
-	void				(*hook_ops_init)(struct nf_hook_ops *,
-							 unsigned int);
 	nf_hookfn			*hooks[NF_MAX_HOOKS];
 };
 
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 612bfd0737d5..991d0abb46aa 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -46,7 +46,6 @@ static struct nft_af_info nft_af_bridge __read_mostly = {
 	.family		= NFPROTO_BRIDGE,
 	.nhooks		= NF_BR_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 1,
 	.hooks		= {
 		[NF_BR_PRE_ROUTING]	= nft_do_chain_bridge,
 		[NF_BR_LOCAL_IN]	= nft_do_chain_bridge,
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 3fa7e1b22bdd..3090f639bd89 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -31,7 +31,6 @@ static struct nft_af_info nft_af_arp __read_mostly = {
 	.family		= NFPROTO_ARP,
 	.nhooks		= NF_ARP_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 1,
 	.hooks		= {
 		[NF_ARP_IN]		= nft_do_chain_arp,
 		[NF_ARP_OUT]		= nft_do_chain_arp,
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index b6223f4b1315..51b363abd541 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -49,7 +49,6 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.family		= NFPROTO_IPV4,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 1,
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
 		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index b1b5d3824fc1..78d34a2f3347 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -46,7 +46,6 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.family		= NFPROTO_IPV6,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 1,
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
 		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7bc1b0c92a7f..06fae437c9cb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -139,29 +139,26 @@ static void nft_trans_destroy(struct nft_trans *trans)
 	kfree(trans);
 }
 
-static int nf_tables_register_hooks(struct net *net,
-				    const struct nft_table *table,
-				    struct nft_chain *chain,
-				    unsigned int hook_nops)
+static int nf_tables_register_hook(struct net *net,
+				   const struct nft_table *table,
+				   struct nft_chain *chain)
 {
 	if (table->flags & NFT_TABLE_F_DORMANT ||
 	    !nft_is_base_chain(chain))
 		return 0;
 
-	return nf_register_net_hooks(net, nft_base_chain(chain)->ops,
-				     hook_nops);
+	return nf_register_net_hook(net, &nft_base_chain(chain)->ops);
 }
 
-static void nf_tables_unregister_hooks(struct net *net,
-				       const struct nft_table *table,
-				       struct nft_chain *chain,
-				       unsigned int hook_nops)
+static void nf_tables_unregister_hook(struct net *net,
+				      const struct nft_table *table,
+				      struct nft_chain *chain)
 {
 	if (table->flags & NFT_TABLE_F_DORMANT ||
 	    !nft_is_base_chain(chain))
 		return;
 
-	nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops);
+	nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -595,8 +592,7 @@ static void _nf_tables_table_disable(struct net *net,
 		if (cnt && i++ == cnt)
 			break;
 
-		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
-					afi->nops);
+		nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
 	}
 }
 
@@ -613,8 +609,7 @@ static int nf_tables_table_enable(struct net *net,
 		if (!nft_is_base_chain(chain))
 			continue;
 
-		err = nf_register_net_hooks(net, nft_base_chain(chain)->ops,
-					    afi->nops);
+		err = nf_register_net_hook(net, &nft_base_chain(chain)->ops);
 		if (err < 0)
 			goto err;
 
@@ -1026,7 +1021,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 
 	if (nft_is_base_chain(chain)) {
 		const struct nft_base_chain *basechain = nft_base_chain(chain);
-		const struct nf_hook_ops *ops = &basechain->ops[0];
+		const struct nf_hook_ops *ops = &basechain->ops;
 		struct nlattr *nest;
 
 		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
@@ -1252,8 +1247,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
 		free_percpu(basechain->stats);
 		if (basechain->stats)
 			static_branch_dec(&nft_counters_enabled);
-		if (basechain->ops[0].dev != NULL)
-			dev_put(basechain->ops[0].dev);
+		if (basechain->ops.dev != NULL)
+			dev_put(basechain->ops.dev);
 		kfree(chain->name);
 		kfree(basechain);
 	} else {
@@ -1354,7 +1349,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	struct nft_stats __percpu *stats;
 	struct net *net = ctx->net;
 	struct nft_chain *chain;
-	unsigned int i;
 	int err;
 
 	if (table->use == UINT_MAX)
@@ -1393,21 +1387,18 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		basechain->type = hook.type;
 		chain = &basechain->chain;
 
-		for (i = 0; i < afi->nops; i++) {
-			ops = &basechain->ops[i];
-			ops->pf		= family;
-			ops->hooknum	= hook.num;
-			ops->priority	= hook.priority;
-			ops->priv	= chain;
-			ops->hook	= afi->hooks[ops->hooknum];
-			ops->dev	= hook.dev;
-			if (hookfn)
-				ops->hook = hookfn;
-			if (afi->hook_ops_init)
-				afi->hook_ops_init(ops, i);
-			if (basechain->type->type == NFT_CHAIN_T_NAT)
-				ops->nat_hook = true;
-		}
+		ops		= &basechain->ops;
+		ops->pf		= family;
+		ops->hooknum	= hook.num;
+		ops->priority	= hook.priority;
+		ops->priv	= chain;
+		ops->hook	= afi->hooks[ops->hooknum];
+		ops->dev	= hook.dev;
+		if (hookfn)
+			ops->hook = hookfn;
+
+		if (basechain->type->type == NFT_CHAIN_T_NAT)
+			ops->nat_hook = true;
 
 		chain->flags |= NFT_BASE_CHAIN;
 		basechain->policy = policy;
@@ -1425,7 +1416,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		goto err1;
 	}
 
-	err = nf_tables_register_hooks(net, table, chain, afi->nops);
+	err = nf_tables_register_hook(net, table, chain);
 	if (err < 0)
 		goto err1;
 
@@ -1439,7 +1430,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 
 	return 0;
 err2:
-	nf_tables_unregister_hooks(net, table, chain, afi->nops);
+	nf_tables_unregister_hook(net, table, chain);
 err1:
 	nf_tables_chain_destroy(chain);
 
@@ -1452,14 +1443,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_table *table = ctx->table;
 	struct nft_chain *chain = ctx->chain;
-	struct nft_af_info *afi = ctx->afi;
 	struct nft_base_chain *basechain;
 	struct nft_stats *stats = NULL;
 	struct nft_chain_hook hook;
 	const struct nlattr *name;
 	struct nf_hook_ops *ops;
 	struct nft_trans *trans;
-	int err, i;
+	int err;
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		if (!nft_is_base_chain(chain))
@@ -1476,14 +1466,12 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			return -EBUSY;
 		}
 
-		for (i = 0; i < afi->nops; i++) {
-			ops = &basechain->ops[i];
-			if (ops->hooknum != hook.num ||
-			    ops->priority != hook.priority ||
-			    ops->dev != hook.dev) {
-				nft_chain_release_hook(&hook);
-				return -EBUSY;
-			}
+		ops = &basechain->ops;
+		if (ops->hooknum != hook.num ||
+		    ops->priority != hook.priority ||
+		    ops->dev != hook.dev) {
+			nft_chain_release_hook(&hook);
+			return -EBUSY;
 		}
 		nft_chain_release_hook(&hook);
 	}
@@ -5134,10 +5122,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_DELCHAIN:
 			list_del_rcu(&trans->ctx.chain->list);
 			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
-			nf_tables_unregister_hooks(trans->ctx.net,
-						   trans->ctx.table,
-						   trans->ctx.chain,
-						   trans->ctx.afi->nops);
+			nf_tables_unregister_hook(trans->ctx.net,
+						  trans->ctx.table,
+						  trans->ctx.chain);
 			break;
 		case NFT_MSG_NEWRULE:
 			nft_clear(trans->ctx.net, nft_trans_rule(trans));
@@ -5274,10 +5261,9 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			} else {
 				trans->ctx.table->use--;
 				list_del_rcu(&trans->ctx.chain->list);
-				nf_tables_unregister_hooks(trans->ctx.net,
-							   trans->ctx.table,
-							   trans->ctx.chain,
-							   trans->ctx.afi->nops);
+				nf_tables_unregister_hook(trans->ctx.net,
+							  trans->ctx.table,
+							  trans->ctx.chain);
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
@@ -5378,7 +5364,7 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
 	if (nft_is_base_chain(chain)) {
 		basechain = nft_base_chain(chain);
 
-		if ((1 << basechain->ops[0].hooknum) & hook_flags)
+		if ((1 << basechain->ops.hooknum) & hook_flags)
 			return 0;
 
 		return -EOPNOTSUPP;
@@ -5866,8 +5852,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 
 	BUG_ON(!nft_is_base_chain(ctx->chain));
 
-	nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain,
-				   ctx->afi->nops);
+	nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
 	list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
 		list_del(&rule->list);
 		ctx->chain->use--;
@@ -5896,8 +5881,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 
 	list_for_each_entry_safe(table, nt, &afi->tables, list) {
 		list_for_each_entry(chain, &table->chains, list)
-			nf_tables_unregister_hooks(net, table, chain,
-						   afi->nops);
+			nf_tables_unregister_hook(net, table, chain);
 		/* No packets are walking on these chains anymore. */
 		ctx.table = table;
 		list_for_each_entry(chain, &table->chains, list) {
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index c6194b3509aa..edd7829a5753 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -74,7 +74,6 @@ static struct nft_af_info nft_af_inet __read_mostly = {
 	.family		= NFPROTO_INET,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.nops		= 1,
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
 		[NF_INET_LOCAL_OUT]	= nft_inet_output,
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 3cd127dd2895..018e2c5b4a49 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -43,7 +43,6 @@ static struct nft_af_info nft_af_netdev __read_mostly = {
 	.nhooks		= NF_NETDEV_NUMHOOKS,
 	.owner		= THIS_MODULE,
 	.flags		= NFT_AF_NEEDS_DEV,
-	.nops		= 1,
 	.hooks		= {
 		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
 	},
@@ -98,7 +97,7 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
 		__nft_release_basechain(ctx);
 		break;
 	case NETDEV_CHANGENAME:
-		if (dev->ifindex != basechain->ops[0].dev->ifindex)
+		if (dev->ifindex != basechain->ops.dev->ifindex)
 			return;
 
 		strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index b89f4f65b2a0..dcff0dc8d28b 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -169,7 +169,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 	if (nft_is_base_chain(ctx->chain)) {
 		const struct nft_base_chain *basechain =
 						nft_base_chain(ctx->chain);
-		const struct nf_hook_ops *ops = &basechain->ops[0];
+		const struct nf_hook_ops *ops = &basechain->ops;
 
 		par->hook_mask = 1 << ops->hooknum;
 	} else {
@@ -302,7 +302,7 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 	if (nft_is_base_chain(ctx->chain)) {
 		const struct nft_base_chain *basechain =
 						nft_base_chain(ctx->chain);
-		const struct nf_hook_ops *ops = &basechain->ops[0];
+		const struct nf_hook_ops *ops = &basechain->ops;
 
 		hook_mask = 1 << ops->hooknum;
 		if (target->hooks && !(hook_mask & target->hooks))
@@ -383,7 +383,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 	if (nft_is_base_chain(ctx->chain)) {
 		const struct nft_base_chain *basechain =
 						nft_base_chain(ctx->chain);
-		const struct nf_hook_ops *ops = &basechain->ops[0];
+		const struct nf_hook_ops *ops = &basechain->ops;
 
 		par->hook_mask = 1 << ops->hooknum;
 	} else {
@@ -481,7 +481,7 @@ static int nft_match_validate(const struct nft_ctx *ctx,
 	if (nft_is_base_chain(ctx->chain)) {
 		const struct nft_base_chain *basechain =
 						nft_base_chain(ctx->chain);
-		const struct nf_hook_ops *ops = &basechain->ops[0];
+		const struct nf_hook_ops *ops = &basechain->ops;
 
 		hook_mask = 1 << ops->hooknum;
 		if (match->hooks && !(hook_mask & match->hooks))
-- 
cgit v1.2.3


From c2f9eafee9aaeedaad9eadbf47913f4681d723df Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 15:43:17 +0100
Subject: netfilter: nf_tables: remove hooks from family definition

The hooks don't belong to the family definition; move them to the filter
chain type definition instead.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       |  4 +---
 net/bridge/netfilter/nf_tables_bridge.c | 14 +++++++-------
 net/ipv4/netfilter/nf_tables_arp.c      |  8 ++++----
 net/ipv4/netfilter/nf_tables_ipv4.c     | 14 +++++++-------
 net/ipv6/netfilter/nf_tables_ipv6.c     | 14 +++++++-------
 net/netfilter/nf_tables_api.c           |  6 +-----
 net/netfilter/nf_tables_inet.c          | 14 +++++++-------
 net/netfilter/nf_tables_netdev.c        |  6 +++---
 8 files changed, 37 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index a3560fd55f99..e040b6151acc 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -880,7 +880,7 @@ enum nft_chain_type {
  * 	@family: address family
  * 	@owner: module owner
  * 	@hook_mask: mask of valid hooks
- * 	@hooks: hookfn overrides
+ * 	@hooks: array of hook functions
  */
 struct nf_chain_type {
 	const char			*name;
@@ -974,7 +974,6 @@ enum nft_af_flags {
  *	@owner: module owner
  *	@tables: used internally
  *	@flags: family flags
- *	@hooks: hookfn overrides for packet validation
  */
 struct nft_af_info {
 	struct list_head		list;
@@ -983,7 +982,6 @@ struct nft_af_info {
 	struct module			*owner;
 	struct list_head		tables;
 	u32				flags;
-	nf_hookfn			*hooks[NF_MAX_HOOKS];
 };
 
 int nft_register_afinfo(struct net *, struct nft_af_info *);
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 991d0abb46aa..74260ffec74d 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -46,13 +46,6 @@ static struct nft_af_info nft_af_bridge __read_mostly = {
 	.family		= NFPROTO_BRIDGE,
 	.nhooks		= NF_BR_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.hooks		= {
-		[NF_BR_PRE_ROUTING]	= nft_do_chain_bridge,
-		[NF_BR_LOCAL_IN]	= nft_do_chain_bridge,
-		[NF_BR_FORWARD]		= nft_do_chain_bridge,
-		[NF_BR_LOCAL_OUT]	= nft_do_chain_bridge,
-		[NF_BR_POST_ROUTING]	= nft_do_chain_bridge,
-	},
 };
 
 static int nf_tables_bridge_init_net(struct net *net)
@@ -93,6 +86,13 @@ static const struct nf_chain_type filter_bridge = {
 			  (1 << NF_BR_FORWARD) |
 			  (1 << NF_BR_LOCAL_OUT) |
 			  (1 << NF_BR_POST_ROUTING),
+	.hooks		= {
+		[NF_BR_PRE_ROUTING]	= nft_do_chain_bridge,
+		[NF_BR_LOCAL_IN]	= nft_do_chain_bridge,
+		[NF_BR_FORWARD]		= nft_do_chain_bridge,
+		[NF_BR_LOCAL_OUT]	= nft_do_chain_bridge,
+		[NF_BR_POST_ROUTING]	= nft_do_chain_bridge,
+	},
 };
 
 static void nf_br_saveroute(const struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 3090f639bd89..f84c17763f6f 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -31,10 +31,6 @@ static struct nft_af_info nft_af_arp __read_mostly = {
 	.family		= NFPROTO_ARP,
 	.nhooks		= NF_ARP_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.hooks		= {
-		[NF_ARP_IN]		= nft_do_chain_arp,
-		[NF_ARP_OUT]		= nft_do_chain_arp,
-	},
 };
 
 static int nf_tables_arp_init_net(struct net *net)
@@ -72,6 +68,10 @@ static const struct nf_chain_type filter_arp = {
 	.owner		= THIS_MODULE,
 	.hook_mask	= (1 << NF_ARP_IN) |
 			  (1 << NF_ARP_OUT),
+	.hooks		= {
+		[NF_ARP_IN]		= nft_do_chain_arp,
+		[NF_ARP_OUT]		= nft_do_chain_arp,
+	},
 };
 
 static int __init nf_tables_arp_init(void)
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 51b363abd541..8aeb15c2b9b2 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -49,13 +49,6 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.family		= NFPROTO_IPV4,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
-		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
-		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
-	},
 };
 
 static int nf_tables_ipv4_init_net(struct net *net)
@@ -96,6 +89,13 @@ static const struct nf_chain_type filter_ipv4 = {
 			  (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
+	},
 };
 
 static int __init nf_tables_ipv4_init(void)
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 78d34a2f3347..d4c9ef030e4f 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -46,13 +46,6 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.family		= NFPROTO_IPV6,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
-		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
-		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
-	},
 };
 
 static int nf_tables_ipv6_init_net(struct net *net)
@@ -93,6 +86,13 @@ static const struct nf_chain_type filter_ipv6 = {
 			  (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
+	},
 };
 
 static int __init nf_tables_ipv6_init(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 06fae437c9cb..15773a3189ce 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1357,7 +1357,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nft_chain_hook hook;
 		struct nf_hook_ops *ops;
-		nf_hookfn *hookfn;
 
 		err = nft_chain_parse_hook(net, nla, afi, &hook, create);
 		if (err < 0)
@@ -1383,7 +1382,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			static_branch_inc(&nft_counters_enabled);
 		}
 
-		hookfn = hook.type->hooks[hook.num];
 		basechain->type = hook.type;
 		chain = &basechain->chain;
 
@@ -1392,10 +1390,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		ops->hooknum	= hook.num;
 		ops->priority	= hook.priority;
 		ops->priv	= chain;
-		ops->hook	= afi->hooks[ops->hooknum];
+		ops->hook	= hook.type->hooks[ops->hooknum];
 		ops->dev	= hook.dev;
-		if (hookfn)
-			ops->hook = hookfn;
 
 		if (basechain->type->type == NFT_CHAIN_T_NAT)
 			ops->nat_hook = true;
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index edd7829a5753..313987e2b1fe 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -74,13 +74,6 @@ static struct nft_af_info nft_af_inet __read_mostly = {
 	.family		= NFPROTO_INET,
 	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
-		[NF_INET_LOCAL_OUT]	= nft_inet_output,
-		[NF_INET_FORWARD]	= nft_do_chain_inet,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
-        },
 };
 
 static int __net_init nf_tables_inet_init_net(struct net *net)
@@ -121,6 +114,13 @@ static const struct nf_chain_type filter_inet = {
 			  (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
+		[NF_INET_LOCAL_OUT]	= nft_inet_output,
+		[NF_INET_FORWARD]	= nft_do_chain_inet,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
+        },
 };
 
 static int __init nf_tables_inet_init(void)
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 018e2c5b4a49..42f6f6d42a6d 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -43,9 +43,6 @@ static struct nft_af_info nft_af_netdev __read_mostly = {
 	.nhooks		= NF_NETDEV_NUMHOOKS,
 	.owner		= THIS_MODULE,
 	.flags		= NFT_AF_NEEDS_DEV,
-	.hooks		= {
-		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
-	},
 };
 
 static int nf_tables_netdev_init_net(struct net *net)
@@ -82,6 +79,9 @@ static const struct nf_chain_type nft_filter_chain_netdev = {
 	.family		= NFPROTO_NETDEV,
 	.owner		= THIS_MODULE,
 	.hook_mask	= (1 << NF_NETDEV_INGRESS),
+	.hooks		= {
+		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
+	},
 };
 
 static void nft_netdev_event(unsigned long event, struct net_device *dev,
-- 
cgit v1.2.3


From 625c556118f3c2fd28bb8ef6da18c53bd4037be4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 9 Dec 2017 21:01:08 +0100
Subject: netfilter: connlimit: split xt_connlimit into front and backend

This allows the xt_connlimit infrastructure to be reused from nf_tables.
The upcoming nf_tables frontend can just pass in an nftables register
as the input key; this allows limiting by any nft-supported key,
including concatenations.

For xt_connlimit, pass in the zone and the ip/ipv6 address.

With help from Yi-Hung Wei.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h  |  17 ++
 include/uapi/linux/netfilter/xt_connlimit.h |   2 +-
 net/netfilter/Kconfig                       |   3 +
 net/netfilter/Makefile                      |   2 +
 net/netfilter/nf_conncount.c                | 373 ++++++++++++++++++++++++++++
 net/netfilter/xt_connlimit.c                | 369 ++-------------------------
 6 files changed, 420 insertions(+), 346 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_count.h
 create mode 100644 net/netfilter/nf_conncount.c

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
new file mode 100644
index 000000000000..adf8db44cf86
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -0,0 +1,17 @@
+#ifndef _NF_CONNTRACK_COUNT_H
+#define _NF_CONNTRACK_COUNT_H
+
+struct nf_conncount_data;
+
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
+					    unsigned int keylen);
+void nf_conncount_destroy(struct net *net, unsigned int family,
+			  struct nf_conncount_data *data);
+
+unsigned int nf_conncount_count(struct net *net,
+				struct nf_conncount_data *data,
+				const u32 *key,
+				unsigned int family,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone);
+#endif
diff --git a/include/uapi/linux/netfilter/xt_connlimit.h b/include/uapi/linux/netfilter/xt_connlimit.h
index 07e5e9d47882..d4d1943dcd11 100644
--- a/include/uapi/linux/netfilter/xt_connlimit.h
+++ b/include/uapi/linux/netfilter/xt_connlimit.h
@@ -27,7 +27,7 @@ struct xt_connlimit_info {
 	__u32 flags;
 
 	/* Used internally by the kernel */
-	struct xt_connlimit_data *data __attribute__((aligned(8)));
+	struct nf_conncount_data *data __attribute__((aligned(8)));
 };
 
 #endif /* _XT_CONNLIMIT_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 263609a7e010..af3d9f721b3f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -68,6 +68,8 @@ config NF_LOG_NETDEV
 	select NF_LOG_COMMON
 
 if NF_CONNTRACK
+config NETFILTER_CONNCOUNT
+	tristate
 
 config NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
@@ -1126,6 +1128,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT
 	tristate '"connlimit" match support'
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
+	select NETFILTER_CONNCOUNT
 	---help---
 	  This match allows you to match against the number of parallel
 	  connections to a server per client IP address (or address block).
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index f78ed2470831..490a55e7166d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -67,6 +67,8 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 # SYNPROXY
 obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
 
+obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o
+
 # generic packet duplication from netdev family
 obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
new file mode 100644
index 000000000000..a95518261168
--- /dev/null
+++ b/net/netfilter/nf_conncount.c
@@ -0,0 +1,373 @@
+/*
+ * count the number of connections matching an arbitrary key.
+ *
+ * (C) 2017 Red Hat GmbH
+ * Author: Florian Westphal <fw@strlen.de>
+ *
+ * split from xt_connlimit.c:
+ *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
+ *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
+ *		only ignore TIME_WAIT or gone connections
+ *   (C) CC Computer Consultants GmbH, 2007
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+#define CONNCOUNT_SLOTS		256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNCOUNT_LOCK_SLOTS	8U
+#else
+#define CONNCOUNT_LOCK_SLOTS	256U
+#endif
+
+#define CONNCOUNT_GC_MAX_NODES	8
+#define MAX_KEYLEN		5
+
+/* we will save the tuples of all connections we care about */
+struct nf_conncount_tuple {
+	struct hlist_node		node;
+	struct nf_conntrack_tuple	tuple;
+};
+
+struct nf_conncount_rb {
+	struct rb_node node;
+	struct hlist_head hhead; /* connections/hosts in same subnet */
+	u32 key[MAX_KEYLEN];
+};
+
+static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
+
+struct nf_conncount_data {
+	unsigned int keylen;
+	struct rb_root root[CONNCOUNT_SLOTS];
+};
+
+static u_int32_t conncount_rnd __read_mostly;
+static struct kmem_cache *conncount_rb_cachep __read_mostly;
+static struct kmem_cache *conncount_conn_cachep __read_mostly;
+
+static inline bool already_closed(const struct nf_conn *conn)
+{
+	if (nf_ct_protonum(conn) == IPPROTO_TCP)
+		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
+	else
+		return 0;
+}
+
+static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
+{
+	return memcmp(a, b, klen * sizeof(u32));
+}
+
+static bool add_hlist(struct hlist_head *head,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conncount_tuple *conn;
+
+	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL)
+		return false;
+	conn->tuple = *tuple;
+	hlist_add_head(&conn->node, head);
+	return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+				struct hlist_head *head,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone,
+				bool *addit)
+{
+	const struct nf_conntrack_tuple_hash *found;
+	struct nf_conncount_tuple *conn;
+	struct hlist_node *n;
+	struct nf_conn *found_ct;
+	unsigned int length = 0;
+
+	*addit = true;
+
+	/* check the saved connections */
+	hlist_for_each_entry_safe(conn, n, head, node) {
+		found = nf_conntrack_find_get(net, zone, &conn->tuple);
+		if (found == NULL) {
+			hlist_del(&conn->node);
+			kmem_cache_free(conncount_conn_cachep, conn);
+			continue;
+		}
+
+		found_ct = nf_ct_tuplehash_to_ctrack(found);
+
+		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
+			/*
+			 * Just to be sure we have it only once in the list.
+			 * We should not see tuples twice unless someone hooks
+			 * this into a table without "-p tcp --syn".
+			 */
+			*addit = false;
+		} else if (already_closed(found_ct)) {
+			/*
+			 * we do not care about connections which are
+			 * closed already -> ditch it
+			 */
+			nf_ct_put(found_ct);
+			hlist_del(&conn->node);
+			kmem_cache_free(conncount_conn_cachep, conn);
+			continue;
+		}
+
+		nf_ct_put(found_ct);
+		length++;
+	}
+
+	return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+			    struct nf_conncount_rb *gc_nodes[],
+			    unsigned int gc_count)
+{
+	struct nf_conncount_rb *rbconn;
+
+	while (gc_count) {
+		rbconn = gc_nodes[--gc_count];
+		rb_erase(&rbconn->node, root);
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+	}
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+	   const u32 *key, u8 keylen,
+	   u8 family,
+	   const struct nf_conntrack_tuple *tuple,
+	   const struct nf_conntrack_zone *zone)
+{
+	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+	struct rb_node **rbnode, *parent;
+	struct nf_conncount_rb *rbconn;
+	struct nf_conncount_tuple *conn;
+	unsigned int gc_count;
+	bool no_gc = false;
+
+ restart:
+	gc_count = 0;
+	parent = NULL;
+	rbnode = &(root->rb_node);
+	while (*rbnode) {
+		int diff;
+		bool addit;
+
+		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
+
+		parent = *rbnode;
+		diff = key_diff(key, rbconn->key, keylen);
+		if (diff < 0) {
+			rbnode = &((*rbnode)->rb_left);
+		} else if (diff > 0) {
+			rbnode = &((*rbnode)->rb_right);
+		} else {
+			/* same source network -> be counted! */
+			unsigned int count;
+			count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+			tree_nodes_free(root, gc_nodes, gc_count);
+			if (!addit)
+				return count;
+
+			if (!add_hlist(&rbconn->hhead, tuple))
+				return 0; /* hotdrop */
+
+			return count + 1;
+		}
+
+		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+			continue;
+
+		/* only used for GC on hhead, retval and 'addit' ignored */
+		check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+		if (hlist_empty(&rbconn->hhead))
+			gc_nodes[gc_count++] = rbconn;
+	}
+
+	if (gc_count) {
+		no_gc = true;
+		tree_nodes_free(root, gc_nodes, gc_count);
+		/* tree_node_free before new allocation permits
+		 * allocator to re-use newly free'd object.
+		 *
+		 * This is a rare event; in most cases we will find
+		 * existing node to re-use. (or gc_count is 0).
+		 */
+		goto restart;
+	}
+
+	/* no match, need to insert new node */
+	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+	if (rbconn == NULL)
+		return 0;
+
+	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL) {
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+		return 0;
+	}
+
+	conn->tuple = *tuple;
+	memcpy(rbconn->key, key, sizeof(u32) * keylen);
+
+	INIT_HLIST_HEAD(&rbconn->hhead);
+	hlist_add_head(&conn->node, &rbconn->hhead);
+
+	rb_link_node(&rbconn->node, parent, rbnode);
+	rb_insert_color(&rbconn->node, root);
+	return 1;
+}
+
+unsigned int nf_conncount_count(struct net *net,
+				struct nf_conncount_data *data,
+				const u32 *key,
+				unsigned int family,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone)
+{
+	struct rb_root *root;
+	int count;
+	u32 hash;
+
+	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
+	root = &data->root[hash];
+
+	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+
+	count = count_tree(net, root, key, data->keylen, family, tuple, zone);
+
+	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_count);
+
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
+					    unsigned int keylen)
+{
+	struct nf_conncount_data *data;
+	int ret, i;
+
+	if (keylen % sizeof(u32) ||
+	    keylen / sizeof(u32) > MAX_KEYLEN ||
+	    keylen == 0)
+		return ERR_PTR(-EINVAL);
+
+	net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	ret = nf_ct_netns_get(net, family);
+	if (ret < 0) {
+		kfree(data);
+		return ERR_PTR(ret);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
+		data->root[i] = RB_ROOT;
+
+	data->keylen = keylen / sizeof(u32);
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_init);
+
+static void destroy_tree(struct rb_root *r)
+{
+	struct nf_conncount_tuple *conn;
+	struct nf_conncount_rb *rbconn;
+	struct hlist_node *n;
+	struct rb_node *node;
+
+	while ((node = rb_first(r)) != NULL) {
+		rbconn = rb_entry(node, struct nf_conncount_rb, node);
+
+		rb_erase(node, r);
+
+		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+			kmem_cache_free(conncount_conn_cachep, conn);
+
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+	}
+}
+
+void nf_conncount_destroy(struct net *net, unsigned int family,
+			  struct nf_conncount_data *data)
+{
+	unsigned int i;
+
+	nf_ct_netns_put(net, family);
+
+	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
+		destroy_tree(&data->root[i]);
+
+	kfree(data);
+}
+EXPORT_SYMBOL_GPL(nf_conncount_destroy);
+
+static int __init nf_conncount_modinit(void)
+{
+	int i;
+
+	BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS);
+	BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0);
+
+	for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i)
+		spin_lock_init(&nf_conncount_locks[i]);
+
+	conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
+					   sizeof(struct nf_conncount_tuple),
+					   0, 0, NULL);
+	if (!conncount_conn_cachep)
+		return -ENOMEM;
+
+	conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
+					   sizeof(struct nf_conncount_rb),
+					   0, 0, NULL);
+	if (!conncount_rb_cachep) {
+		kmem_cache_destroy(conncount_conn_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit nf_conncount_modexit(void)
+{
+	kmem_cache_destroy(conncount_conn_cachep);
+	kmem_cache_destroy(conncount_rb_cachep);
+}
+
+module_init(nf_conncount_modinit);
+module_exit(nf_conncount_modexit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index a6214f235333..b1b17b9353e1 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -12,292 +12,30 @@
  * GPL (C) 1999  Rusty Russell (rusty@rustcorp.com.au).
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/jhash.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/rbtree.h>
+
 #include <linux/module.h>
-#include <linux/random.h>
 #include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_connlimit.h>
+
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
-
-#define CONNLIMIT_SLOTS		256U
-
-#ifdef CONFIG_LOCKDEP
-#define CONNLIMIT_LOCK_SLOTS	8U
-#else
-#define CONNLIMIT_LOCK_SLOTS	256U
-#endif
-
-#define CONNLIMIT_GC_MAX_NODES	8
-
-/* we will save the tuples of all connections we care about */
-struct xt_connlimit_conn {
-	struct hlist_node		node;
-	struct nf_conntrack_tuple	tuple;
-};
-
-struct xt_connlimit_rb {
-	struct rb_node node;
-	struct hlist_head hhead; /* connections/hosts in same subnet */
-	union nf_inet_addr addr; /* search key */
-};
-
-static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
-
-struct xt_connlimit_data {
-	struct rb_root climit_root[CONNLIMIT_SLOTS];
-};
-
-static u_int32_t connlimit_rnd __read_mostly;
-static struct kmem_cache *connlimit_rb_cachep __read_mostly;
-static struct kmem_cache *connlimit_conn_cachep __read_mostly;
-
-static inline unsigned int connlimit_iphash(__be32 addr)
-{
-	return jhash_1word((__force __u32)addr,
-			    connlimit_rnd) % CONNLIMIT_SLOTS;
-}
-
-static inline unsigned int
-connlimit_iphash6(const union nf_inet_addr *addr)
-{
-	return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6),
-		       connlimit_rnd) % CONNLIMIT_SLOTS;
-}
-
-static inline bool already_closed(const struct nf_conn *conn)
-{
-	if (nf_ct_protonum(conn) == IPPROTO_TCP)
-		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
-		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
-	else
-		return 0;
-}
-
-static int
-same_source(const union nf_inet_addr *addr,
-	    const union nf_inet_addr *u3, u_int8_t family)
-{
-	if (family == NFPROTO_IPV4)
-		return ntohl(addr->ip) - ntohl(u3->ip);
-
-	return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6));
-}
-
-static bool add_hlist(struct hlist_head *head,
-		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr)
-{
-	struct xt_connlimit_conn *conn;
-
-	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL)
-		return false;
-	conn->tuple = *tuple;
-	hlist_add_head(&conn->node, head);
-	return true;
-}
-
-static unsigned int check_hlist(struct net *net,
-				struct hlist_head *head,
-				const struct nf_conntrack_tuple *tuple,
-				const struct nf_conntrack_zone *zone,
-				bool *addit)
-{
-	const struct nf_conntrack_tuple_hash *found;
-	struct xt_connlimit_conn *conn;
-	struct hlist_node *n;
-	struct nf_conn *found_ct;
-	unsigned int length = 0;
-
-	*addit = true;
-
-	/* check the saved connections */
-	hlist_for_each_entry_safe(conn, n, head, node) {
-		found = nf_conntrack_find_get(net, zone, &conn->tuple);
-		if (found == NULL) {
-			hlist_del(&conn->node);
-			kmem_cache_free(connlimit_conn_cachep, conn);
-			continue;
-		}
-
-		found_ct = nf_ct_tuplehash_to_ctrack(found);
-
-		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
-			/*
-			 * Just to be sure we have it only once in the list.
-			 * We should not see tuples twice unless someone hooks
-			 * this into a table without "-p tcp --syn".
-			 */
-			*addit = false;
-		} else if (already_closed(found_ct)) {
-			/*
-			 * we do not care about connections which are
-			 * closed already -> ditch it
-			 */
-			nf_ct_put(found_ct);
-			hlist_del(&conn->node);
-			kmem_cache_free(connlimit_conn_cachep, conn);
-			continue;
-		}
-
-		nf_ct_put(found_ct);
-		length++;
-	}
-
-	return length;
-}
-
-static void tree_nodes_free(struct rb_root *root,
-			    struct xt_connlimit_rb *gc_nodes[],
-			    unsigned int gc_count)
-{
-	struct xt_connlimit_rb *rbconn;
-
-	while (gc_count) {
-		rbconn = gc_nodes[--gc_count];
-		rb_erase(&rbconn->node, root);
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-	}
-}
-
-static unsigned int
-count_tree(struct net *net, struct rb_root *root,
-	   const struct nf_conntrack_tuple *tuple,
-	   const union nf_inet_addr *addr,
-	   u8 family, const struct nf_conntrack_zone *zone)
-{
-	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
-	struct rb_node **rbnode, *parent;
-	struct xt_connlimit_rb *rbconn;
-	struct xt_connlimit_conn *conn;
-	unsigned int gc_count;
-	bool no_gc = false;
-
- restart:
-	gc_count = 0;
-	parent = NULL;
-	rbnode = &(root->rb_node);
-	while (*rbnode) {
-		int diff;
-		bool addit;
-
-		rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
-
-		parent = *rbnode;
-		diff = same_source(addr, &rbconn->addr, family);
-		if (diff < 0) {
-			rbnode = &((*rbnode)->rb_left);
-		} else if (diff > 0) {
-			rbnode = &((*rbnode)->rb_right);
-		} else {
-			/* same source network -> be counted! */
-			unsigned int count;
-			count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
-
-			tree_nodes_free(root, gc_nodes, gc_count);
-			if (!addit)
-				return count;
-
-			if (!add_hlist(&rbconn->hhead, tuple, addr))
-				return 0; /* hotdrop */
-
-			return count + 1;
-		}
-
-		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
-			continue;
-
-		/* only used for GC on hhead, retval and 'addit' ignored */
-		check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
-		if (hlist_empty(&rbconn->hhead))
-			gc_nodes[gc_count++] = rbconn;
-	}
-
-	if (gc_count) {
-		no_gc = true;
-		tree_nodes_free(root, gc_nodes, gc_count);
-		/* tree_node_free before new allocation permits
-		 * allocator to re-use newly free'd object.
-		 *
-		 * This is a rare event; in most cases we will find
-		 * existing node to re-use. (or gc_count is 0).
-		 */
-		goto restart;
-	}
-
-	/* no match, need to insert new node */
-	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
-	if (rbconn == NULL)
-		return 0;
-
-	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL) {
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-		return 0;
-	}
-
-	conn->tuple = *tuple;
-	rbconn->addr = *addr;
-
-	INIT_HLIST_HEAD(&rbconn->hhead);
-	hlist_add_head(&conn->node, &rbconn->hhead);
-
-	rb_link_node(&rbconn->node, parent, rbnode);
-	rb_insert_color(&rbconn->node, root);
-	return 1;
-}
-
-static int count_them(struct net *net,
-		      struct xt_connlimit_data *data,
-		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr,
-		      u_int8_t family,
-		      const struct nf_conntrack_zone *zone)
-{
-	struct rb_root *root;
-	int count;
-	u32 hash;
-
-	if (family == NFPROTO_IPV6)
-		hash = connlimit_iphash6(addr);
-	else
-		hash = connlimit_iphash(addr->ip);
-	root = &data->climit_root[hash];
-
-	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
-
-	count = count_tree(net, root, tuple, addr, family, zone);
-
-	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
-
-	return count;
-}
+#include <net/netfilter/nf_conntrack_count.h>
 
 static bool
 connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct net *net = xt_net(par);
 	const struct xt_connlimit_info *info = par->matchinfo;
-	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 	unsigned int connections;
+	u32 key[5];
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct != NULL) {
@@ -310,6 +48,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	if (xt_family(par) == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		union nf_inet_addr addr;
 		unsigned int i;
 
 		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
@@ -317,22 +56,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 		for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i)
 			addr.ip6[i] &= info->mask.ip6[i];
+		memcpy(key, &addr, sizeof(addr.ip6));
+		key[4] = zone->id;
 	} else {
 		const struct iphdr *iph = ip_hdr(skb);
-		addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+		key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
 			  iph->daddr : iph->saddr;
 
-		addr.ip &= info->mask.ip;
+		key[0] &= info->mask.ip;
+		key[1] = zone->id;
 	}
 
-	connections = count_them(net, info->data, tuple_ptr, &addr,
-				 xt_family(par), zone);
+	connections = nf_conncount_count(net, info->data, key,
+					 xt_family(par), tuple_ptr, zone);
 	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
 
-	return (connections > info->limit) ^
-	       !!(info->flags & XT_CONNLIMIT_INVERT);
+	return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
 
  hotdrop:
 	par->hotdrop = true;
@@ -342,61 +83,27 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 static int connlimit_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_connlimit_info *info = par->matchinfo;
-	unsigned int i;
-	int ret;
+	unsigned int keylen;
 
-	net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
-
-	ret = nf_ct_netns_get(par->net, par->family);
-	if (ret < 0) {
-		pr_info("cannot load conntrack support for "
-			"address family %u\n", par->family);
-		return ret;
-	}
+	keylen = sizeof(u32);
+	if (par->family == NFPROTO_IPV6)
+		keylen += sizeof(struct in6_addr);
+	else
+		keylen += sizeof(struct in_addr);
 
 	/* init private data */
-	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
-	if (info->data == NULL) {
-		nf_ct_netns_put(par->net, par->family);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i)
-		info->data->climit_root[i] = RB_ROOT;
+	info->data = nf_conncount_init(par->net, par->family, keylen);
+	if (IS_ERR(info->data))
+		return PTR_ERR(info->data);
 
 	return 0;
 }
 
-static void destroy_tree(struct rb_root *r)
-{
-	struct xt_connlimit_conn *conn;
-	struct xt_connlimit_rb *rbconn;
-	struct hlist_node *n;
-	struct rb_node *node;
-
-	while ((node = rb_first(r)) != NULL) {
-		rbconn = rb_entry(node, struct xt_connlimit_rb, node);
-
-		rb_erase(node, r);
-
-		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
-			kmem_cache_free(connlimit_conn_cachep, conn);
-
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-	}
-}
-
 static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	const struct xt_connlimit_info *info = par->matchinfo;
-	unsigned int i;
-
-	nf_ct_netns_put(par->net, par->family);
-
-	for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i)
-		destroy_tree(&info->data->climit_root[i]);
 
-	kfree(info->data);
+	nf_conncount_destroy(par->net, par->family, info->data);
 }
 
 static struct xt_match connlimit_mt_reg __read_mostly = {
@@ -413,40 +120,12 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 
 static int __init connlimit_mt_init(void)
 {
-	int ret, i;
-
-	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
-	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
-
-	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
-		spin_lock_init(&xt_connlimit_locks[i]);
-
-	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
-					   sizeof(struct xt_connlimit_conn),
-					   0, 0, NULL);
-	if (!connlimit_conn_cachep)
-		return -ENOMEM;
-
-	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
-					   sizeof(struct xt_connlimit_rb),
-					   0, 0, NULL);
-	if (!connlimit_rb_cachep) {
-		kmem_cache_destroy(connlimit_conn_cachep);
-		return -ENOMEM;
-	}
-	ret = xt_register_match(&connlimit_mt_reg);
-	if (ret != 0) {
-		kmem_cache_destroy(connlimit_conn_cachep);
-		kmem_cache_destroy(connlimit_rb_cachep);
-	}
-	return ret;
+	return xt_register_match(&connlimit_mt_reg);
 }
 
 static void __exit connlimit_mt_exit(void)
 {
 	xt_unregister_match(&connlimit_mt_reg);
-	kmem_cache_destroy(connlimit_conn_cachep);
-	kmem_cache_destroy(connlimit_rb_cachep);
 }
 
 module_init(connlimit_mt_init);
-- 
cgit v1.2.3


From ef71fe27ec2f1607e38af160ab261a8d8ef8e121 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 27 Nov 2017 21:55:14 +0100
Subject: netfilter: move checksum indirection to struct nf_ipv6_ops

We cannot make a direct call to nf_ip6_checksum() because that would
result in autoloading the 'ipv6' module because of symbol dependencies.
Therefore, define the checksum indirection in nf_ipv6_ops, where it
really belongs.

For IPv4, we can indeed make a direct function call, which is faster,
given that IPv4 is built into the networking code by default. Still,
CONFIG_INET=n with CONFIG_NETFILTER=y is possible, so define an empty
inline stub for IPv4 in that case.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               | 19 +++----------------
 include/linux/netfilter_ipv4.h          | 10 ++++++++++
 include/linux/netfilter_ipv6.h          |  2 ++
 net/bridge/netfilter/nf_tables_bridge.c |  7 -------
 net/ipv4/netfilter.c                    |  1 -
 net/ipv6/netfilter.c                    |  4 ++--
 net/netfilter/Makefile                  |  2 +-
 net/netfilter/utils.c                   | 26 ++++++++++++++++++++++++++
 8 files changed, 44 insertions(+), 27 deletions(-)
 create mode 100644 net/netfilter/utils.c

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 85a0b0d599e6..4c4d38ef1a76 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -311,8 +311,6 @@ struct nf_queue_entry;
 
 struct nf_afinfo {
 	unsigned short	family;
-	__sum16		(*checksum)(struct sk_buff *skb, unsigned int hook,
-				    unsigned int dataoff, u_int8_t protocol);
 	__sum16		(*checksum_partial)(struct sk_buff *skb,
 					    unsigned int hook,
 					    unsigned int dataoff,
@@ -333,20 +331,9 @@ static inline const struct nf_afinfo *nf_get_afinfo(unsigned short family)
 	return rcu_dereference(nf_afinfo[family]);
 }
 
-static inline __sum16
-nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff,
-	    u_int8_t protocol, unsigned short family)
-{
-	const struct nf_afinfo *afinfo;
-	__sum16 csum = 0;
-
-	rcu_read_lock();
-	afinfo = nf_get_afinfo(family);
-	if (afinfo)
-		csum = afinfo->checksum(skb, hook, dataoff, protocol);
-	rcu_read_unlock();
-	return csum;
-}
+__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
+		    unsigned int dataoff, u_int8_t protocol,
+		    unsigned short family);
 
 static inline __sum16
 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 98c03b2462b5..c7deb78bac88 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -7,6 +7,16 @@
 #include <uapi/linux/netfilter_ipv4.h>
 
 int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type);
+
+#ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 		       unsigned int dataoff, u_int8_t protocol);
+#else
+static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+				     unsigned int dataoff, u_int8_t protocol)
+{
+	return 0;
+}
+#endif /* CONFIG_INET */
+
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 47c6b04c28c0..b136101b5cde 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -19,6 +19,8 @@ struct nf_ipv6_ops {
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
+	__sum16 (*checksum)(struct sk_buff *skb, unsigned int hook,
+			    unsigned int dataoff, u_int8_t protocol);
 };
 
 #ifdef CONFIG_NETFILTER
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 74260ffec74d..b850c17f02f5 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -106,12 +106,6 @@ static int nf_br_reroute(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
-static __sum16 nf_br_checksum(struct sk_buff *skb, unsigned int hook,
-			      unsigned int dataoff, u_int8_t protocol)
-{
-	return 0;
-}
-
 static __sum16 nf_br_checksum_partial(struct sk_buff *skb, unsigned int hook,
 				      unsigned int dataoff, unsigned int len,
 				      u_int8_t protocol)
@@ -127,7 +121,6 @@ static int nf_br_route(struct net *net, struct dst_entry **dst,
 
 static const struct nf_afinfo nf_br_afinfo = {
 	.family                 = AF_BRIDGE,
-	.checksum               = nf_br_checksum,
 	.checksum_partial       = nf_br_checksum_partial,
 	.route                  = nf_br_route,
 	.saveroute              = nf_br_saveroute,
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c0cc6aa8cfaa..2f7ffefd2732 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -188,7 +188,6 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst,
 
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
-	.checksum		= nf_ip_checksum,
 	.checksum_partial	= nf_ip_checksum_partial,
 	.route			= nf_ip_route,
 	.saveroute		= nf_ip_saveroute,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 39970e212ad5..db69c8af95aa 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -192,12 +192,12 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
 	.route_input    = ip6_route_input,
-	.fragment	= ip6_fragment
+	.fragment	= ip6_fragment,
+	.checksum	= nf_ip6_checksum,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
-	.checksum		= nf_ip6_checksum,
 	.checksum_partial	= nf_ip6_checksum_partial,
 	.route			= nf_ip6_route,
 	.saveroute		= nf_ip6_saveroute,
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 490a55e7166d..eec0c3b72926 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
 
 nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
new file mode 100644
index 000000000000..159a9cdcfe1e
--- /dev/null
+++ b/net/netfilter/utils.c
@@ -0,0 +1,26 @@
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+
+__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
+		    unsigned int dataoff, u_int8_t protocol,
+		    unsigned short family)
+{
+	const struct nf_ipv6_ops *v6ops;
+	__sum16 csum = 0;
+
+	switch (family) {
+	case AF_INET:
+		csum = nf_ip_checksum(skb, hook, dataoff, protocol);
+		break;
+	case AF_INET6:
+		v6ops = rcu_dereference(nf_ipv6_ops);
+		if (v6ops)
+			csum = v6ops->checksum(skb, hook, dataoff, protocol);
+		break;
+	}
+
+	return csum;
+}
+EXPORT_SYMBOL_GPL(nf_checksum);
-- 
cgit v1.2.3


From f7dcbe2f36a660140ecb286e15f502028d96ffdf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 20 Dec 2017 16:04:18 +0100
Subject: netfilter: move checksum_partial indirection to struct nf_ipv6_ops

We cannot make a direct call to nf_ip6_checksum_partial() because the
resulting symbol dependency would cause the 'ipv6' module to be
autoloaded.  Therefore, define the checksum_partial indirection in
nf_ipv6_ops, where it really belongs.

For IPv4, we can indeed make a direct function call, which is faster,
given that IPv4 is built into the networking code by default. Still,
CONFIG_INET=n with CONFIG_NETFILTER=y is possible, so define an empty
inline stub for IPv4 in that case.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               | 24 +++---------------------
 include/linux/netfilter_ipv4.h          | 11 +++++++++++
 include/linux/netfilter_ipv6.h          |  3 +++
 net/bridge/netfilter/nf_tables_bridge.c |  8 --------
 net/ipv4/netfilter.c                    |  8 ++++----
 net/ipv6/netfilter.c                    | 10 +++++-----
 net/netfilter/utils.c                   | 24 ++++++++++++++++++++++++
 7 files changed, 50 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 4c4d38ef1a76..70b238eff29f 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -311,11 +311,6 @@ struct nf_queue_entry;
 
 struct nf_afinfo {
 	unsigned short	family;
-	__sum16		(*checksum_partial)(struct sk_buff *skb,
-					    unsigned int hook,
-					    unsigned int dataoff,
-					    unsigned int len,
-					    u_int8_t protocol);
 	int		(*route)(struct net *net, struct dst_entry **dst,
 				 struct flowi *fl, bool strict);
 	void		(*saveroute)(const struct sk_buff *skb,
@@ -335,22 +330,9 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		    unsigned int dataoff, u_int8_t protocol,
 		    unsigned short family);
 
-static inline __sum16
-nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
-		    unsigned int dataoff, unsigned int len,
-		    u_int8_t protocol, unsigned short family)
-{
-	const struct nf_afinfo *afinfo;
-	__sum16 csum = 0;
-
-	rcu_read_lock();
-	afinfo = nf_get_afinfo(family);
-	if (afinfo)
-		csum = afinfo->checksum_partial(skb, hook, dataoff, len,
-						protocol);
-	rcu_read_unlock();
-	return csum;
-}
+__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
+			    unsigned int dataoff, unsigned int len,
+			    u_int8_t protocol, unsigned short family);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo);
 void nf_unregister_afinfo(const struct nf_afinfo *afinfo);
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index c7deb78bac88..811425ece8d5 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -11,12 +11,23 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type)
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 		       unsigned int dataoff, u_int8_t protocol);
+__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+			       unsigned int dataoff, unsigned int len,
+			       u_int8_t protocol);
 #else
 static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 				     unsigned int dataoff, u_int8_t protocol)
 {
 	return 0;
 }
+static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb,
+					     unsigned int hook,
+					     unsigned int dataoff,
+					     unsigned int len,
+					     u_int8_t protocol)
+{
+	return 0;
+}
 #endif /* CONFIG_INET */
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index b136101b5cde..29e8f1286584 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -21,6 +21,9 @@ struct nf_ipv6_ops {
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
 	__sum16 (*checksum)(struct sk_buff *skb, unsigned int hook,
 			    unsigned int dataoff, u_int8_t protocol);
+	__sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook,
+				    unsigned int dataoff, unsigned int len,
+				    u_int8_t protocol);
 };
 
 #ifdef CONFIG_NETFILTER
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index b850c17f02f5..b24ac11cacc9 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -106,13 +106,6 @@ static int nf_br_reroute(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
-static __sum16 nf_br_checksum_partial(struct sk_buff *skb, unsigned int hook,
-				      unsigned int dataoff, unsigned int len,
-				      u_int8_t protocol)
-{
-	return 0;
-}
-
 static int nf_br_route(struct net *net, struct dst_entry **dst,
 		       struct flowi *fl, bool strict __always_unused)
 {
@@ -121,7 +114,6 @@ static int nf_br_route(struct net *net, struct dst_entry **dst,
 
 static const struct nf_afinfo nf_br_afinfo = {
 	.family                 = AF_BRIDGE,
-	.checksum_partial       = nf_br_checksum_partial,
 	.route                  = nf_br_route,
 	.saveroute              = nf_br_saveroute,
 	.reroute                = nf_br_reroute,
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 2f7ffefd2732..010c75fddf7e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -155,9 +155,9 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 }
 EXPORT_SYMBOL(nf_ip_checksum);
 
-static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
-				      unsigned int dataoff, unsigned int len,
-				      u_int8_t protocol)
+__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+			       unsigned int dataoff, unsigned int len,
+			       u_int8_t protocol)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	__sum16 csum = 0;
@@ -175,6 +175,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 	}
 	return csum;
 }
+EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
 
 static int nf_ip_route(struct net *net, struct dst_entry **dst,
 		       struct flowi *fl, bool strict __always_unused)
@@ -188,7 +189,6 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst,
 
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
-	.checksum_partial	= nf_ip_checksum_partial,
 	.route			= nf_ip_route,
 	.saveroute		= nf_ip_saveroute,
 	.reroute		= nf_ip_reroute,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index db69c8af95aa..a57546ce54a6 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -190,15 +190,15 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 };
 
 static const struct nf_ipv6_ops ipv6ops = {
-	.chk_addr	= ipv6_chk_addr,
-	.route_input    = ip6_route_input,
-	.fragment	= ip6_fragment,
-	.checksum	= nf_ip6_checksum,
+	.chk_addr		= ipv6_chk_addr,
+	.route_input    	= ip6_route_input,
+	.fragment		= ip6_fragment,
+	.checksum		= nf_ip6_checksum,
+	.checksum_partial	= nf_ip6_checksum_partial,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
-	.checksum_partial	= nf_ip6_checksum_partial,
 	.route			= nf_ip6_route,
 	.saveroute		= nf_ip6_saveroute,
 	.reroute		= nf_ip6_reroute,
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 159a9cdcfe1e..ca6d8d62496d 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -24,3 +24,27 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 	return csum;
 }
 EXPORT_SYMBOL_GPL(nf_checksum);
+
+__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
+			    unsigned int dataoff, unsigned int len,
+			    u_int8_t protocol, unsigned short family)
+{
+	const struct nf_ipv6_ops *v6ops;
+	__sum16 csum = 0;
+
+	switch (family) {
+	case AF_INET:
+		csum = nf_ip_checksum_partial(skb, hook, dataoff, len,
+					      protocol);
+		break;
+	case AF_INET6:
+		v6ops = rcu_dereference(nf_ipv6_ops);
+		if (v6ops)
+			csum = v6ops->checksum_partial(skb, hook, dataoff, len,
+						       protocol);
+		break;
+	}
+
+	return csum;
+}
+EXPORT_SYMBOL_GPL(nf_checksum_partial);
-- 
cgit v1.2.3


From 7db9a51e0f9931446ed4231feb1040ed5134fc60 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 20 Dec 2017 16:12:55 +0100
Subject: netfilter: remove saveroute indirection in struct nf_afinfo

This is only used by nf_queue.c, and the function has no symbol
dependencies on IPv6; it only refers to structure layouts. Therefore,
we can replace it with a direct function call from where it belongs.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               |  2 --
 include/linux/netfilter_ipv4.h          | 10 ++++++++
 include/linux/netfilter_ipv6.h          |  9 +++++++
 net/bridge/netfilter/nf_tables_bridge.c |  6 -----
 net/ipv4/netfilter.c                    | 28 ----------------------
 net/ipv6/netfilter.c                    | 26 --------------------
 net/netfilter/nf_queue.c                | 42 ++++++++++++++++++++++++++++++++-
 7 files changed, 60 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 70b238eff29f..5fc2443225f9 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -313,8 +313,6 @@ struct nf_afinfo {
 	unsigned short	family;
 	int		(*route)(struct net *net, struct dst_entry **dst,
 				 struct flowi *fl, bool strict);
-	void		(*saveroute)(const struct sk_buff *skb,
-				     struct nf_queue_entry *entry);
 	int		(*reroute)(struct net *net, struct sk_buff *skb,
 				   const struct nf_queue_entry *entry);
 	int		route_key_size;
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 811425ece8d5..8d4ef1e3ce74 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -6,6 +6,16 @@
 
 #include <uapi/linux/netfilter_ipv4.h>
 
+/* Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+struct ip_rt_info {
+	__be32 daddr;
+	__be32 saddr;
+	u_int8_t tos;
+	u_int32_t mark;
+};
+
 int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type);
 
 #ifdef CONFIG_INET
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 29e8f1286584..08d58dc018b5 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -9,6 +9,15 @@
 
 #include <uapi/linux/netfilter_ipv6.h>
 
+/* Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+struct ip6_rt_info {
+	struct in6_addr daddr;
+	struct in6_addr saddr;
+	u_int32_t mark;
+};
+
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
  * if IPv6 is a module.
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index b24ac11cacc9..f38350c6bc29 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -95,11 +95,6 @@ static const struct nf_chain_type filter_bridge = {
 	},
 };
 
-static void nf_br_saveroute(const struct sk_buff *skb,
-			    struct nf_queue_entry *entry)
-{
-}
-
 static int nf_br_reroute(struct net *net, struct sk_buff *skb,
 			 const struct nf_queue_entry *entry)
 {
@@ -115,7 +110,6 @@ static int nf_br_route(struct net *net, struct dst_entry **dst,
 static const struct nf_afinfo nf_br_afinfo = {
 	.family                 = AF_BRIDGE,
 	.route                  = nf_br_route,
-	.saveroute              = nf_br_saveroute,
 	.reroute                = nf_br_reroute,
 	.route_key_size         = 0,
 };
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 010c75fddf7e..7878ae6c35b2 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -80,33 +80,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
-/*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
-
-struct ip_rt_info {
-	__be32 daddr;
-	__be32 saddr;
-	u_int8_t tos;
-	u_int32_t mark;
-};
-
-static void nf_ip_saveroute(const struct sk_buff *skb,
-			    struct nf_queue_entry *entry)
-{
-	struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
-	if (entry->state.hook == NF_INET_LOCAL_OUT) {
-		const struct iphdr *iph = ip_hdr(skb);
-
-		rt_info->tos = iph->tos;
-		rt_info->daddr = iph->daddr;
-		rt_info->saddr = iph->saddr;
-		rt_info->mark = skb->mark;
-	}
-}
-
 static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
 			 const struct nf_queue_entry *entry)
 {
@@ -190,7 +163,6 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst,
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
 	.route			= nf_ip_route,
-	.saveroute		= nf_ip_saveroute,
 	.reroute		= nf_ip_reroute,
 	.route_key_size		= sizeof(struct ip_rt_info),
 };
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index a57546ce54a6..6d1f470660db 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -68,31 +68,6 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip6_route_me_harder);
 
-/*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
-
-struct ip6_rt_info {
-	struct in6_addr daddr;
-	struct in6_addr saddr;
-	u_int32_t mark;
-};
-
-static void nf_ip6_saveroute(const struct sk_buff *skb,
-			     struct nf_queue_entry *entry)
-{
-	struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
-	if (entry->state.hook == NF_INET_LOCAL_OUT) {
-		const struct ipv6hdr *iph = ipv6_hdr(skb);
-
-		rt_info->daddr = iph->daddr;
-		rt_info->saddr = iph->saddr;
-		rt_info->mark = skb->mark;
-	}
-}
-
 static int nf_ip6_reroute(struct net *net, struct sk_buff *skb,
 			  const struct nf_queue_entry *entry)
 {
@@ -200,7 +175,6 @@ static const struct nf_ipv6_ops ipv6ops = {
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
 	.route			= nf_ip6_route,
-	.saveroute		= nf_ip6_saveroute,
 	.reroute		= nf_ip6_reroute,
 	.route_key_size		= sizeof(struct ip6_rt_info),
 };
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 0c02fdb7efc9..dfa35bd292c8 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -10,6 +10,8 @@
 #include <linux/proc_fs.h>
 #include <linux/skbuff.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_bridge.h>
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
@@ -108,6 +110,35 @@ void nf_queue_nf_hook_drop(struct net *net)
 }
 EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop);
 
+static void nf_ip_saveroute(const struct sk_buff *skb,
+			    struct nf_queue_entry *entry)
+{
+	struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+	if (entry->state.hook == NF_INET_LOCAL_OUT) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		rt_info->tos = iph->tos;
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+		rt_info->mark = skb->mark;
+	}
+}
+
+static void nf_ip6_saveroute(const struct sk_buff *skb,
+			     struct nf_queue_entry *entry)
+{
+	struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+	if (entry->state.hook == NF_INET_LOCAL_OUT) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+		rt_info->mark = skb->mark;
+	}
+}
+
 static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		      const struct nf_hook_entries *entries,
 		      unsigned int index, unsigned int queuenum)
@@ -144,7 +175,16 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 
 	nf_queue_entry_get_refs(entry);
 	skb_dst_force(skb);
-	afinfo->saveroute(skb, entry);
+
+	switch (entry->state.pf) {
+	case AF_INET:
+		nf_ip_saveroute(skb, entry);
+		break;
+	case AF_INET6:
+		nf_ip6_saveroute(skb, entry);
+		break;
+	}
+
 	status = qh->outfn(entry, queuenum);
 
 	if (status < 0) {
-- 
cgit v1.2.3


From 3f87c08c615f567799b426aff0341ea8010a0ebb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 27 Nov 2017 22:29:52 +0100
Subject: netfilter: move route indirection to struct nf_ipv6_ops

We cannot make a direct call to nf_ip6_route() because the resulting
symbol dependency would cause the 'ipv6' module to be autoloaded.
Therefore, define the route indirection in nf_ipv6_ops, where it really
belongs.

For IPv4, we can indeed make a direct function call, which is faster,
given that IPv4 is built into the networking code by default. Still,
CONFIG_INET=n with CONFIG_NETFILTER=y is possible, so define an empty
inline stub for IPv4 in that case.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               |  4 ++--
 include/linux/netfilter_ipv4.h          |  7 +++++++
 include/linux/netfilter_ipv6.h          |  2 ++
 net/bridge/netfilter/nf_tables_bridge.c |  7 -------
 net/ipv4/netfilter.c                    |  6 +++---
 net/ipv6/netfilter.c                    |  2 +-
 net/ipv6/netfilter/nft_fib_ipv6.c       | 12 +++++-------
 net/netfilter/nf_conntrack_h323_main.c  | 30 +++++++++++++++---------------
 net/netfilter/nft_rt.c                  | 15 +++++----------
 net/netfilter/utils.c                   | 21 +++++++++++++++++++++
 net/netfilter/xt_TCPMSS.c               |  5 +----
 net/netfilter/xt_addrtype.c             | 15 ++++++---------
 12 files changed, 68 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 5fc2443225f9..02c35eabd348 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -311,8 +311,6 @@ struct nf_queue_entry;
 
 struct nf_afinfo {
 	unsigned short	family;
-	int		(*route)(struct net *net, struct dst_entry **dst,
-				 struct flowi *fl, bool strict);
 	int		(*reroute)(struct net *net, struct sk_buff *skb,
 				   const struct nf_queue_entry *entry);
 	int		route_key_size;
@@ -331,6 +329,8 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			    unsigned int dataoff, unsigned int len,
 			    u_int8_t protocol, unsigned short family);
+int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+	     bool strict, unsigned short family);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo);
 void nf_unregister_afinfo(const struct nf_afinfo *afinfo);
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 8d4ef1e3ce74..2a4e2c415647 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -24,6 +24,8 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			       unsigned int dataoff, unsigned int len,
 			       u_int8_t protocol);
+int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+		bool strict);
 #else
 static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 				     unsigned int dataoff, u_int8_t protocol)
@@ -38,6 +40,11 @@ static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb,
 {
 	return 0;
 }
+static inline int nf_ip_route(struct net *net, struct dst_entry **dst,
+			      struct flowi *fl, bool strict)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INET */
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 08d58dc018b5..e5700bb314a1 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -33,6 +33,8 @@ struct nf_ipv6_ops {
 	__sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook,
 				    unsigned int dataoff, unsigned int len,
 				    u_int8_t protocol);
+	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
+		     bool strict);
 };
 
 #ifdef CONFIG_NETFILTER
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index f38350c6bc29..014b6571f2ac 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -101,15 +101,8 @@ static int nf_br_reroute(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
-static int nf_br_route(struct net *net, struct dst_entry **dst,
-		       struct flowi *fl, bool strict __always_unused)
-{
-	return 0;
-}
-
 static const struct nf_afinfo nf_br_afinfo = {
 	.family                 = AF_BRIDGE,
-	.route                  = nf_br_route,
 	.reroute                = nf_br_reroute,
 	.route_key_size         = 0,
 };
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 7878ae6c35b2..e9d47e4ec182 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -150,8 +150,8 @@ __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 }
 EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
 
-static int nf_ip_route(struct net *net, struct dst_entry **dst,
-		       struct flowi *fl, bool strict __always_unused)
+int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+		bool strict __always_unused)
 {
 	struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
 	if (IS_ERR(rt))
@@ -159,10 +159,10 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst,
 	*dst = &rt->dst;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nf_ip_route);
 
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
-	.route			= nf_ip_route,
 	.reroute		= nf_ip_reroute,
 	.route_key_size		= sizeof(struct ip_rt_info),
 };
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 6d1f470660db..f03bb043e4e4 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -170,11 +170,11 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.fragment		= ip6_fragment,
 	.checksum		= nf_ip6_checksum,
 	.checksum_partial	= nf_ip6_checksum_partial,
+	.route			= nf_ip6_route,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
-	.route			= nf_ip6_route,
 	.reroute		= nf_ip6_reroute,
 	.route_key_size		= sizeof(struct ip6_rt_info),
 };
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 54b5899543ef..cc5174c7254c 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -60,7 +60,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
 {
 	const struct net_device *dev = NULL;
 	const struct nf_ipv6_ops *v6ops;
-	const struct nf_afinfo *afinfo;
 	int route_err, addrtype;
 	struct rt6_info *rt;
 	struct flowi6 fl6 = {
@@ -69,8 +68,8 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
 	};
 	u32 ret = 0;
 
-	afinfo = nf_get_afinfo(NFPROTO_IPV6);
-	if (!afinfo)
+	v6ops = nf_get_ipv6_ops();
+	if (!v6ops)
 		return RTN_UNREACHABLE;
 
 	if (priv->flags & NFTA_FIB_F_IIF)
@@ -80,12 +79,11 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
 
 	nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
 
-	v6ops = nf_get_ipv6_ops();
-	if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
+	if (dev && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
 		ret = RTN_LOCAL;
 
-	route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt,
-				  flowi6_to_flowi(&fl6), false);
+	route_err = v6ops->route(nft_net(pkt), (struct dst_entry **)&rt,
+				 flowi6_to_flowi(&fl6), false);
 	if (route_err)
 		goto err;
 
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 7f0e0f66e488..005589c6d0f6 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -24,6 +24,7 @@
 #include <linux/skbuff.h>
 #include <net/route.h>
 #include <net/ip6_route.h>
+#include <linux/netfilter_ipv6.h>
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -721,14 +722,8 @@ static int callforward_do_filter(struct net *net,
 				 const union nf_inet_addr *dst,
 				 u_int8_t family)
 {
-	const struct nf_afinfo *afinfo;
 	int ret = 0;
 
-	/* rcu_read_lock()ed by nf_hook_thresh */
-	afinfo = nf_get_afinfo(family);
-	if (!afinfo)
-		return 0;
-
 	switch (family) {
 	case AF_INET: {
 		struct flowi4 fl1, fl2;
@@ -739,10 +734,10 @@ static int callforward_do_filter(struct net *net,
 
 		memset(&fl2, 0, sizeof(fl2));
 		fl2.daddr = dst->ip;
-		if (!afinfo->route(net, (struct dst_entry **)&rt1,
-				   flowi4_to_flowi(&fl1), false)) {
-			if (!afinfo->route(net, (struct dst_entry **)&rt2,
-					   flowi4_to_flowi(&fl2), false)) {
+		if (!nf_ip_route(net, (struct dst_entry **)&rt1,
+				 flowi4_to_flowi(&fl1), false)) {
+			if (!nf_ip_route(net, (struct dst_entry **)&rt2,
+					 flowi4_to_flowi(&fl2), false)) {
 				if (rt_nexthop(rt1, fl1.daddr) ==
 				    rt_nexthop(rt2, fl2.daddr) &&
 				    rt1->dst.dev  == rt2->dst.dev)
@@ -755,18 +750,23 @@ static int callforward_do_filter(struct net *net,
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
 	case AF_INET6: {
-		struct flowi6 fl1, fl2;
+		const struct nf_ipv6_ops *v6ops;
 		struct rt6_info *rt1, *rt2;
+		struct flowi6 fl1, fl2;
+
+		v6ops = nf_get_ipv6_ops();
+		if (!v6ops)
+			return 0;
 
 		memset(&fl1, 0, sizeof(fl1));
 		fl1.daddr = src->in6;
 
 		memset(&fl2, 0, sizeof(fl2));
 		fl2.daddr = dst->in6;
-		if (!afinfo->route(net, (struct dst_entry **)&rt1,
-				   flowi6_to_flowi(&fl1), false)) {
-			if (!afinfo->route(net, (struct dst_entry **)&rt2,
-					   flowi6_to_flowi(&fl2), false)) {
+		if (!v6ops->route(net, (struct dst_entry **)&rt1,
+				  flowi6_to_flowi(&fl1), false)) {
+			if (!v6ops->route(net, (struct dst_entry **)&rt2,
+					  flowi6_to_flowi(&fl2), false)) {
 				if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
 						    rt6_nexthop(rt2, &fl2.daddr)) &&
 				    rt1->dst.dev == rt2->dst.dev)
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index a6b7d05aeacf..11a2071b6dd4 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -27,7 +27,7 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb
 {
 	u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst);
 	const struct sk_buff *skb = pkt->skb;
-	const struct nf_afinfo *ai;
+	struct dst_entry *dst = NULL;
 	struct flowi fl;
 
 	memset(&fl, 0, sizeof(fl));
@@ -43,15 +43,10 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb
 		break;
 	}
 
-	ai = nf_get_afinfo(nft_pf(pkt));
-	if (ai) {
-		struct dst_entry *dst = NULL;
-
-		ai->route(nft_net(pkt), &dst, &fl, false);
-		if (dst) {
-			mtu = min(mtu, dst_mtu(dst));
-			dst_release(dst);
-		}
+	nf_route(nft_net(pkt), &dst, &fl, false, nft_pf(pkt));
+	if (dst) {
+		mtu = min(mtu, dst_mtu(dst));
+		dst_release(dst);
 	}
 
 	if (mtu <= minlen || mtu > 0xffff)
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index ca6d8d62496d..45c22418c955 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -48,3 +48,24 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 	return csum;
 }
 EXPORT_SYMBOL_GPL(nf_checksum_partial);
+
+int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+	     bool strict, unsigned short family)
+{
+	const struct nf_ipv6_ops *v6ops;
+	int ret = 0;
+
+	switch (family) {
+	case AF_INET:
+		ret = nf_ip_route(net, dst, fl, strict);
+		break;
+	case AF_INET6:
+		v6ops = rcu_dereference(nf_ipv6_ops);
+		if (v6ops)
+			ret = v6ops->route(net, dst, fl, strict);
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_route);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 9dae4d665965..99bb8e410f22 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -48,7 +48,6 @@ static u_int32_t tcpmss_reverse_mtu(struct net *net,
 				    unsigned int family)
 {
 	struct flowi fl;
-	const struct nf_afinfo *ai;
 	struct rtable *rt = NULL;
 	u_int32_t mtu     = ~0U;
 
@@ -62,10 +61,8 @@ static u_int32_t tcpmss_reverse_mtu(struct net *net,
 		memset(fl6, 0, sizeof(*fl6));
 		fl6->daddr = ipv6_hdr(skb)->saddr;
 	}
-	ai = nf_get_afinfo(family);
-	if (ai != NULL)
-		ai->route(net, (struct dst_entry **)&rt, &fl, false);
 
+	nf_route(net, (struct dst_entry **)&rt, &fl, false, family);
 	if (rt != NULL) {
 		mtu = dst_mtu(&rt->dst);
 		dst_release(&rt->dst);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 3b2be2ae6987..911a7c0da504 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -36,7 +36,7 @@ MODULE_ALIAS("ip6t_addrtype");
 static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
 			    const struct in6_addr *addr, u16 mask)
 {
-	const struct nf_afinfo *afinfo;
+	const struct nf_ipv6_ops *v6ops;
 	struct flowi6 flow;
 	struct rt6_info *rt;
 	u32 ret = 0;
@@ -47,17 +47,14 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
 	if (dev)
 		flow.flowi6_oif = dev->ifindex;
 
-	afinfo = nf_get_afinfo(NFPROTO_IPV6);
-	if (afinfo != NULL) {
-		const struct nf_ipv6_ops *v6ops;
-
+	v6ops = nf_get_ipv6_ops();
+	if (v6ops) {
 		if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
-			v6ops = nf_get_ipv6_ops();
-			if (v6ops && v6ops->chk_addr(net, addr, dev, true))
+			if (v6ops->chk_addr(net, addr, dev, true))
 				ret = XT_ADDRTYPE_LOCAL;
 		}
-		route_err = afinfo->route(net, (struct dst_entry **)&rt,
-					  flowi6_to_flowi(&flow), false);
+		route_err = v6ops->route(net, (struct dst_entry **)&rt,
+					 flowi6_to_flowi(&flow), false);
 	} else {
 		route_err = 1;
 	}
-- 
cgit v1.2.3


From ce388f452f0af2013c657dd24be4415d94e7704f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 27 Nov 2017 22:50:26 +0100
Subject: netfilter: move reroute indirection to struct nf_ipv6_ops

We cannot make a direct call to nf_ip6_reroute() because that would result
in autoloading the 'ipv6' module due to symbol dependencies.
Therefore, define the reroute indirection in nf_ipv6_ops, where it really
belongs.

For IPv4, we can indeed make a direct function call, which is faster,
given that IPv4 is built into the networking code by default. Still,
CONFIG_INET=n with CONFIG_NETFILTER=y is possible, so define an empty
inline stub for IPv4 in that case.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               |  3 +--
 include/linux/netfilter_ipv4.h          |  8 ++++++++
 include/linux/netfilter_ipv6.h          |  3 +++
 net/bridge/netfilter/nf_tables_bridge.c |  7 -------
 net/ipv4/netfilter.c                    |  8 ++++----
 net/ipv6/netfilter.c                    |  6 +++---
 net/netfilter/nf_queue.c                |  4 +---
 net/netfilter/utils.c                   | 19 +++++++++++++++++++
 8 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 02c35eabd348..8358e717eabd 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -311,8 +311,6 @@ struct nf_queue_entry;
 
 struct nf_afinfo {
 	unsigned short	family;
-	int		(*reroute)(struct net *net, struct sk_buff *skb,
-				   const struct nf_queue_entry *entry);
 	int		route_key_size;
 };
 
@@ -331,6 +329,7 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			    u_int8_t protocol, unsigned short family);
 int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 	     bool strict, unsigned short family);
+int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo);
 void nf_unregister_afinfo(const struct nf_afinfo *afinfo);
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 2a4e2c415647..b31dabfdb453 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -18,6 +18,8 @@ struct ip_rt_info {
 
 int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type);
 
+struct nf_queue_entry;
+
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 		       unsigned int dataoff, u_int8_t protocol);
@@ -26,6 +28,7 @@ __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			       u_int8_t protocol);
 int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		bool strict);
+int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry);
 #else
 static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 				     unsigned int dataoff, u_int8_t protocol)
@@ -45,6 +48,11 @@ static inline int nf_ip_route(struct net *net, struct dst_entry **dst,
 {
 	return -EOPNOTSUPP;
 }
+static inline int nf_ip_reroute(struct sk_buff *skb,
+				const struct nf_queue_entry *entry)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INET */
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index e5700bb314a1..288c597e75b3 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -18,6 +18,8 @@ struct ip6_rt_info {
 	u_int32_t mark;
 };
 
+struct nf_queue_entry;
+
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
  * if IPv6 is a module.
@@ -35,6 +37,7 @@ struct nf_ipv6_ops {
 				    u_int8_t protocol);
 	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		     bool strict);
+	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
 };
 
 #ifdef CONFIG_NETFILTER
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 014b6571f2ac..e7348b49bc0d 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -95,15 +95,8 @@ static const struct nf_chain_type filter_bridge = {
 	},
 };
 
-static int nf_br_reroute(struct net *net, struct sk_buff *skb,
-			 const struct nf_queue_entry *entry)
-{
-	return 0;
-}
-
 static const struct nf_afinfo nf_br_afinfo = {
 	.family                 = AF_BRIDGE,
-	.reroute                = nf_br_reroute,
 	.route_key_size         = 0,
 };
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e9d47e4ec182..ec73be3b2f14 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -80,8 +80,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
-static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
-			 const struct nf_queue_entry *entry)
+int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
 {
 	const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
 
@@ -92,10 +91,12 @@ static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
 		      skb->mark == rt_info->mark &&
 		      iph->daddr == rt_info->daddr &&
 		      iph->saddr == rt_info->saddr))
-			return ip_route_me_harder(net, skb, RTN_UNSPEC);
+			return ip_route_me_harder(entry->state.net, skb,
+						  RTN_UNSPEC);
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nf_ip_reroute);
 
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 			    unsigned int dataoff, u_int8_t protocol)
@@ -163,7 +164,6 @@ EXPORT_SYMBOL_GPL(nf_ip_route);
 
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
-	.reroute		= nf_ip_reroute,
 	.route_key_size		= sizeof(struct ip_rt_info),
 };
 
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index f03bb043e4e4..d633b78be06f 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -68,7 +68,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip6_route_me_harder);
 
-static int nf_ip6_reroute(struct net *net, struct sk_buff *skb,
+static int nf_ip6_reroute(struct sk_buff *skb,
 			  const struct nf_queue_entry *entry)
 {
 	struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -78,7 +78,7 @@ static int nf_ip6_reroute(struct net *net, struct sk_buff *skb,
 		if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
 		    !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
 		    skb->mark != rt_info->mark)
-			return ip6_route_me_harder(net, skb);
+			return ip6_route_me_harder(entry->state.net, skb);
 	}
 	return 0;
 }
@@ -171,11 +171,11 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.checksum		= nf_ip6_checksum,
 	.checksum_partial	= nf_ip6_checksum_partial,
 	.route			= nf_ip6_route,
+	.reroute		= nf_ip6_reroute,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
-	.reroute		= nf_ip6_reroute,
 	.route_key_size		= sizeof(struct ip6_rt_info),
 };
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index dfa35bd292c8..15382ff83e7a 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -266,7 +266,6 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 	const struct nf_hook_entry *hook_entry;
 	const struct nf_hook_entries *hooks;
 	struct sk_buff *skb = entry->skb;
-	const struct nf_afinfo *afinfo;
 	const struct net *net;
 	unsigned int i;
 	int err;
@@ -293,8 +292,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
 
 	if (verdict == NF_ACCEPT) {
-		afinfo = nf_get_afinfo(entry->state.pf);
-		if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0)
+		if (nf_reroute(skb, entry) < 0)
 			verdict = NF_DROP;
 	}
 
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 45c22418c955..0b660c568156 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -2,6 +2,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_queue.h>
 
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		    unsigned int dataoff, u_int8_t protocol,
@@ -69,3 +70,21 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_route);
+
+int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+	const struct nf_ipv6_ops *v6ops;
+	int ret = 0;
+
+	switch (entry->state.pf) {
+	case AF_INET:
+		ret = nf_ip_reroute(skb, entry);
+		break;
+	case AF_INET6:
+		v6ops = rcu_dereference(nf_ipv6_ops);
+		if (v6ops)
+			ret = v6ops->reroute(skb, entry);
+		break;
+	}
+	return ret;
+}
-- 
cgit v1.2.3


From 464356234f88518f7d0678b979013e78607e8266 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 27 Nov 2017 22:58:37 +0100
Subject: netfilter: remove route_key_size field in struct nf_afinfo

This is only needed by nf_queue, so place this code where it belongs.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h |  1 -
 net/ipv4/netfilter.c      |  1 -
 net/ipv6/netfilter.c      |  1 -
 net/netfilter/nf_queue.c  | 22 ++++++++++++++++------
 4 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 8358e717eabd..58a169e425f7 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -311,7 +311,6 @@ struct nf_queue_entry;
 
 struct nf_afinfo {
 	unsigned short	family;
-	int		route_key_size;
 };
 
 extern const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO];
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ec73be3b2f14..bd8e161c2f95 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -164,7 +164,6 @@ EXPORT_SYMBOL_GPL(nf_ip_route);
 
 static const struct nf_afinfo nf_ip_afinfo = {
 	.family			= AF_INET,
-	.route_key_size		= sizeof(struct ip_rt_info),
 };
 
 static int __init ipv4_netfilter_init(void)
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d633b78be06f..18f82a3fb0c0 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -176,7 +176,6 @@ static const struct nf_ipv6_ops ipv6ops = {
 
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
-	.route_key_size		= sizeof(struct ip6_rt_info),
 };
 
 int __init ipv6_netfilter_init(void)
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 15382ff83e7a..7f55af5f3d1a 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -15,6 +15,8 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/protocol.h>
 #include <net/netfilter/nf_queue.h>
 #include <net/dst.h>
@@ -145,9 +147,9 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 {
 	int status = -ENOENT;
 	struct nf_queue_entry *entry = NULL;
-	const struct nf_afinfo *afinfo;
 	const struct nf_queue_handler *qh;
 	struct net *net = state->net;
+	unsigned int route_key_size;
 
 	/* QUEUE == DROP if no one is waiting, to be safe. */
 	qh = rcu_dereference(net->nf.queue_handler);
@@ -156,11 +158,19 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		goto err;
 	}
 
-	afinfo = nf_get_afinfo(state->pf);
-	if (!afinfo)
-		goto err;
+	switch (state->pf) {
+	case AF_INET:
+		route_key_size = sizeof(struct ip_rt_info);
+		break;
+	case AF_INET6:
+		route_key_size = sizeof(struct ip6_rt_info);
+		break;
+	default:
+		route_key_size = 0;
+		break;
+	}
 
-	entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
+	entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
 	if (!entry) {
 		status = -ENOMEM;
 		goto err;
@@ -170,7 +180,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		.skb	= skb,
 		.state	= *state,
 		.hook_index = index,
-		.size	= sizeof(*entry) + afinfo->route_key_size,
+		.size	= sizeof(*entry) + route_key_size,
 	};
 
 	nf_queue_entry_get_refs(entry);
-- 
cgit v1.2.3


From b3a61254d83d577f8a44b86a5e68bc124011336a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 9 Dec 2017 17:05:53 +0100
Subject: netfilter: remove struct nf_afinfo and its helper functions

This abstraction has no clients anymore, remove it.

This is what remains from previous authors, so correct copyright
statement after recent modifications and code removal.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h               | 13 -------------
 net/bridge/netfilter/nf_tables_bridge.c | 16 ++++------------
 net/ipv4/netfilter.c                    | 10 ----------
 net/ipv6/netfilter.c                    |  7 +------
 net/netfilter/core.c                    | 25 +------------------------
 5 files changed, 6 insertions(+), 65 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 58a169e425f7..85a1a0b32c66 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -309,16 +309,6 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len);
 struct flowi;
 struct nf_queue_entry;
 
-struct nf_afinfo {
-	unsigned short	family;
-};
-
-extern const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO];
-static inline const struct nf_afinfo *nf_get_afinfo(unsigned short family)
-{
-	return rcu_dereference(nf_afinfo[family]);
-}
-
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		    unsigned int dataoff, u_int8_t protocol,
 		    unsigned short family);
@@ -330,9 +320,6 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 	     bool strict, unsigned short family);
 int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);
 
-int nf_register_afinfo(const struct nf_afinfo *afinfo);
-void nf_unregister_afinfo(const struct nf_afinfo *afinfo);
-
 #include <net/flow.h>
 extern void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index e7348b49bc0d..86774b5c3b73 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -95,30 +95,23 @@ static const struct nf_chain_type filter_bridge = {
 	},
 };
 
-static const struct nf_afinfo nf_br_afinfo = {
-	.family                 = AF_BRIDGE,
-	.route_key_size         = 0,
-};
-
 static int __init nf_tables_bridge_init(void)
 {
 	int ret;
 
-	nf_register_afinfo(&nf_br_afinfo);
 	ret = nft_register_chain_type(&filter_bridge);
 	if (ret < 0)
-		goto err1;
+		return ret;
 
 	ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
 	if (ret < 0)
-		goto err2;
+		goto err_register_subsys;
 
 	return ret;
 
-err2:
+err_register_subsys:
 	nft_unregister_chain_type(&filter_bridge);
-err1:
-	nf_unregister_afinfo(&nf_br_afinfo);
+
 	return ret;
 }
 
@@ -126,7 +119,6 @@ static void __exit nf_tables_bridge_exit(void)
 {
 	unregister_pernet_subsys(&nf_tables_bridge_net_ops);
 	nft_unregister_chain_type(&filter_bridge);
-	nf_unregister_afinfo(&nf_br_afinfo);
 }
 
 module_init(nf_tables_bridge_init);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index bd8e161c2f95..e6774ccb7731 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -161,13 +161,3 @@ int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_ip_route);
-
-static const struct nf_afinfo nf_ip_afinfo = {
-	.family			= AF_INET,
-};
-
-static int __init ipv4_netfilter_init(void)
-{
-	return nf_register_afinfo(&nf_ip_afinfo);
-}
-subsys_initcall(ipv4_netfilter_init);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 18f82a3fb0c0..d95ceca7ff8f 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -174,14 +174,10 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.reroute		= nf_ip6_reroute,
 };
 
-static const struct nf_afinfo nf_ip6_afinfo = {
-	.family			= AF_INET6,
-};
-
 int __init ipv6_netfilter_init(void)
 {
 	RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops);
-	return nf_register_afinfo(&nf_ip6_afinfo);
+	return 0;
 }
 
 /* This can be called from inet6_init() on errors, so it cannot
@@ -190,5 +186,4 @@ int __init ipv6_netfilter_init(void)
 void ipv6_netfilter_fini(void)
 {
 	RCU_INIT_POINTER(nf_ipv6_ops, NULL);
-	nf_unregister_afinfo(&nf_ip6_afinfo);
 }
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 606efc9b14e1..997dd387d259 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -4,8 +4,7 @@
  * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
  * way.
  *
- * Rusty Russell (C)2000 -- This code is GPL.
- * Patrick McHardy (c) 2006-2012
+ * This code is GPL.
  */
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
@@ -28,34 +27,12 @@
 
 #include "nf_internals.h"
 
-static DEFINE_MUTEX(afinfo_mutex);
-
-const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
-EXPORT_SYMBOL(nf_afinfo);
 const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
 DEFINE_PER_CPU(bool, nf_skb_duplicated);
 EXPORT_SYMBOL_GPL(nf_skb_duplicated);
 
-int nf_register_afinfo(const struct nf_afinfo *afinfo)
-{
-	mutex_lock(&afinfo_mutex);
-	RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
-	mutex_unlock(&afinfo_mutex);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nf_register_afinfo);
-
-void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
-{
-	mutex_lock(&afinfo_mutex);
-	RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL);
-	mutex_unlock(&afinfo_mutex);
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
-
 #ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
-- 
cgit v1.2.3


From f6931f5f5b713705c3cc91e4f9c222f2b181e2ef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Dec 2017 16:18:16 +0100
Subject: netfilter: meta: secpath support

Add NFT_META_SECPATH as a replacement for iptables "-m policy --dir in --policy {ipsec,none}".

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_meta.c                 | 43 ++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a3ee277b17a1..2efbf9744c2a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -777,6 +777,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
  * @NFT_META_PRANDOM: a 32bit pseudo-random number
+ * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp)
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -804,6 +805,7 @@ enum nft_meta_keys {
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
 	NFT_META_PRANDOM,
+	NFT_META_SECPATH,
 };
 
 /**
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 5a60eb23a7ed..1a91e676f13e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -210,6 +210,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = prandom_u32_state(state);
 		break;
 	}
+#ifdef CONFIG_XFRM
+	case NFT_META_SECPATH:
+		nft_reg_store8(dest, !!skb->sp);
+		break;
+#endif
 	default:
 		WARN_ON(1);
 		goto err;
@@ -308,6 +313,11 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 		prandom_init_once(&nft_prandom_state);
 		len = sizeof(u32);
 		break;
+#ifdef CONFIG_XFRM
+	case NFT_META_SECPATH:
+		len = sizeof(u8);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -318,6 +328,38 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
+static int nft_meta_get_validate(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nft_data **data)
+{
+#ifdef CONFIG_XFRM
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	if (priv->key != NFT_META_SECPATH)
+		return 0;
+
+	switch (ctx->afi->family) {
+	case NFPROTO_NETDEV:
+		hooks = 1 << NF_NETDEV_INGRESS;
+		break;
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+	case NFPROTO_INET:
+		hooks = (1 << NF_INET_PRE_ROUTING) |
+			(1 << NF_INET_LOCAL_IN) |
+			(1 << NF_INET_FORWARD);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+#else
+	return 0;
+#endif
+}
+
 int nft_meta_set_validate(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr,
 			  const struct nft_data **data)
@@ -434,6 +476,7 @@ static const struct nft_expr_ops nft_meta_get_ops = {
 	.eval		= nft_meta_get_eval,
 	.init		= nft_meta_get_init,
 	.dump		= nft_meta_get_dump,
+	.validate	= nft_meta_get_validate,
 };
 
 static const struct nft_expr_ops nft_meta_set_ops = {
-- 
cgit v1.2.3


From a7f87b47e67e4341f6175cdb80e5c2eaadf30dcb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 30 Dec 2017 22:41:46 +0100
Subject: netfilter: remove defensive check on malformed packets from raw
 sockets

Users cannot forge malformed IPv4/IPv6 headers via raw sockets that they
can inject into the stack. Specifically, not for IPv4 since 55888dfb6ba7
("AF_RAW: Augment raw_send_hdrinc to expand skb to fit iphdr->ihl
(v2)"). IPv6 raw sockets also ensure that packets have a well-formed
IPv6 header available in the skbuff.

At a quick glance, br_netfilter also validates layer 3 headers and
drops malformed IPv4 and IPv6 packets.

Therefore, let's remove this defensive check all over the place.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_filter.c            |  6 -----
 net/ipv4/netfilter/iptable_mangle.c            |  5 ----
 net/ipv4/netfilter/iptable_raw.c               |  6 -----
 net/ipv4/netfilter/iptable_security.c          |  6 -----
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  5 ----
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       | 10 --------
 net/ipv4/netfilter/nf_tables_ipv4.c            | 17 +------------
 net/ipv4/netfilter/nft_chain_route_ipv4.c      |  5 ----
 net/ipv6/netfilter/ip6table_mangle.c           |  8 ------
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  5 ----
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c       |  8 ------
 net/ipv6/netfilter/nf_tables_ipv6.c            | 16 +-----------
 net/netfilter/nf_tables_inet.c                 | 34 +-------------------------
 13 files changed, 3 insertions(+), 128 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 7667f223d7f8..9ac92ea7b93c 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -38,12 +38,6 @@ static unsigned int
 iptable_filter_hook(void *priv, struct sk_buff *skb,
 		    const struct nf_hook_state *state)
 {
-	if (state->hook == NF_INET_LOCAL_OUT &&
-	    (skb->len < sizeof(struct iphdr) ||
-	     ip_hdrlen(skb) < sizeof(struct iphdr)))
-		/* root is playing with raw sockets. */
-		return NF_ACCEPT;
-
 	return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
 }
 
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aebdb337fd7e..dea138ca8925 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -49,11 +49,6 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
 	u_int32_t mark;
 	int err;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
 	/* Save things which could affect route */
 	mark = skb->mark;
 	iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 2642ecd2645c..a869d1fea7d9 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -26,12 +26,6 @@ static unsigned int
 iptable_raw_hook(void *priv, struct sk_buff *skb,
 		 const struct nf_hook_state *state)
 {
-	if (state->hook == NF_INET_LOCAL_OUT &&
-	    (skb->len < sizeof(struct iphdr) ||
-	     ip_hdrlen(skb) < sizeof(struct iphdr)))
-		/* root is playing with raw sockets. */
-		return NF_ACCEPT;
-
 	return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
 }
 
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index ff226596e4b5..e5379fe57b64 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -43,12 +43,6 @@ static unsigned int
 iptable_security_hook(void *priv, struct sk_buff *skb,
 		      const struct nf_hook_state *state)
 {
-	if (state->hook == NF_INET_LOCAL_OUT &&
-	    (skb->len < sizeof(struct iphdr) ||
-	     ip_hdrlen(skb) < sizeof(struct iphdr)))
-		/* Somebody is playing with raw sockets. */
-		return NF_ACCEPT;
-
 	return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
 }
 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index bb2c868a5621..de213a397ea8 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -154,11 +154,6 @@ static unsigned int ipv4_conntrack_local(void *priv,
 					 struct sk_buff *skb,
 					 const struct nf_hook_state *state)
 {
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
 	if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
 		return NF_ACCEPT;
 
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 0443ca4120b0..f7ff6a364d7b 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -356,11 +356,6 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 #endif
 	unsigned int ret;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
 	ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
@@ -396,11 +391,6 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
 	unsigned int ret;
 	int err;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
 	ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 8aeb15c2b9b2..f4675253f1e6 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -30,21 +30,6 @@ static unsigned int nft_do_chain_ipv4(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static unsigned int nft_ipv4_output(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	if (unlikely(skb->len < sizeof(struct iphdr) ||
-		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
-		if (net_ratelimit())
-			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
-				"packet\n");
-		return NF_ACCEPT;
-	}
-
-	return nft_do_chain_ipv4(priv, skb, state);
-}
-
 static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.family		= NFPROTO_IPV4,
 	.nhooks		= NF_INET_NUMHOOKS,
@@ -91,7 +76,7 @@ static const struct nf_chain_type filter_ipv4 = {
 			  (1 << NF_INET_POST_ROUTING),
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
-		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv4,
 		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
 		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
 		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index fb3d49fb62fe..d965c225b9f6 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -33,11 +33,6 @@ static unsigned int nf_route_table_hook(void *priv,
 	const struct iphdr *iph;
 	int err;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
 	nft_set_pktinfo(&pkt, skb, state);
 	nft_set_pktinfo_ipv4(&pkt, skb);
 
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 2b1a9dcdbcb3..b0524b18c4fb 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -42,14 +42,6 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
 	u_int8_t hop_limit;
 	u_int32_t flowlabel, mark;
 	int err;
-#if 0
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr)) {
-		net_warn_ratelimited("ip6t_hook: happy cracking\n");
-		return NF_ACCEPT;
-	}
-#endif
 
 	/* save source/dest address, mark, hoplimit, flowlabel, priority,  */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 7340ca7cc362..11a313fd9273 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -176,11 +176,6 @@ static unsigned int ipv6_conntrack_local(void *priv,
 					 struct sk_buff *skb,
 					 const struct nf_hook_state *state)
 {
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct ipv6hdr)) {
-		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
-		return NF_ACCEPT;
-	}
 	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
 }
 
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 1d2fb9267d6f..bed57ee65f7b 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -369,10 +369,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
 #endif
 	unsigned int ret;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_ACCEPT;
-
 	ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
@@ -408,10 +404,6 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
 	unsigned int ret;
 	int err;
 
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_ACCEPT;
-
 	ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index d4c9ef030e4f..9cd45b964123 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -28,20 +28,6 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static unsigned int nft_ipv6_output(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
-		if (net_ratelimit())
-			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
-				"packet\n");
-		return NF_ACCEPT;
-	}
-
-	return nft_do_chain_ipv6(priv, skb, state);
-}
-
 static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.family		= NFPROTO_IPV6,
 	.nhooks		= NF_INET_NUMHOOKS,
@@ -88,7 +74,7 @@ static const struct nf_chain_type filter_ipv6 = {
 			  (1 << NF_INET_POST_ROUTING),
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
-		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv6,
 		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
 		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
 		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 313987e2b1fe..58b9be7480bb 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -38,38 +38,6 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
 	return nft_do_chain(&pkt, priv);
 }
 
-static unsigned int nft_inet_output(void *priv, struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-
-	switch (state->pf) {
-	case NFPROTO_IPV4:
-		if (unlikely(skb->len < sizeof(struct iphdr) ||
-			     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
-			if (net_ratelimit())
-				pr_info("ignoring short SOCK_RAW packet\n");
-			return NF_ACCEPT;
-		}
-		nft_set_pktinfo_ipv4(&pkt, skb);
-		break;
-	case NFPROTO_IPV6:
-	        if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
-			if (net_ratelimit())
-				pr_info("ignoring short SOCK_RAW packet\n");
-			return NF_ACCEPT;
-		}
-		nft_set_pktinfo_ipv6(&pkt, skb);
-		break;
-	default:
-		break;
-	}
-
-	return nft_do_chain(&pkt, priv);
-}
-
 static struct nft_af_info nft_af_inet __read_mostly = {
 	.family		= NFPROTO_INET,
 	.nhooks		= NF_INET_NUMHOOKS,
@@ -116,7 +84,7 @@ static const struct nf_chain_type filter_inet = {
 			  (1 << NF_INET_POST_ROUTING),
 	.hooks		= {
 		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
-		[NF_INET_LOCAL_OUT]	= nft_inet_output,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_inet,
 		[NF_INET_FORWARD]	= nft_do_chain_inet,
 		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
 		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
-- 
cgit v1.2.3


From 0befd061af59c4ba426588930f09eb9ea2475534 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 2 Jan 2018 12:50:12 +0100
Subject: netfilter: nf_tables: remove nft_dereference()

This macro is unnecessary, it just hides details for one single caller.
nfnl_dereference() is just enough.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ---
 net/netfilter/nf_tables_api.c     | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e040b6151acc..e3ec02fd0f67 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1113,9 +1113,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 
 void nft_trace_notify(struct nft_traceinfo *info);
 
-#define nft_dereference(p)					\
-	nfnl_dereference(p, NFNL_SUBSYS_NFTABLES)
-
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 15773a3189ce..fa564dac66a2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1222,13 +1222,13 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
 static void nft_chain_stats_replace(struct nft_base_chain *chain,
 				    struct nft_stats __percpu *newstats)
 {
+	struct nft_stats __percpu *oldstats;
+
 	if (newstats == NULL)
 		return;
 
 	if (chain->stats) {
-		struct nft_stats __percpu *oldstats =
-				nft_dereference(chain->stats);
-
+		oldstats = nfnl_dereference(chain->stats, NFNL_SUBSYS_NFTABLES);
 		rcu_assign_pointer(chain->stats, newstats);
 		synchronize_rcu();
 		free_percpu(oldstats);
-- 
cgit v1.2.3


From 90964016e5d34758033e75884e41d68ccb93212e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:03:56 +0100
Subject: netfilter: nf_conntrack: add IPS_OFFLOAD status bit

This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.

 # cat /proc/net/nf_conntrack
 ipv4     2 tcp      6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2

Note the [OFFLOAD] tag in the listing.

The timer of such conntrack entries looks stopped from userspace.
In practice, to make sure the conntrack entry does not go away, the
conntrack timer is periodically set to an arbitrarily large value that
gets refreshed on every iteration of the garbage collector, so it
never expires, and such entries display no internal state in the case
of TCP flows. This allows us to save a bit check from the packet path
via nf_ct_is_expired().

Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  6 +++++-
 net/netfilter/nf_conntrack_core.c                  | 20 ++++++++++++++++++++
 net/netfilter/nf_conntrack_netlink.c               | 15 ++++++++++++++-
 net/netfilter/nf_conntrack_proto_tcp.c             |  3 +++
 net/netfilter/nf_conntrack_standalone.c            | 12 ++++++++----
 5 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 3fea7709a441..fc8c15a24a43 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -101,12 +101,16 @@ enum ip_conntrack_status {
 	IPS_HELPER_BIT = 13,
 	IPS_HELPER = (1 << IPS_HELPER_BIT),
 
+	/* Conntrack has been offloaded to flow table. */
+	IPS_OFFLOAD_BIT = 14,
+	IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
+
 	/* Be careful here, modifying these bits can make things messy,
 	 * so don't let users modify them directly.
 	 */
 	IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
 				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
-				 IPS_SEQ_ADJUST | IPS_TEMPLATE),
+				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
 
 	__IPS_MAX_BIT = 14,
 };
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 85f643c1e227..6a64d528d076 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(struct net *net,
 	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 		tmp = nf_ct_tuplehash_to_ctrack(h);
 
+		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+			continue;
+
 		if (nf_ct_is_expired(tmp)) {
 			nf_ct_gc_expired(tmp);
 			continue;
@@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 	return false;
 }
 
+#define	DAY	(86400 * HZ)
+
+/* Set an arbitrary timeout large enough not to ever expire, this saves
+ * us a check for the IPS_OFFLOAD_BIT from the packet path via
+ * nf_ct_is_expired().
+ */
+static void nf_ct_offload_timeout(struct nf_conn *ct)
+{
+	if (nf_ct_expires(ct) < DAY / 2)
+		ct->timeout = nfct_time_stamp + DAY;
+}
+
 static void gc_worker(struct work_struct *work)
 {
 	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
@@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct *work)
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
 			scanned++;
+			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
+				nf_ct_offload_timeout(tmp);
+				continue;
+			}
+
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
 				expired_count++;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 316bbdc4a158..7c7921a53b13 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1110,6 +1110,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 				    .len = NF_CT_LABELS_MAX_SIZE },
 };
 
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
+{
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return 0;
+
+	return ctnetlink_filter_match(ct, data);
+}
+
 static int ctnetlink_flush_conntrack(struct net *net,
 				     const struct nlattr * const cda[],
 				     u32 portid, int report)
@@ -1122,7 +1130,7 @@ static int ctnetlink_flush_conntrack(struct net *net,
 			return PTR_ERR(filter);
 	}
 
-	nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
+	nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
 				  portid, report);
 	kfree(filter);
 
@@ -1168,6 +1176,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
 
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+		nf_ct_put(ct);
+		return -EBUSY;
+	}
+
 	if (cda[CTA_ID]) {
 		u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
 		if (id != (u32)(unsigned long)ct) {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 684cc29010a0..e97cdc1cf98c 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return;
+
 	seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
 }
 #endif
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5a101caa3e12..46d32baad095 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	WARN_ON(!l4proto);
 
 	ret = -ENOSPC;
-	seq_printf(s, "%-8s %u %-8s %u %ld ",
+	seq_printf(s, "%-8s %u %-8s %u ",
 		   l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
-		   l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
-		   nf_ct_expires(ct)  / HZ);
+		   l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
+
+	if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		seq_printf(s, "%ld ", nf_ct_expires(ct)  / HZ);
 
 	if (l4proto->print_conntrack)
 		l4proto->print_conntrack(s, ct);
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
 		goto release;
 
-	if (test_bit(IPS_ASSURED_BIT, &ct->status))
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		seq_puts(s, "[OFFLOAD] ");
+	else if (test_bit(IPS_ASSURED_BIT, &ct->status))
 		seq_puts(s, "[ASSURED] ");
 
 	if (seq_has_overflowed(s))
-- 
cgit v1.2.3


From 3b49e2e94e6ebb8b23d0955d9e898254455734f8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:07 +0100
Subject: netfilter: nf_tables: add flow table netlink frontend

This patch introduces a netlink control plane to create, delete and dump
flow tables. Flow tables are identified by name; this name is used from
rules to refer to a specific flow table. Flow tables use the rhashtable
class and a generic garbage collector to remove expired entries.

This also adds the infrastructure to add different flow table types, so
we can add one for each layer 3 protocol family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h    |  23 +
 include/net/netfilter/nf_tables.h        |  48 ++
 include/uapi/linux/netfilter/nf_tables.h |  53 +++
 net/netfilter/nf_tables_api.c            | 747 ++++++++++++++++++++++++++++++-
 4 files changed, 870 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netfilter/nf_flow_table.h

(limited to 'net')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
new file mode 100644
index 000000000000..3a0779589281
--- /dev/null
+++ b/include/net/netfilter/nf_flow_table.h
@@ -0,0 +1,23 @@
+#ifndef _NF_FLOW_TABLE_H
+#define _NF_FLOW_TABLE_H
+
+#include <linux/rhashtable.h>
+
+struct nf_flowtable;
+
+struct nf_flowtable_type {
+	struct list_head		list;
+	int				family;
+	void				(*gc)(struct work_struct *work);
+	const struct rhashtable_params	*params;
+	nf_hookfn			*hook;
+	struct module			*owner;
+};
+
+struct nf_flowtable {
+	struct rhashtable		rhashtable;
+	const struct nf_flowtable_type	*type;
+	struct delayed_work		gc_work;
+};
+
+#endif /* _NF_FLOW_TABLE_H */
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e3ec02fd0f67..dd238950df81 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -9,6 +9,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <linux/u64_stats_sync.h>
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netlink.h>
 
 #define NFT_JUMP_STACK_SIZE	16
@@ -943,6 +944,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@chains: chains in the table
  *	@sets: sets in the table
  *	@objects: stateful objects in the table
+ *	@flowtables: flow tables in the table
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
@@ -954,6 +956,7 @@ struct nft_table {
 	struct list_head		chains;
 	struct list_head		sets;
 	struct list_head		objects;
+	struct list_head		flowtables;
 	u64				hgenerator;
 	u32				use;
 	u16				flags:14,
@@ -1084,6 +1087,44 @@ struct nft_object_ops {
 int nft_register_obj(struct nft_object_type *obj_type);
 void nft_unregister_obj(struct nft_object_type *obj_type);
 
+/**
+ *	struct nft_flowtable - nf_tables flow table
+ *
+ *	@list: flow table list node in table list
+ * 	@table: the table the flow table is contained in
+ *	@name: name of this flow table
+ *	@hooknum: hook number
+ *	@priority: hook priority
+ *	@ops_len: number of hooks in array
+ *	@genmask: generation mask
+ *	@use: number of references to this flow table
+ *	@data: rhashtable and garbage collector
+ * 	@ops: array of hooks
+ */
+struct nft_flowtable {
+	struct list_head		list;
+	struct nft_table		*table;
+	char				*name;
+	int				hooknum;
+	int				priority;
+	int				ops_len;
+	u32				genmask:2,
+					use:30;
+	/* runtime data below here */
+	struct nf_hook_ops		*ops ____cacheline_aligned;
+	struct nf_flowtable		data;
+};
+
+struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u8 genmask);
+void nft_flow_table_iterate(struct net *net,
+			    void (*iter)(struct nf_flowtable *flowtable, void *data),
+			    void *data);
+
+void nft_register_flowtable_type(struct nf_flowtable_type *type);
+void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
+
 /**
  *	struct nft_traceinfo - nft tracing information and state
  *
@@ -1317,4 +1358,11 @@ struct nft_trans_obj {
 #define nft_trans_obj(trans)	\
 	(((struct nft_trans_obj *)trans->data)->obj)
 
+struct nft_trans_flowtable {
+	struct nft_flowtable		*flowtable;
+};
+
+#define nft_trans_flowtable(trans)	\
+	(((struct nft_trans_flowtable *)trans->data)->flowtable)
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 2efbf9744c2a..591b53bce070 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -92,6 +92,9 @@ enum nft_verdicts {
  * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes)
+ * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes)
+ * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -116,6 +119,9 @@ enum nf_tables_msg_types {
 	NFT_MSG_GETOBJ,
 	NFT_MSG_DELOBJ,
 	NFT_MSG_GETOBJ_RESET,
+	NFT_MSG_NEWFLOWTABLE,
+	NFT_MSG_GETFLOWTABLE,
+	NFT_MSG_DELFLOWTABLE,
 	NFT_MSG_MAX,
 };
 
@@ -1309,6 +1315,53 @@ enum nft_object_attributes {
 };
 #define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
 
+/**
+ * enum nft_flowtable_attributes - nf_tables flow table netlink attributes
+ *
+ * @NFTA_FLOWTABLE_TABLE: name of the table containing the flow table (NLA_STRING)
+ * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING)
+ * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration (NLA_NESTED)
+ * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
+ */
+enum nft_flowtable_attributes {
+	NFTA_FLOWTABLE_UNSPEC,
+	NFTA_FLOWTABLE_TABLE,
+	NFTA_FLOWTABLE_NAME,
+	NFTA_FLOWTABLE_HOOK,
+	NFTA_FLOWTABLE_USE,
+	__NFTA_FLOWTABLE_MAX
+};
+#define NFTA_FLOWTABLE_MAX	(__NFTA_FLOWTABLE_MAX - 1)
+
+/**
+ * enum nft_flowtable_hook_attributes - nf_tables flow table hook netlink attributes
+ *
+ * @NFTA_FLOWTABLE_HOOK_NUM: netfilter hook number (NLA_U32)
+ * @NFTA_FLOWTABLE_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
+ * @NFTA_FLOWTABLE_HOOK_DEVS: input devices this flow table is bound to (NLA_NESTED)
+ */
+enum nft_flowtable_hook_attributes {
+	NFTA_FLOWTABLE_HOOK_UNSPEC,
+	NFTA_FLOWTABLE_HOOK_NUM,
+	NFTA_FLOWTABLE_HOOK_PRIORITY,
+	NFTA_FLOWTABLE_HOOK_DEVS,
+	__NFTA_FLOWTABLE_HOOK_MAX
+};
+#define NFTA_FLOWTABLE_HOOK_MAX	(__NFTA_FLOWTABLE_HOOK_MAX - 1)
+
+/**
+ * enum nft_devices_attributes - nf_tables device netlink attributes
+ *
+ * @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
+ */
+enum nft_devices_attributes {
+	NFTA_DEVICE_UNSPEC,
+	NFTA_DEVICE_NAME,
+	__NFTA_DEVICE_MAX
+};
+#define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1)
+
+
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fa564dac66a2..db0933256ec9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -17,6 +17,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/net_namespace.h>
@@ -24,6 +25,7 @@
 
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
+static LIST_HEAD(nf_tables_flowtables);
 
 /**
  *	nft_register_afinfo - register nf_tables address family info
@@ -345,6 +347,40 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
 	return err;
 }
 
+static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
+				   struct nft_flowtable *flowtable)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc(ctx, msg_type,
+				sizeof(struct nft_trans_flowtable));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWFLOWTABLE)
+		nft_activate_next(ctx->net, flowtable);
+
+	nft_trans_flowtable(trans) = flowtable;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	return 0;
+}
+
+static int nft_delflowtable(struct nft_ctx *ctx,
+			    struct nft_flowtable *flowtable)
+{
+	int err;
+
+	err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
+	if (err < 0)
+		return err;
+
+	nft_deactivate_next(ctx->net, flowtable);
+	ctx->table->use--;
+
+	return err;
+}
+
 /*
  * Tables
  */
@@ -728,6 +764,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
+	INIT_LIST_HEAD(&table->flowtables);
 	table->flags = flags;
 
 	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -749,10 +786,11 @@ err1:
 
 static int nft_flush_table(struct nft_ctx *ctx)
 {
-	int err;
+	struct nft_flowtable *flowtable, *nft;
 	struct nft_chain *chain, *nc;
 	struct nft_object *obj, *ne;
 	struct nft_set *set, *ns;
+	int err;
 
 	list_for_each_entry(chain, &ctx->table->chains, list) {
 		if (!nft_is_active_next(ctx->net, chain))
@@ -778,6 +816,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
 			goto out;
 	}
 
+	list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) {
+		err = nft_delflowtable(ctx, flowtable);
+		if (err < 0)
+			goto out;
+	}
+
 	list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
 		err = nft_delobj(ctx, obj);
 		if (err < 0)
@@ -4839,6 +4883,605 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx,
 		       ctx->afi->family, ctx->report, GFP_KERNEL);
 }
 
+/*
+ * Flow tables
+ */
+void nft_register_flowtable_type(struct nf_flowtable_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail_rcu(&type->list, &nf_tables_flowtables);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_register_flowtable_type);
+
+void nft_unregister_flowtable_type(struct nf_flowtable_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del_rcu(&type->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type);
+
+static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
+	[NFTA_FLOWTABLE_TABLE]		= { .type = NLA_STRING,
+					    .len = NFT_NAME_MAXLEN - 1 },
+	[NFTA_FLOWTABLE_NAME]		= { .type = NLA_STRING,
+					    .len = NFT_NAME_MAXLEN - 1 },
+	[NFTA_FLOWTABLE_HOOK]		= { .type = NLA_NESTED },
+};
+
+struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u8 genmask)
+{
+	struct nft_flowtable *flowtable;
+
+	list_for_each_entry(flowtable, &table->flowtables, list) {
+		if (!nla_strcmp(nla, flowtable->name) &&
+		    nft_active_genmask(flowtable, genmask))
+			return flowtable;
+	}
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+
+#define NFT_FLOWTABLE_DEVICE_MAX	8
+
+static int nf_tables_parse_devices(const struct nft_ctx *ctx,
+				   const struct nlattr *attr,
+				   struct net_device *dev_array[], int *len)
+{
+	const struct nlattr *tmp;
+	struct net_device *dev;
+	char ifname[IFNAMSIZ];
+	int rem, n = 0, err;
+
+	nla_for_each_nested(tmp, attr, rem) {
+		if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+			err = -EINVAL;
+			goto err1;
+		}
+
+		nla_strlcpy(ifname, tmp, IFNAMSIZ);
+		dev = dev_get_by_name(ctx->net, ifname);
+		if (!dev) {
+			err = -ENOENT;
+			goto err1;
+		}
+
+		dev_array[n++] = dev;
+		if (n == NFT_FLOWTABLE_DEVICE_MAX) {
+			err = -EFBIG;
+			goto err1;
+		}
+	}
+	if (!len)
+		return -EINVAL;
+
+	err = 0;
+err1:
+	*len = n;
+	return err;
+}
+
+static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
+	[NFTA_FLOWTABLE_HOOK_NUM]	= { .type = NLA_U32 },
+	[NFTA_FLOWTABLE_HOOK_PRIORITY]	= { .type = NLA_U32 },
+	[NFTA_FLOWTABLE_HOOK_DEVS]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
+					  const struct nlattr *attr,
+					  struct nft_flowtable *flowtable)
+{
+	struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
+	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+	struct nf_hook_ops *ops;
+	int hooknum, priority;
+	int err, n = 0, i;
+
+	err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
+			       nft_flowtable_hook_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
+	    !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] ||
+	    !tb[NFTA_FLOWTABLE_HOOK_DEVS])
+		return -EINVAL;
+
+	hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+	if (hooknum >= ctx->afi->nhooks)
+		return -EINVAL;
+
+	priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+
+	err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
+				      dev_array, &n);
+	if (err < 0)
+		goto err1;
+
+	ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL);
+	if (!ops) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	flowtable->ops		= ops;
+	flowtable->ops_len	= n;
+
+	for (i = 0; i < n; i++) {
+		flowtable->ops[i].pf		= NFPROTO_NETDEV;
+		flowtable->ops[i].hooknum	= hooknum;
+		flowtable->ops[i].priority	= priority;
+		flowtable->ops[i].priv		= &flowtable->data.rhashtable;
+		flowtable->ops[i].hook		= flowtable->data.type->hook;
+		flowtable->ops[i].dev		= dev_array[i];
+	}
+
+	err = 0;
+err1:
+	for (i = 0; i < n; i++)
+		dev_put(dev_array[i]);
+
+	return err;
+}
+
+static const struct nf_flowtable_type *
+__nft_flowtable_type_get(const struct nft_af_info *afi)
+{
+	const struct nf_flowtable_type *type;
+
+	list_for_each_entry(type, &nf_tables_flowtables, list) {
+		if (afi->family == type->family)
+			return type;
+	}
+	return NULL;
+}
+
+static const struct nf_flowtable_type *
+nft_flowtable_type_get(const struct nft_af_info *afi)
+{
+	const struct nf_flowtable_type *type;
+
+	type = __nft_flowtable_type_get(afi);
+	if (type != NULL && try_module_get(type->owner))
+		return type;
+
+#ifdef CONFIG_MODULES
+	if (type == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nf-flowtable-%u", afi->family);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_flowtable_type_get(afi))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+void nft_flow_table_iterate(struct net *net,
+			    void (*iter)(struct nf_flowtable *flowtable, void *data),
+			    void *data)
+{
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+				iter(&flowtable->data, data);
+			}
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
+
+static void nft_unregister_flowtable_net_hooks(struct net *net,
+					       struct nft_flowtable *flowtable)
+{
+	int i;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		if (!flowtable->ops[i].dev)
+			continue;
+
+		nf_unregister_net_hook(net, &flowtable->ops[i]);
+	}
+}
+
+static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nf_flowtable_type *type;
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_ctx ctx;
+	int err, i, k;
+
+	if (!nla[NFTA_FLOWTABLE_TABLE] ||
+	    !nla[NFTA_FLOWTABLE_NAME] ||
+	    !nla[NFTA_FLOWTABLE_HOOK])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable)) {
+		err = PTR_ERR(flowtable);
+		if (err != -ENOENT)
+			return err;
+	} else {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+
+		return 0;
+	}
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+	if (!flowtable)
+		return -ENOMEM;
+
+	flowtable->table = table;
+	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+	if (!flowtable->name) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	type = nft_flowtable_type_get(afi);
+	if (IS_ERR(type)) {
+		err = PTR_ERR(type);
+		goto err2;
+	}
+
+	flowtable->data.type = type;
+	err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+	if (err < 0)
+		goto err3;
+
+	err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
+					     flowtable);
+	if (err < 0)
+		goto err3;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		err = nf_register_net_hook(net, &flowtable->ops[i]);
+		if (err < 0)
+			goto err4;
+	}
+
+	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+	if (err < 0)
+		goto err5;
+
+	INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
+	queue_delayed_work(system_power_efficient_wq,
+			   &flowtable->data.gc_work, HZ);
+
+	list_add_tail_rcu(&flowtable->list, &table->flowtables);
+	table->use++;
+
+	return 0;
+err5:
+	i = flowtable->ops_len;
+err4:
+	for (k = i - 1; k >= 0; k--)
+		nf_unregister_net_hook(net, &flowtable->ops[k]);
+
+	kfree(flowtable->ops);
+err3:
+	module_put(type->owner);
+err2:
+	kfree(flowtable->name);
+err1:
+	kfree(flowtable);
+	return err;
+}
+
+static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_ctx ctx;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable))
+                return PTR_ERR(flowtable);
+	if (flowtable->use > 0)
+		return -EBUSY;
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	return nft_delflowtable(&ctx, flowtable);
+}
+
+/*
+ * Serialize one flowtable into @skb as a netlink message of type @event.
+ *
+ * The message carries the table name, flowtable name and use counter, plus a
+ * nested NFTA_FLOWTABLE_HOOK attribute holding the hook number, priority and
+ * the names of the devices that still have a hook registered.
+ *
+ * Returns 0 on success. If any netlink put fails, the partially written
+ * message is trimmed from @skb and -1 is returned.
+ */
+static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
+					 u32 portid, u32 seq, int event,
+					 u32 flags, int family,
+					 struct nft_flowtable *flowtable)
+{
+	struct nlattr *nest, *nest_devs;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+	int i;
+
+	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= htons(net->nft.base_seq & 0xffff);
+
+	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
+	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
+	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)))
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
+	/* nla_nest_end() below must never be handed a NULL nest. */
+	if (!nest)
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
+	    nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
+		goto nla_put_failure;
+
+	nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS);
+	if (!nest_devs)
+		goto nla_put_failure;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		/* ops[i].dev is cleared once its device has gone away. */
+		if (flowtable->ops[i].dev &&
+		    nla_put_string(skb, NFTA_DEVICE_NAME,
+				   flowtable->ops[i].dev->name))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest_devs);
+	nla_nest_end(skb, nest);
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+/* Optional filter for flowtable dumps, stored in netlink_callback->data. */
+struct nft_flowtable_filter {
+	char		*table;	/* only dump flowtables of the table with this name */
+};
+
+/*
+ * Netlink dump callback: emit one NFT_MSG_NEWFLOWTABLE message per active
+ * flowtable, optionally restricted to one family and one table name.
+ *
+ * cb->args[0] carries the resume position between invocations: entries with
+ * an index below it are skipped, and on return it holds the index of the
+ * first entry that has not been written yet. Returns skb->len.
+ */
+static int nf_tables_dump_flowtable(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct nft_flowtable_filter *filter = cb->data;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+
+	rcu_read_lock();
+	cb->seq = net->nft.base_seq;
+
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+				/* Skipped entries still advance idx (see cont:). */
+				if (!nft_is_active(net, flowtable))
+					goto cont;
+				if (idx < s_idx)
+					goto cont;
+				/* Past the resume point: clear the remaining args. */
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (filter && filter->table[0] &&
+				    strcmp(filter->table, table->name))
+					goto cont;
+
+				/*
+				 * On failure leave idx untouched so this entry
+				 * is retried on the next invocation.
+				 */
+				if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
+								  cb->nlh->nlmsg_seq,
+								  NFT_MSG_NEWFLOWTABLE,
+								  NLM_F_MULTI | NLM_F_APPEND,
+								  afi->family, flowtable) < 0)
+					goto done;
+
+				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+				idx++;
+			}
+		}
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+/* Dump completion callback: free the filter stored in cb->data, if any. */
+static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
+{
+	struct nft_flowtable_filter *flt = cb->data;
+
+	if (flt) {
+		kfree(flt->table);
+		kfree(flt);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a dump filter and, when NFTA_FLOWTABLE_TABLE is present, copy the
+ * table name into it. Returns the filter or ERR_PTR(-ENOMEM).
+ */
+static struct nft_flowtable_filter *
+nft_flowtable_filter_alloc(const struct nlattr * const nla[])
+{
+	struct nft_flowtable_filter *flt;
+
+	flt = kzalloc(sizeof(*flt), GFP_KERNEL);
+	if (!flt)
+		return ERR_PTR(-ENOMEM);
+
+	/* Without a table attribute the zeroed filter is returned as is. */
+	if (!nla[NFTA_FLOWTABLE_TABLE])
+		return flt;
+
+	flt->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], GFP_KERNEL);
+	if (flt->table)
+		return flt;
+
+	kfree(flt);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Handle NFT_MSG_GETFLOWTABLE.
+ *
+ * With NLM_F_DUMP set, start a netlink dump (filtered by table name when
+ * NFTA_FLOWTABLE_TABLE is given). Otherwise look up the single flowtable
+ * named by NFTA_FLOWTABLE_NAME in the current generation and unicast its
+ * description back to the requester.
+ *
+ * Returns the result of netlink_dump_start()/nlmsg_unicast() on success or a
+ * negative value on failure.
+ */
+static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct sk_buff *skb2;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_flowtable,
+			.done = nf_tables_dump_flowtable_done,
+		};
+
+		if (nla[NFTA_FLOWTABLE_TABLE]) {
+			struct nft_flowtable_filter *filter;
+
+			filter = nft_flowtable_filter_alloc(nla);
+			if (IS_ERR(filter))
+				return -ENOMEM;
+
+			/* Freed by nf_tables_dump_flowtable_done(). */
+			c.data = filter;
+		}
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	if (!nla[NFTA_FLOWTABLE_NAME])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	/* Check the flowtable lookup itself; table was validated above. */
+	if (IS_ERR(flowtable))
+		return PTR_ERR(flowtable);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
+					    nlh->nlmsg_seq,
+					    NFT_MSG_NEWFLOWTABLE, 0, family,
+					    flowtable);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+/*
+ * Send a flowtable @event notification to the NFNLGRP_NFTABLES group and,
+ * when ctx->report is set, back to the requesting port.
+ *
+ * If the message cannot be allocated or filled, -ENOBUFS is flagged via
+ * nfnetlink_set_err() instead.
+ */
+static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
+				       struct nft_flowtable *flowtable,
+				       int event)
+{
+	struct sk_buff *skb;
+	int err;
+
+	/*
+	 * Nothing to do only when nobody wants the message: the requester did
+	 * not ask for a report and the group has no listeners.
+	 */
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return;
+
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
+					    ctx->seq, event, 0,
+					    ctx->afi->family, flowtable);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+		       ctx->report, GFP_KERNEL);
+	return;
+err:
+	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
+}
+
+/*
+ * Per-element callback for rhashtable_free_and_destroy(): kfree() the hashed
+ * object. @arg is unused.
+ *
+ * NOTE(review): flow_offload_add() inserts two nodes per flow (one per
+ * direction) from a single allocation, so this may be invoked twice per flow,
+ * once with an interior pointer -- confirm this is safe for populated tables.
+ */
+static void nft_flowtable_destroy(void *ptr, void *arg)
+{
+	kfree(ptr);
+}
+
+/*
+ * Release everything owned by @flowtable: stop its garbage collector, free
+ * the hook ops array, the name and all hash table entries, drop the module
+ * reference of the flowtable type and finally free the flowtable itself.
+ *
+ * Hooks must already be unregistered; @flowtable is invalid on return.
+ */
+static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
+{
+	/* The gc work walks the rhashtable, so stop it before teardown. */
+	cancel_delayed_work_sync(&flowtable->data.gc_work);
+	kfree(flowtable->ops);
+	kfree(flowtable->name);
+	rhashtable_free_and_destroy(&flowtable->data.rhashtable,
+				    nft_flowtable_destroy, NULL);
+	module_put(flowtable->data.type->owner);
+	/* Last: data.type above lives inside the flowtable. */
+	kfree(flowtable);
+}
+
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
 {
@@ -4869,6 +5512,49 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+/*
+ * Detach @dev from @flowtable: unregister the first hook bound to the device
+ * and clear its dev pointer. @event is not inspected here.
+ */
+static void nft_flowtable_event(unsigned long event, struct net_device *dev,
+				struct nft_flowtable *flowtable)
+{
+	struct nf_hook_ops *ops;
+	int n;
+
+	for (n = 0; n < flowtable->ops_len; n++) {
+		ops = &flowtable->ops[n];
+		if (ops->dev == dev) {
+			nf_unregister_net_hook(dev_net(dev), ops);
+			ops->dev = NULL;
+			return;
+		}
+	}
+}
+
+/*
+ * Netdevice notifier: on NETDEV_UNREGISTER, walk every flowtable of the
+ * device's namespace under the nftables nfnl lock and detach the device from
+ * it. All other events are ignored.
+ */
+static int nf_tables_flowtable_event(struct notifier_block *this,
+				     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct nft_flowtable *ft;
+	struct nft_table *tbl;
+	struct nft_af_info *family;
+
+	if (event == NETDEV_UNREGISTER) {
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		list_for_each_entry(family, &dev_net(dev)->nft.af_info, list)
+			list_for_each_entry(tbl, &family->tables, list)
+				list_for_each_entry(ft, &tbl->flowtables, list)
+					nft_flowtable_event(event, dev, ft);
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier that detaches unregistering devices from flowtables. */
+static struct notifier_block nf_tables_flowtable_notifier = {
+	.notifier_call	= nf_tables_flowtable_event,
+};
+
 static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
 				 int event)
 {
@@ -5021,6 +5707,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
+	[NFT_MSG_NEWFLOWTABLE] = {
+		.call_batch	= nf_tables_newflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
+	[NFT_MSG_GETFLOWTABLE] = {
+		.call		= nf_tables_getflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
+	[NFT_MSG_DELFLOWTABLE] = {
+		.call_batch	= nf_tables_delflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
 };
 
 static void nft_chain_commit_update(struct nft_trans *trans)
@@ -5066,6 +5767,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DELOBJ:
 		nft_obj_destroy(nft_trans_obj(trans));
 		break;
+	case NFT_MSG_DELFLOWTABLE:
+		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -5183,6 +5887,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
 					     NFT_MSG_DELOBJ);
 			break;
+		case NFT_MSG_NEWFLOWTABLE:
+			nft_clear(net, nft_trans_flowtable(trans));
+			nf_tables_flowtable_notify(&trans->ctx,
+						   nft_trans_flowtable(trans),
+						   NFT_MSG_NEWFLOWTABLE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELFLOWTABLE:
+			list_del_rcu(&nft_trans_flowtable(trans)->list);
+			nf_tables_flowtable_notify(&trans->ctx,
+						   nft_trans_flowtable(trans),
+						   NFT_MSG_DELFLOWTABLE);
+			nft_unregister_flowtable_net_hooks(net,
+					nft_trans_flowtable(trans));
+			break;
 		}
 	}
 
@@ -5220,6 +5939,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 	case NFT_MSG_NEWOBJ:
 		nft_obj_destroy(nft_trans_obj(trans));
 		break;
+	case NFT_MSG_NEWFLOWTABLE:
+		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -5309,6 +6031,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			nft_clear(trans->ctx.net, nft_trans_obj(trans));
 			nft_trans_destroy(trans);
 			break;
+		case NFT_MSG_NEWFLOWTABLE:
+			trans->ctx.table->use--;
+			list_del_rcu(&nft_trans_flowtable(trans)->list);
+			nft_unregister_flowtable_net_hooks(net,
+					nft_trans_flowtable(trans));
+			break;
+		case NFT_MSG_DELFLOWTABLE:
+			trans->ctx.table->use++;
+			nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+			nft_trans_destroy(trans);
+			break;
 		}
 	}
 
@@ -5865,6 +6598,7 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain);
 /* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */
 static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 {
+	struct nft_flowtable *flowtable, *nf;
 	struct nft_table *table, *nt;
 	struct nft_chain *chain, *nc;
 	struct nft_object *obj, *ne;
@@ -5878,6 +6612,9 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 	list_for_each_entry_safe(table, nt, &afi->tables, list) {
 		list_for_each_entry(chain, &table->chains, list)
 			nf_tables_unregister_hook(net, table, chain);
+		list_for_each_entry(flowtable, &table->flowtables, list)
+			nf_unregister_net_hooks(net, flowtable->ops,
+						flowtable->ops_len);
 		/* No packets are walking on these chains anymore. */
 		ctx.table = table;
 		list_for_each_entry(chain, &table->chains, list) {
@@ -5888,6 +6625,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 				nf_tables_rule_destroy(&ctx, rule);
 			}
 		}
+		list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
+			list_del(&flowtable->list);
+			table->use--;
+			nf_tables_flowtable_destroy(flowtable);
+		}
 		list_for_each_entry_safe(set, ns, &table->sets, list) {
 			list_del(&set->list);
 			table->use--;
@@ -5932,6 +6674,8 @@ static int __init nf_tables_module_init(void)
 	if (err < 0)
 		goto err3;
 
+	register_netdevice_notifier(&nf_tables_flowtable_notifier);
+
 	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
 	return register_pernet_subsys(&nf_tables_net_ops);
 err3:
@@ -5946,6 +6690,7 @@ static void __exit nf_tables_module_exit(void)
 {
 	unregister_pernet_subsys(&nf_tables_net_ops);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
+	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
 	rcu_barrier();
 	nf_tables_core_module_exit();
 	kfree(info);
-- 
cgit v1.2.3


From ac2a66665e231847cab11b8c8e844ce43207dd2e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:11 +0100
Subject: netfilter: add generic flow table infrastructure

This patch defines the API to interact with flow tables. It allows adding,
deleting and looking up entries in the flow table. It also adds the
generic garbage collection code that removes entries that have expired,
i.e. no traffic has been seen for a while.

Users of the flow table infrastructure can delete entries via
flow_offload_dead(), which sets the dying bit. This signals the garbage
collector to release the entry from user context.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  94 ++++++++
 net/netfilter/Kconfig                 |   7 +
 net/netfilter/Makefile                |   3 +
 net/netfilter/nf_flow_table.c         | 429 ++++++++++++++++++++++++++++++++++
 4 files changed, 533 insertions(+)
 create mode 100644 net/netfilter/nf_flow_table.c

(limited to 'net')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 3a0779589281..161f71ca78a0 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -1,7 +1,12 @@
 #ifndef _NF_FLOW_TABLE_H
 #define _NF_FLOW_TABLE_H
 
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
 #include <linux/rhashtable.h>
+#include <linux/rcupdate.h>
+#include <net/dst.h>
 
 struct nf_flowtable;
 
@@ -20,4 +25,93 @@ struct nf_flowtable {
 	struct delayed_work		gc_work;
 };
 
+/* Direction of a flow tuple; also the index into flow_offload::tuplehash[]. */
+enum flow_offload_tuple_dir {
+	FLOW_OFFLOAD_DIR_ORIGINAL,
+	FLOW_OFFLOAD_DIR_REPLY,
+	__FLOW_OFFLOAD_DIR_MAX		= FLOW_OFFLOAD_DIR_REPLY,
+};
+/* Number of directions, i.e. the size of per-direction arrays. */
+#define FLOW_OFFLOAD_DIR_MAX	(__FLOW_OFFLOAD_DIR_MAX + 1)
+
+/*
+ * One direction of an offloaded flow.
+ *
+ * The hash and compare helpers in nf_flow_table.c cover the first
+ * offsetof(struct flow_offload_tuple, dir) bytes, so every member placed
+ * before @dir is part of the lookup key and the member order matters.
+ */
+struct flow_offload_tuple {
+	union {
+		struct in_addr		src_v4;
+		struct in6_addr		src_v6;
+	};
+	union {
+		struct in_addr		dst_v4;
+		struct in6_addr		dst_v6;
+	};
+	struct {
+		__be16			src_port;
+		__be16			dst_port;
+	};
+
+	int				iifidx;
+
+	u8				l3proto;
+	u8				l4proto;
+	/* End of the hashed key; everything from here on is payload. */
+	u8				dir;
+
+	int				oifidx;
+
+	struct dst_entry		*dst_cache;
+};
+
+/* A tuple together with the rhashtable linkage it is hashed by. */
+struct flow_offload_tuple_rhash {
+	struct rhash_head		node;
+	struct flow_offload_tuple	tuple;
+};
+
+/* Bits for flow_offload::flags. */
+#define FLOW_OFFLOAD_SNAT	0x1
+#define FLOW_OFFLOAD_DNAT	0x2
+/* Set by flow_offload_dead(); the garbage collector then removes the flow. */
+#define FLOW_OFFLOAD_DYING	0x4
+
+/* An offloaded flow: one hashed tuple per direction plus its state. */
+struct flow_offload {
+	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
+	u32					flags;
+	union {
+		/* Your private driver data here. */
+		/* Expiry stamp, compared against (u32)jiffies by the gc. */
+		u32		timeout;
+	};
+};
+
+#define NF_FLOW_TIMEOUT (30 * HZ)
+
+/*
+ * Per-direction routing input for flow_offload_alloc(): the dst entry that
+ * is cached in the tuple and the interface index used for iifidx/oifidx.
+ */
+struct nf_flow_route {
+	struct {
+		struct dst_entry	*dst;
+		int			ifindex;
+	} tuple[FLOW_OFFLOAD_DIR_MAX];
+};
+
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
+					struct nf_flow_route *route);
+void flow_offload_free(struct flow_offload *flow);
+
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
+void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
+struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
+						     struct flow_offload_tuple *tuple);
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+			  void (*iter)(struct flow_offload *flow, void *data),
+			  void *data);
+void nf_flow_offload_work_gc(struct work_struct *work);
+extern const struct rhashtable_params nf_flow_offload_rhash_params;
+
+void flow_offload_dead(struct flow_offload *flow);
+
+int nf_flow_snat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir);
+int nf_flow_dnat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir);
+
+/*
+ * Source/destination port pair; nf_flow_snat_port() and nf_flow_dnat_port()
+ * overlay it on the packet at the transport header offset.
+ */
+struct flow_ports {
+	__be16 source, dest;
+};
+
+#define MODULE_ALIAS_NF_FLOWTABLE(family)	\
+	MODULE_ALIAS("nf-flowtable-" __stringify(family))
+
 #endif /* _FLOW_OFFLOAD_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index af3d9f721b3f..264ce877ef49 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -657,6 +657,13 @@ endif # NF_TABLES_NETDEV
 
 endif # NF_TABLES
 
+config NF_FLOW_TABLE
+	tristate "Netfilter flow table module"
+	help
+	  This option adds the flow table core infrastructure.
+
+	  To compile it as a module, choose M here.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index eec0c3b72926..2930f2b854be 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -109,6 +109,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
 obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
+# flow table infrastructure
+obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
new file mode 100644
index 000000000000..2f5099cb85b8
--- /dev/null
+++ b/net/netfilter/nf_flow_table.c
@@ -0,0 +1,429 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+/*
+ * Allocation unit behind every struct flow_offload handed out by this file.
+ * @flow is recovered into the entry with container_of().
+ */
+struct flow_offload_entry {
+	struct flow_offload	flow;
+	struct nf_conn		*ct;		/* reference taken in flow_offload_alloc() */
+	struct rcu_head		rcu_head;	/* for kfree_rcu() in flow_offload_del() */
+};
+
+/*
+ * Fill flow->tuplehash[dir] from the conntrack tuple of the same direction
+ * and from @route. The layer 3/4 protocol numbers are always taken from the
+ * original direction of @ct.
+ */
+static void flow_offload_fill_dir(struct flow_offload *flow,
+				  const struct nf_conn *ct,
+				  const struct nf_flow_route *route,
+				  enum flow_offload_tuple_dir dir)
+{
+	enum flow_offload_tuple_dir rev = dir == FLOW_OFFLOAD_DIR_ORIGINAL ?
+			FLOW_OFFLOAD_DIR_REPLY : FLOW_OFFLOAD_DIR_ORIGINAL;
+	enum ip_conntrack_dir ct_dir = dir == FLOW_OFFLOAD_DIR_ORIGINAL ?
+			IP_CT_DIR_ORIGINAL : IP_CT_DIR_REPLY;
+	const struct nf_conntrack_tuple *orig =
+			&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	const struct nf_conntrack_tuple *ctt = &ct->tuplehash[ct_dir].tuple;
+	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+
+	switch (orig->src.l3num) {
+	case NFPROTO_IPV4:
+		ft->src_v4 = ctt->src.u3.in;
+		ft->dst_v4 = ctt->dst.u3.in;
+		break;
+	case NFPROTO_IPV6:
+		ft->src_v6 = ctt->src.u3.in6;
+		ft->dst_v6 = ctt->dst.u3.in6;
+		break;
+	}
+
+	ft->l3proto = orig->src.l3num;
+	ft->l4proto = orig->dst.protonum;
+	ft->src_port = ctt->src.u.tcp.port;
+	ft->dst_port = ctt->dst.u.tcp.port;
+	ft->dir = dir;
+
+	ft->dst_cache = route->tuple[dir].dst;
+	/* Packets of this direction enter on its own route's interface ... */
+	ft->iifidx = route->tuple[dir].ifindex;
+	/* ... and leave on the interface of the opposite direction. */
+	ft->oifidx = route->tuple[rev].ifindex;
+}
+
+/*
+ * Allocate a flow for conntrack entry @ct using the per-direction routes in
+ * @route.
+ *
+ * On success the flow holds a reference on @ct and on both route dst
+ * entries. Returns NULL if @ct is dying or has no reference left, if the
+ * allocation fails, or if either dst entry cannot be held; nothing is held
+ * in that case.
+ */
+struct flow_offload *
+flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+{
+	struct flow_offload_entry *entry;
+	struct flow_offload *flow;
+
+	if (unlikely(nf_ct_is_dying(ct) ||
+	    !atomic_inc_not_zero(&ct->ct_general.use)))
+		return NULL;
+
+	/* Zeroed so that unused key bytes (e.g. IPv4 in the v6 union) are 0. */
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		goto err_ct_refcnt;
+
+	flow = &entry->flow;
+
+	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
+		goto err_dst_cache_original;
+
+	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
+		goto err_dst_cache_reply;
+
+	entry->ct = ct;
+
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+
+	/*
+	 * The NAT flags are independent bits: a connection that is both
+	 * source- and destination-NATed needs both of them.
+	 */
+	if (ct->status & IPS_SRC_NAT)
+		flow->flags |= FLOW_OFFLOAD_SNAT;
+	if (ct->status & IPS_DST_NAT)
+		flow->flags |= FLOW_OFFLOAD_DNAT;
+
+	return flow;
+
+err_dst_cache_reply:
+	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+err_dst_cache_original:
+	kfree(entry);
+err_ct_refcnt:
+	nf_ct_put(ct);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
+
+/*
+ * Drop both cached dst references of @flow and free its entry immediately.
+ *
+ * NOTE(review): the conntrack reference taken by flow_offload_alloc() is not
+ * dropped here -- confirm that callers release it.
+ */
+void flow_offload_free(struct flow_offload *flow)
+{
+	struct flow_offload_entry *entry =
+		container_of(flow, struct flow_offload_entry, flow);
+	struct flow_offload_tuple *orig, *reply;
+
+	orig = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
+	reply = &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple;
+
+	dst_release(orig->dst_cache);
+	dst_release(reply->dst_cache);
+	kfree(entry);
+}
+EXPORT_SYMBOL_GPL(flow_offload_free);
+
+/*
+ * Mark @flow as dying. The flow stays in the table; the garbage collector
+ * (nf_flow_offload_work_gc()) removes flows carrying this flag.
+ */
+void flow_offload_dead(struct flow_offload *flow)
+{
+	flow->flags |= FLOW_OFFLOAD_DYING;
+}
+EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+/*
+ * Insert both directions of @flow into @flow_table.
+ *
+ * Returns 0 on success or the negative error of the failing hash table
+ * insertion; in that case neither direction is left in the table.
+ */
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
+{
+	int err;
+
+	flow->timeout = (u32)jiffies;
+
+	err = rhashtable_insert_fast(&flow_table->rhashtable,
+				     &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+				     *flow_table->type->params);
+	if (err < 0)
+		return err;
+
+	err = rhashtable_insert_fast(&flow_table->rhashtable,
+				     &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+				     *flow_table->type->params);
+	if (err < 0) {
+		/* Do not leave a half-inserted flow behind. */
+		rhashtable_remove_fast(&flow_table->rhashtable,
+				       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+				       *flow_table->type->params);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+/*
+ * Unlink both directions of @flow from @flow_table and free the entry after
+ * an RCU grace period.
+ */
+void flow_offload_del(struct nf_flowtable *flow_table,
+		      struct flow_offload *flow)
+{
+	struct flow_offload_entry *entry =
+		container_of(flow, struct flow_offload_entry, flow);
+	struct rhashtable *ht = &flow_table->rhashtable;
+
+	rhashtable_remove_fast(ht,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+			       *flow_table->type->params);
+	rhashtable_remove_fast(ht,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+			       *flow_table->type->params);
+
+	kfree_rcu(entry, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_del);
+
+/*
+ * Look @tuple up in @flow_table. Returns the matching hashed tuple (of
+ * either direction; see its tuple.dir) or NULL if there is none.
+ */
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct nf_flowtable *flow_table,
+		    struct flow_offload_tuple *tuple)
+{
+	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+				      *flow_table->type->params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+/* Delete the conntrack entry behind @flow and drop the flow's reference. */
+static void nf_flow_release_ct(const struct flow_offload *flow)
+{
+	struct nf_conn *ct;
+
+	ct = container_of(flow, struct flow_offload_entry, flow)->ct;
+
+	nf_ct_delete(ct, 0, 0);
+	nf_ct_put(ct);
+}
+
+/*
+ * Call @iter(flow, @data) once for every flow in @flow_table.
+ *
+ * Each flow is hashed twice (once per direction); only the original
+ * direction entry triggers the callback.
+ *
+ * Returns 0 after a complete walk, or the negative error that made
+ * rhashtable_walk_init()/rhashtable_walk_next() fail. A transient -EAGAIN
+ * from the walker is not treated as a failure.
+ */
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+			  void (*iter)(struct flow_offload *flow, void *data),
+			  void *data)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct rhashtable_iter hti;
+	struct flow_offload *flow;
+	int err;
+
+	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+	if (err)
+		return err;
+
+	rhashtable_walk_start(&hti);
+
+	while ((tuplehash = rhashtable_walk_next(&hti))) {
+		if (IS_ERR(tuplehash)) {
+			/*
+			 * -EAGAIN only asks us to keep walking; it must not
+			 * leak into the return value of a completed walk.
+			 */
+			if (PTR_ERR(tuplehash) == -EAGAIN)
+				continue;
+
+			err = PTR_ERR(tuplehash);
+			break;
+		}
+		if (tuplehash->tuple.dir)
+			continue;
+
+		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+		iter(flow, data);
+	}
+
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
+
+/*
+ * True once the flow's timeout stamp is not in the future any more. The
+ * difference is evaluated as a signed 32-bit value.
+ */
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+	__s32 remaining = (__s32)(flow->timeout - (u32)jiffies);
+
+	return remaining <= 0;
+}
+
+/* True if flow_offload_dead() has flagged @flow for removal. */
+static inline bool nf_flow_is_dying(const struct flow_offload *flow)
+{
+	return (flow->flags & FLOW_OFFLOAD_DYING) != 0;
+}
+
+/*
+ * Garbage collector work item of a flow table.
+ *
+ * Walks the hash table and, for every flow that has expired or is marked
+ * dying, removes it from the table and deletes/releases its conntrack entry.
+ * The work re-queues itself to run again after HZ jiffies, whether or not
+ * the walk could be started or completed.
+ */
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table;
+	struct rhashtable_iter hti;
+	struct flow_offload *flow;
+	int err;
+
+	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+
+	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+	if (err)
+		goto schedule;
+
+	rhashtable_walk_start(&hti);
+
+	while ((tuplehash = rhashtable_walk_next(&hti))) {
+		if (IS_ERR(tuplehash)) {
+			err = PTR_ERR(tuplehash);
+			/* -EAGAIN: keep walking; anything else aborts this run. */
+			if (err != -EAGAIN)
+				goto out;
+
+			continue;
+		}
+		/* Each flow is hashed per direction; handle it only once. */
+		if (tuplehash->tuple.dir)
+			continue;
+
+		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+		if (nf_flow_has_expired(flow) ||
+		    nf_flow_is_dying(flow)) {
+			/*
+			 * NOTE(review): flow_offload_del() queues the entry
+			 * for kfree_rcu() and nf_flow_release_ct() reads it
+			 * afterwards; presumably the walk's RCU read-side
+			 * section keeps it alive -- confirm.
+			 */
+			flow_offload_del(flow_table, flow);
+			nf_flow_release_ct(flow);
+		}
+	}
+out:
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+schedule:
+	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
+
+/*
+ * rhashtable hashfn: hash a lookup key (struct flow_offload_tuple) over all
+ * bytes that precede its dir member. @len is ignored.
+ */
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+	const u32 key_len = offsetof(struct flow_offload_tuple, dir);
+
+	return jhash(data, key_len, seed);
+}
+
+/*
+ * rhashtable obj_hashfn: hash a stored object (struct
+ * flow_offload_tuple_rhash) over the same key bytes as flow_offload_hash().
+ * @len is ignored.
+ */
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple_rhash *obj = data;
+	const u32 key_len = offsetof(struct flow_offload_tuple, dir);
+
+	return jhash(&obj->tuple, key_len, seed);
+}
+
+/*
+ * rhashtable obj_cmpfn: compare the key bytes (everything before dir) of the
+ * lookup key with those of a stored object. Returns 0 on match, 1 otherwise.
+ */
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+					const void *ptr)
+{
+	const struct flow_offload_tuple_rhash *obj = ptr;
+	const struct flow_offload_tuple *key = arg->key;
+
+	return memcmp(&obj->tuple, key,
+		      offsetof(struct flow_offload_tuple, dir)) != 0;
+}
+
+/*
+ * rhashtable parameters for tables of struct flow_offload_tuple_rhash, keyed
+ * by the leading bytes of the embedded tuple (see the helpers above).
+ */
+const struct rhashtable_params nf_flow_offload_rhash_params = {
+	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
+	.hashfn			= flow_offload_hash,
+	.obj_hashfn		= flow_offload_hash_obj,
+	.obj_cmpfn		= flow_offload_hash_cmp,
+	.automatic_shrinking	= true,
+};
+EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
+
+/*
+ * Adjust the TCP checksum at @thoff for a port change from @port to
+ * @new_port. Returns 0 on success, -1 if the header cannot be pulled or made
+ * writable.
+ */
+static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+				__be16 port, __be16 new_port)
+{
+	unsigned int need = thoff + sizeof(struct tcphdr);
+	struct tcphdr *th;
+
+	if (!pskb_may_pull(skb, need))
+		return -1;
+	if (skb_try_make_writable(skb, need))
+		return -1;
+
+	th = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace2(&th->check, skb, port, new_port, true);
+
+	return 0;
+}
+
+/*
+ * Adjust the UDP checksum at @thoff for a port change from @port to
+ * @new_port. The checksum is left alone when it is zero and the skb is not
+ * CHECKSUM_PARTIAL. Returns 0 on success, -1 if the header cannot be pulled
+ * or made writable.
+ */
+static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+				__be16 port, __be16 new_port)
+{
+	unsigned int need = thoff + sizeof(struct udphdr);
+	struct udphdr *uh;
+
+	if (!pskb_may_pull(skb, need))
+		return -1;
+	if (skb_try_make_writable(skb, need))
+		return -1;
+
+	uh = (void *)(skb_network_header(skb) + thoff);
+	if (!uh->check && skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	inet_proto_csum_replace2(&uh->check, skb, port, new_port, true);
+	/* Never emit 0 as the result of an update. */
+	if (!uh->check)
+		uh->check = CSUM_MANGLED_0;
+
+	return 0;
+}
+
+/*
+ * Fix up the layer 4 checksum of @protocol (TCP or UDP) after a port rewrite
+ * from @port to @new_port. Other protocols are left untouched.
+ *
+ * Returns 0 on success and -1 on failure. The failure value must be
+ * negative: NF_DROP is 0 and would be indistinguishable from success for
+ * the callers, which hand this result straight back.
+ */
+static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+			    u8 protocol, __be16 port, __be16 new_port)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
+			return -1;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
+			return -1;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply source NAT to the ports of the packet at @thoff.
+ *
+ * In the original direction the source port is replaced by the reply
+ * tuple's destination port; in the reply direction the destination port is
+ * replaced by the original tuple's source port. The layer 4 checksum is then
+ * fixed up via nf_flow_nat_port().
+ *
+ * Returns -1 if the ports cannot be pulled or made writable or @dir is
+ * invalid, otherwise the result of nf_flow_nat_port().
+ */
+int nf_flow_snat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir)
+{
+	unsigned int need = thoff + sizeof(struct flow_ports);
+	__be16 old_port, nat_port;
+	struct flow_ports *ports;
+
+	if (!pskb_may_pull(skb, need))
+		return -1;
+	if (skb_try_make_writable(skb, need))
+		return -1;
+
+	ports = (void *)(skb_network_header(skb) + thoff);
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		old_port = ports->source;
+		nat_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+		ports->source = nat_port;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		old_port = ports->dest;
+		nat_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+		ports->dest = nat_port;
+	} else {
+		return -1;
+	}
+
+	return nf_flow_nat_port(skb, thoff, protocol, old_port, nat_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_snat_port);
+
+/*
+ * Apply destination NAT to the ports of the packet at @thoff.
+ *
+ * In the original direction the destination port is replaced by the reply
+ * tuple's source port; in the reply direction the source port is replaced by
+ * the original tuple's destination port. The layer 4 checksum is then fixed
+ * up via nf_flow_nat_port().
+ *
+ * Returns -1 if the ports cannot be pulled or made writable or @dir is
+ * invalid, otherwise the result of nf_flow_nat_port().
+ */
+int nf_flow_dnat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir)
+{
+	unsigned int need = thoff + sizeof(struct flow_ports);
+	__be16 old_port, nat_port;
+	struct flow_ports *ports;
+
+	if (!pskb_may_pull(skb, need))
+		return -1;
+	if (skb_try_make_writable(skb, need))
+		return -1;
+
+	ports = (void *)(skb_network_header(skb) + thoff);
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		old_port = ports->dest;
+		nat_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
+		ports->dest = nat_port;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		old_port = ports->source;
+		nat_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
+		ports->source = nat_port;
+	} else {
+		return -1;
+	}
+
+	return nf_flow_nat_port(skb, thoff, protocol, old_port, nat_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-- 
cgit v1.2.3


From 97add9f0d66da9898da325f84e80533db9cc0ced Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:15 +0100
Subject: netfilter: flow table support for IPv4

This patch adds the IPv4 flow table type, that implements the datapath
flow table to forward IPv4 traffic. Rationale is:

1) Look up the packet in the flow table from the ingress hook.
2) If there's a hit, decrement ttl and pass it on to the neighbour layer
   for transmission.
3) If there's a miss, packet is passed up to the classic forwarding
   path.

This patch also supports layer 3 source and destination NAT.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig              |   8 +
 net/ipv4/netfilter/Makefile             |   3 +
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 283 ++++++++++++++++++++++++++++++++
 3 files changed, 294 insertions(+)
 create mode 100644 net/ipv4/netfilter/nf_flow_table_ipv4.c

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index cee51045e2f7..7d5d444964aa 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -78,6 +78,14 @@ config NF_TABLES_ARP
 
 endif # NF_TABLES
 
+config NF_FLOW_TABLE_IPV4
+	select NF_FLOW_TABLE
+	tristate "Netfilter flow table IPv4 module"
+	help
+	  This option adds the flow table IPv4 support.
+
+	  To compile it as a module, choose M here.
+
 config NF_DUP_IPV4
 	tristate "Netfilter IPv4 packet duplication to alternate destination"
 	depends on !NF_CONNTRACK || NF_CONNTRACK
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index adcdae358365..8bb1f0c7a375 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -43,6 +43,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
 obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
 obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
+# flow table support
+obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
+
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
new file mode 100644
index 000000000000..ac56c0f0492a
--- /dev/null
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -0,0 +1,283 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace4(&udph->check, skb, addr,
+					 new_addr, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+/*
+ * Fix up the TCP/UDP checksum after an IPv4 address rewrite. Errors must be
+ * negative: callers test "< 0", and NF_DROP is 0, which would read as success.
+ */
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+				  unsigned int thoff, __be32 addr,
+				  __be32 new_addr)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		return nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
+	case IPPROTO_UDP:
+		return nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+/* Rewrite the DNAT address for @dir and fix the IPv4 header/L4 checksums. */
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+	} else {
+		return -1;
+	}
+	/* As in nf_flow_snat_ip(): keep the IPv4 header checksum valid. */
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+/* Apply SNAT/DNAT; ip_hdr() is re-read as the helpers may move skb data. */
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			  enum flow_offload_tuple_dir dir)
+{
+	unsigned int thoff = ip_hdr(skb)->ihl * 4;
+	u8 proto = ip_hdr(skb)->protocol;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, proto, dir) < 0 ||
+	     nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, proto, dir) < 0 ||
+	     nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+		return -1;
+	return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+	return thoff != sizeof(struct iphdr);
+}
+
+/* Fill @tuple from a TCP/UDP IPv4 packet; -1 if the packet does not qualify. */
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+			    struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return -1;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff)))
+		return -1;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)
+		return -1;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	/* pskb_may_pull() may have reallocated the header; reload pointers. */
+	iph = ip_hdr(skb);
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v4.s_addr	= iph->saddr;
+	tuple->dst_v4.s_addr	= iph->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET;
+	tuple->l4proto		= iph->protocol;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+		return false;
+
+	return true;
+}
+
+static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
+{
+	u32 mtu;
+
+	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+	if (__nf_flow_exceeds_mtu(skb, mtu))
+		return true;
+
+	return false;
+}
+
+static unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+			const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	const struct rtable *rt;
+	struct iphdr *iph;
+	__be32 nexthop;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*iph)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ip(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	iph = ip_hdr(skb);
+	ip_decrease_ttl(iph);
+
+	skb->dev = outdev;
+	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+	return NF_STOLEN;
+}
+
+static struct nf_flowtable_type flowtable_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.params		= &nf_flow_offload_rhash_params,
+	.gc		= nf_flow_offload_work_gc,
+	.hook		= nf_flow_offload_ip_hook,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nf_flow_ipv4_module_init(void)
+{
+	nft_register_flowtable_type(&flowtable_ipv4);
+
+	return 0;
+}
+
+static void __exit nf_flow_ipv4_module_exit(void)
+{
+	nft_unregister_flowtable_type(&flowtable_ipv4);
+}
+
+module_init(nf_flow_ipv4_module_init);
+module_exit(nf_flow_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
-- 
cgit v1.2.3


From 0995210753a26c4fa1a3d8c63cc230e22a8537cd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:19 +0100
Subject: netfilter: flow table support for IPv6

This patch adds the IPv6 flow table type, that implements the datapath
flow table to forward IPv6 traffic.

This patch exports ip6_dst_mtu_forward() that is required to check for
mtu to pass up packets that need PMTUD handling to the classic
forwarding path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ipv6.h                      |   2 +
 net/ipv6/ip6_output.c                   |   3 +-
 net/ipv6/netfilter/Kconfig              |   8 +
 net/ipv6/netfilter/Makefile             |   3 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 277 ++++++++++++++++++++++++++++++++
 5 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv6/netfilter/nf_flow_table_ipv6.c

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 25be4715578c..9dc1230d789c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -969,6 +969,8 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
 			      &inet6_sk(sk)->cork);
 }
 
+unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst);
+
 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
 		   struct flowi6 *fl6);
 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index bcdb615aed6e..19adad6d90bc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -378,7 +378,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 	return dst_output(net, sk, skb);
 }
 
-static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 {
 	unsigned int mtu;
 	struct inet6_dev *idev;
@@ -398,6 +398,7 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 
 	return mtu;
 }
+EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
 
 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 {
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 6acb2eecd986..806e95375ec8 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -71,6 +71,14 @@ config NFT_FIB_IPV6
 endif # NF_TABLES_IPV6
 endif # NF_TABLES
 
+config NF_FLOW_TABLE_IPV6
+	select NF_FLOW_TABLE
+	tristate "Netfilter flow table IPv6 module"
+	help
+	  This option adds the flow table IPv6 support.
+
+	  To compile it as a module, choose M here.
+
 config NF_DUP_IPV6
 	tristate "Netfilter IPv6 packet duplication to alternate destination"
 	depends on !NF_CONNTRACK || NF_CONNTRACK
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c6ee0cdd0ba9..95611c4b39b0 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -45,6 +45,9 @@ obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
 obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
 obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
 
+# flow table support
+obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o
+
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
new file mode 100644
index 000000000000..d7d073bb19ee
--- /dev/null
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -0,0 +1,277 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+				  new_addr->s6_addr32, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+					  new_addr->s6_addr32, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+/*
+ * Fix up the TCP/UDP checksum after an IPv6 address rewrite. Errors must be
+ * negative: callers test "< 0", and NF_DROP is 0, which would read as success.
+ */
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+				    unsigned int thoff, struct in6_addr *addr,
+				    struct in6_addr *new_addr)
+{
+	switch (ip6h->nexthdr) {
+	case IPPROTO_TCP:
+		return nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr);
+	case IPPROTO_UDP:
+		return nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+/* Apply SNAT/DNAT; ipv6_hdr() is re-read as the helpers may move skb data. */
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+			    struct sk_buff *skb,
+			    enum flow_offload_tuple_dir dir)
+{
+	unsigned int thoff = sizeof(struct ipv6hdr);
+	u8 proto = ipv6_hdr(skb)->nexthdr;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, proto, dir) < 0 ||
+	     nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, proto, dir) < 0 ||
+	     nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+		return -1;
+	return 0;
+}
+
+/* Fill @tuple from a TCP/UDP IPv6 packet; -1 if the packet does not qualify. */
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+			      struct flow_offload_tuple *tuple)
+{
+	unsigned int thoff = sizeof(struct ipv6hdr);
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return -1;
+
+	ip6h = ipv6_hdr(skb);
+	if (ip6h->nexthdr != IPPROTO_TCP && ip6h->nexthdr != IPPROTO_UDP)
+		return -1;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	/* pskb_may_pull() may have reallocated the header; reload pointers. */
+	ip6h = ipv6_hdr(skb);
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v6		= ip6h->saddr;
+	tuple->dst_v6		= ip6h->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET6;
+	tuple->l4proto		= ip6h->nexthdr;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+		return false;
+
+	return true;
+}
+
+static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
+{
+	u32 mtu;
+
+	mtu = ip6_dst_mtu_forward(&rt->dst);
+	if (__nf_flow_exceeds_mtu(skb, mtu))
+		return true;
+
+	return false;
+}
+
+static unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+			  const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	struct in6_addr *nexthop;
+	struct ipv6hdr *ip6h;
+	struct rt6_info *rt;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*ip6h)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ipv6(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	ip6h = ipv6_hdr(skb);
+	ip6h->hop_limit--;
+
+	skb->dev = outdev;
+	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+	return NF_STOLEN;
+}
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.params		= &nf_flow_offload_rhash_params,
+	.gc		= nf_flow_offload_work_gc,
+	.hook		= nf_flow_offload_ipv6_hook,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nf_flow_ipv6_module_init(void)
+{
+	nft_register_flowtable_type(&flowtable_ipv6);
+
+	return 0;
+}
+
+static void __exit nf_flow_ipv6_module_exit(void)
+{
+	nft_unregister_flowtable_type(&flowtable_ipv6);
+}
+
+module_init(nf_flow_ipv6_module_init);
+module_exit(nf_flow_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
-- 
cgit v1.2.3


From 7c23b629a8085b11daccd68c62b5116ff498f84a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:22 +0100
Subject: netfilter: flow table support for the mixed IPv4/IPv6 family

This patch adds the mixed IPv4/IPv6 (inet) flow table type, that implements
the datapath flow table to forward both IPv4 and IPv6 traffic.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  5 ++++
 net/ipv4/netfilter/nf_flow_table_ipv4.c |  3 ++-
 net/ipv6/netfilter/nf_flow_table_ipv6.c |  3 ++-
 net/netfilter/Kconfig                   |  8 ++++++
 net/netfilter/Makefile                  |  1 +
 net/netfilter/nf_flow_table_inet.c      | 48 +++++++++++++++++++++++++++++++++
 6 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 net/netfilter/nf_flow_table_inet.c

(limited to 'net')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 161f71ca78a0..b22b22082733 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -111,6 +111,11 @@ struct flow_ports {
 	__be16 source, dest;
 };
 
+unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+				     const struct nf_hook_state *state);
+unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+				       const struct nf_hook_state *state);
+
 #define MODULE_ALIAS_NF_FLOWTABLE(family)	\
 	MODULE_ALIAS("nf-flowtable-" __stringify(family))
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index ac56c0f0492a..b2d01eb25f2c 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -202,7 +202,7 @@ static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
 	return false;
 }
 
-static unsigned int
+unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
 {
@@ -254,6 +254,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	return NF_STOLEN;
 }
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index d7d073bb19ee..0c3b9d32f64f 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -196,7 +196,7 @@ static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
 	return false;
 }
 
-static unsigned int
+unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 			  const struct nf_hook_state *state)
 {
@@ -248,6 +248,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	return NF_STOLEN;
 }
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 264ce877ef49..272803079bf2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -657,6 +657,14 @@ endif # NF_TABLES_NETDEV
 
 endif # NF_TABLES
 
+config NF_FLOW_TABLE_INET
+	select NF_FLOW_TABLE
+	tristate "Netfilter flow table mixed IPv4/IPv6 module"
+	help
+	  This option adds the flow table mixed IPv4/IPv6 support.
+
+	  To compile it as a module, choose M here.
+
 config NF_FLOW_TABLE
 	tristate "Netfilter flow table module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 2930f2b854be..061365875cde 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
+obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
new file mode 100644
index 000000000000..281209aeba8f
--- /dev/null
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -0,0 +1,48 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int
+nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
+			  const struct nf_hook_state *state)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return nf_flow_offload_ip_hook(priv, skb, state);
+	case htons(ETH_P_IPV6):
+		return nf_flow_offload_ipv6_hook(priv, skb, state);
+	}
+
+	return NF_ACCEPT;
+}
+
+static struct nf_flowtable_type flowtable_inet = {
+	.family		= NFPROTO_INET,
+	.params		= &nf_flow_offload_rhash_params,
+	.gc		= nf_flow_offload_work_gc,
+	.hook		= nf_flow_offload_inet_hook,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nf_flow_inet_module_init(void)
+{
+	nft_register_flowtable_type(&flowtable_inet);
+
+	return 0;
+}
+
+static void __exit nf_flow_inet_module_exit(void)
+{
+	nft_unregister_flowtable_type(&flowtable_inet);
+}
+
+module_init(nf_flow_inet_module_init);
+module_exit(nf_flow_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */
-- 
cgit v1.2.3


From a3c90f7a2323b331ae816d5b0633e68148e25d04 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:26 +0100
Subject: netfilter: nf_tables: flow offload expression

Add new instruction for the nf_tables VM that allows us to specify what
flows are offloaded into a given flow table via name. This new
instruction creates the flow entry and adds it to the flow table.

Only established flows, ie. we have seen traffic in both directions, are
added to the flow table. You can still decide to offload entries at a
later stage via packet counting or checking the ct status in case you
want to offload assured conntracks.

This new extension depends on the conntrack subsystem.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  11 ++
 net/netfilter/Kconfig                    |   7 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_flow_offload.c         | 264 +++++++++++++++++++++++++++++++
 4 files changed, 283 insertions(+)
 create mode 100644 net/netfilter/nft_flow_offload.c

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 591b53bce070..53e8dd2a3a03 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -957,6 +957,17 @@ enum nft_ct_attributes {
 };
 #define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
 
+/**
+ * enum nft_offload_attributes - ct offload expression attributes
+ * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING)
+ */
+enum nft_offload_attributes {
+	NFTA_FLOW_UNSPEC,
+	NFTA_FLOW_TABLE_NAME,
+	__NFTA_FLOW_MAX,
+};
+#define NFTA_FLOW_MAX		(__NFTA_FLOW_MAX - 1)
+
 enum nft_limit_type {
 	NFT_LIMIT_PKTS,
 	NFT_LIMIT_PKT_BYTES
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 272803079bf2..0ee0fcf3abbf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -505,6 +505,13 @@ config NFT_CT
 	  This option adds the "ct" expression that you can use to match
 	  connection tracking information such as the flow state.
 
+config NFT_FLOW_OFFLOAD
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables hardware flow offload module"
+	help
+	  This option adds the "flow_offload" expression that you can use to
+	  choose what flows are placed into the hardware.
+
 config NFT_SET_RBTREE
 	tristate "Netfilter nf_tables rbtree set module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 061365875cde..5d9b8b959e58 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_RT)		+= nft_rt.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
+obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 obj-$(CONFIG_NFT_OBJREF)	+= nft_objref.o
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
new file mode 100644
index 000000000000..dd38785dfed9
--- /dev/null
+++ b/net/netfilter/nft_flow_offload.c
@@ -0,0 +1,264 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h> /* for ipv4 options. */
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct nft_flow_offload {
+	struct nft_flowtable	*flowtable;
+};
+
+static int nft_flow_route(const struct nft_pktinfo *pkt,
+			  const struct nf_conn *ct,
+			  struct nf_flow_route *route,
+			  enum ip_conntrack_dir dir)
+{
+	struct dst_entry *this_dst = skb_dst(pkt->skb);
+	struct dst_entry *other_dst = NULL;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
+		break;
+	case NFPROTO_IPV6:
+		fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
+		break;
+	}
+
+	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+	if (!other_dst)
+		return -ENOENT;
+
+	route->tuple[dir].dst		= this_dst;
+	route->tuple[dir].ifindex	= nft_in(pkt)->ifindex;
+	route->tuple[!dir].dst		= other_dst;
+	route->tuple[!dir].ifindex	= nft_out(pkt)->ifindex;
+
+	return 0;
+}
+
+/* Return true if this packet must not create a flow table entry. */
+static bool nft_flow_offload_skip(struct sk_buff *skb)
+{
+	if (skb_sec_path(skb))
+		return true;
+
+	/* IPCB() is IPv4 control data in skb->cb; not valid for IPv6. */
+	if (skb->protocol != htons(ETH_P_IP))
+		return false;
+	return unlikely(IPCB(skb)->opt.optlen != 0);
+}
+
+static void nft_flow_offload_eval(const struct nft_expr *expr,
+				  struct nft_regs *regs,
+				  const struct nft_pktinfo *pkt)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+	struct nf_flowtable *flowtable = &priv->flowtable->data;
+	enum ip_conntrack_info ctinfo;
+	struct nf_flow_route route;
+	struct flow_offload *flow;
+	enum ip_conntrack_dir dir;
+	struct nf_conn *ct;
+	int ret;
+
+	if (nft_flow_offload_skip(pkt->skb))
+		goto out;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (!ct)
+		goto out;
+
+	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		break;
+	default:
+		goto out;
+	}
+
+	if (test_bit(IPS_HELPER_BIT, &ct->status))
+		goto out;
+
+	if (ctinfo == IP_CT_NEW ||
+	    ctinfo == IP_CT_RELATED)
+		goto out;
+
+	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+		goto out;
+
+	dir = CTINFO2DIR(ctinfo);
+	if (nft_flow_route(pkt, ct, &route, dir) < 0)
+		goto err_flow_route;
+
+	flow = flow_offload_alloc(ct, &route);
+	if (!flow)
+		goto err_flow_alloc;
+
+	ret = flow_offload_add(flowtable, flow);
+	if (ret < 0)
+		goto err_flow_add;
+
+	return;
+
+err_flow_add:
+	flow_offload_free(flow);
+err_flow_alloc:
+	dst_release(route.tuple[!dir].dst);
+err_flow_route:
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+out:
+	regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr,
+				     const struct nft_data **data)
+{
+	unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
+	return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
+static int nft_flow_offload_init(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nlattr * const tb[])
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_flowtable *flowtable;
+
+	if (!tb[NFTA_FLOW_TABLE_NAME])
+		return -EINVAL;
+
+	flowtable = nf_tables_flowtable_lookup(ctx->table,
+					       tb[NFTA_FLOW_TABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable))
+		return PTR_ERR(flowtable);
+
+	priv->flowtable = flowtable;
+	flowtable->use++;
+
+	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	priv->flowtable->use--;
+	nf_ct_netns_put(ctx->net, ctx->afi->family);
+}
+
+static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_flow_offload_type;
+static const struct nft_expr_ops nft_flow_offload_ops = {
+	.type		= &nft_flow_offload_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
+	.eval		= nft_flow_offload_eval,
+	.init		= nft_flow_offload_init,
+	.destroy	= nft_flow_offload_destroy,
+	.validate	= nft_flow_offload_validate,
+	.dump		= nft_flow_offload_dump,
+};
+
+static struct nft_expr_type nft_flow_offload_type __read_mostly = {
+	.name		= "flow_offload",
+	.ops		= &nft_flow_offload_ops,
+	.maxattr	= NFTA_FLOW_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
+{
+	struct net_device *dev = data;
+
+	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+		return;
+
+	flow_offload_dead(flow);
+}
+
+static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
+					     void *data)
+{
+	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+}
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+				     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+	.notifier_call	= flow_offload_netdev_event,
+};
+
+/* Register the netdevice notifier, then the "flow_offload" expression. */
+static int __init nft_flow_offload_module_init(void)
+{
+	int err;
+
+	err = register_netdevice_notifier(&flow_offload_netdev_notifier);
+	if (err < 0)
+		return err;
+
+	/* If the expression cannot be registered, roll the notifier back. */
+	err = nft_register_expr(&nft_flow_offload_type);
+	if (err < 0)
+		unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+
+	return err;
+}
+
+static void __exit nft_flow_offload_module_exit(void)
+{
+	struct net *net;
+
+	nft_unregister_expr(&nft_flow_offload_type);
+	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+	rtnl_lock();
+	for_each_net(net)
+		nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
+	rtnl_unlock();
+}
+
+module_init(nft_flow_offload_module_init);
+module_exit(nft_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("flow_offload");
-- 
cgit v1.2.3


From c0453377518d8dd30dcc07b5bdd9e29512ca9aca Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 30 Oct 2017 17:19:25 -0500
Subject: netfilter: ipset: use swap macro instead of _manually_ swapping
 values

Make use of the swap macro and remove unnecessary variables tmp.
This makes the code easier to read and maintain.

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_bitmap_ip.c    | 8 ++------
 net/netfilter/ipset/ip_set_bitmap_ipmac.c | 8 ++------
 net/netfilter/ipset/ip_set_bitmap_port.c  | 8 ++------
 3 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index d8975a0b4282..488d6d05c65c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -263,12 +263,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
 		if (ret)
 			return ret;
-		if (first_ip > last_ip) {
-			u32 tmp = first_ip;
-
-			first_ip = last_ip;
-			last_ip = tmp;
-		}
+		if (first_ip > last_ip)
+			swap(first_ip, last_ip);
 	} else if (tb[IPSET_ATTR_CIDR]) {
 		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 4c279fbd2d5d..c00b6a2e8e3c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -337,12 +337,8 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
 		if (ret)
 			return ret;
-		if (first_ip > last_ip) {
-			u32 tmp = first_ip;
-
-			first_ip = last_ip;
-			last_ip = tmp;
-		}
+		if (first_ip > last_ip)
+			swap(first_ip, last_ip);
 	} else if (tb[IPSET_ATTR_CIDR]) {
 		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 7f9bbd7c98b5..b561ca8b3659 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -238,12 +238,8 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 
 	first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
 	last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
-	if (first_port > last_port) {
-		u16 tmp = first_port;
-
-		first_port = last_port;
-		last_port = tmp;
-	}
+	if (first_port > last_port)
+		swap(first_port, last_port);
 
 	elements = last_port - first_port + 1;
 	set->dsize = ip_set_elem_len(set, tb, 0, 0);
-- 
cgit v1.2.3


From 4750005a85f76b3df1e5df19c283dde96b071515 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 6 Jan 2018 15:22:01 +0100
Subject: netfilter: ipset: Fix "don't update counters" mode when counters used
 at the matching

The matching of the counters was not taken into account, fixed.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h         |   6 ++
 include/linux/netfilter/ipset/ip_set_counter.h |  25 ++++--
 net/netfilter/ipset/ip_set_bitmap_gen.h        |   9 +-
 net/netfilter/ipset/ip_set_core.c              |  25 ++++++
 net/netfilter/ipset/ip_set_hash_gen.h          |  37 +++-----
 net/netfilter/ipset/ip_set_list_set.c          |  21 ++---
 net/netfilter/xt_set.c                         | 119 +++++++++----------------
 7 files changed, 114 insertions(+), 128 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 8e42253e5d4d..34fc80f3eb90 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -122,6 +122,8 @@ struct ip_set_ext {
 	u64 bytes;
 	char *comment;
 	u32 timeout;
+	u8 packets_op;
+	u8 bytes_op;
 };
 
 struct ip_set;
@@ -339,6 +341,10 @@ extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 				 struct ip_set_ext *ext);
 extern int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 				 const void *e, bool active);
+extern bool ip_set_match_extensions(struct ip_set *set,
+				    const struct ip_set_ext *ext,
+				    struct ip_set_ext *mext,
+				    u32 flags, void *data);
 
 static inline int
 ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr)
diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h
index bb6fba480118..3d33a2c3f39f 100644
--- a/include/linux/netfilter/ipset/ip_set_counter.h
+++ b/include/linux/netfilter/ipset/ip_set_counter.h
@@ -34,20 +34,33 @@ ip_set_get_packets(const struct ip_set_counter *counter)
 	return (u64)atomic64_read(&(counter)->packets);
 }
 
+/* Test @counter against @match with IPSET_COUNTER_* operator @op:
+ * IPSET_COUNTER_NONE always matches, an unhandled @op never does. */
+static inline bool
+ip_set_match_counter(u64 counter, u64 match, u8 op)
+{
+	if (op == IPSET_COUNTER_NONE)
+		return true;
+	if (op == IPSET_COUNTER_EQ)
+		return counter == match;
+	if (op == IPSET_COUNTER_NE)
+		return counter != match;
+	if (op == IPSET_COUNTER_LT)
+		return counter < match;
+	if (op == IPSET_COUNTER_GT)
+		return counter > match;
+	return false;
+}
+
 static inline void
 ip_set_update_counter(struct ip_set_counter *counter,
-		      const struct ip_set_ext *ext,
-		      struct ip_set_ext *mext, u32 flags)
+		      const struct ip_set_ext *ext, u32 flags)
 {
 	if (ext->packets != ULLONG_MAX &&
 	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
 		ip_set_add_bytes(ext->bytes, counter);
 		ip_set_add_packets(ext->packets, counter);
 	}
-	if (flags & IPSET_FLAG_MATCH_COUNTERS) {
-		mext->packets = ip_set_get_packets(counter);
-		mext->bytes = ip_set_get_bytes(counter);
-	}
 }
 
 static inline bool
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 8afe882f846d..257ca393e6f2 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -127,14 +127,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 
 	if (ret <= 0)
 		return ret;
-	if (SET_WITH_TIMEOUT(set) &&
-	    ip_set_timeout_expired(ext_timeout(x, set)))
-		return 0;
-	if (SET_WITH_COUNTER(set))
-		ip_set_update_counter(ext_counter(x, set), ext, mext, flags);
-	if (SET_WITH_SKBINFO(set))
-		ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags);
-	return 1;
+	return ip_set_match_extensions(set, ext, mext, flags, x);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 89b44458a761..e00299051e79 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -472,6 +472,31 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 }
 EXPORT_SYMBOL_GPL(ip_set_put_extensions);
 
+/* False if @data is timed out or fails the counter match in @mext. */
+bool
+ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
+			struct ip_set_ext *mext, u32 flags, void *data)
+{
+	if (SET_WITH_TIMEOUT(set) &&
+	    ip_set_timeout_expired(ext_timeout(data, set)))
+		return false;
+	if (SET_WITH_COUNTER(set)) {
+		struct ip_set_counter *cnt = ext_counter(data, set);
+
+		if ((flags & IPSET_FLAG_MATCH_COUNTERS) &&
+		    (!ip_set_match_counter(ip_set_get_packets(cnt),
+					   mext->packets, mext->packets_op) ||
+		     !ip_set_match_counter(ip_set_get_bytes(cnt),
+					   mext->bytes, mext->bytes_op)))
+			return false;
+		ip_set_update_counter(cnt, ext, flags);
+	}
+	if (SET_WITH_SKBINFO(set))
+		ip_set_get_skbinfo(ext_skbinfo(data, set), ext, mext, flags);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ip_set_match_extensions);
+
 /* Creating/destroying/renaming/swapping affect the existence and
  * the properties of a set. All of these can be executed from userspace
  * only and serialized by the nfnl mutex indirectly from nfnetlink.
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 8ef079db7d34..bbad940c0137 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -917,12 +917,9 @@ static inline int
 mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
 		 struct ip_set_ext *mext, struct ip_set *set, u32 flags)
 {
-	if (SET_WITH_COUNTER(set))
-		ip_set_update_counter(ext_counter(data, set),
-				      ext, mext, flags);
-	if (SET_WITH_SKBINFO(set))
-		ip_set_get_skbinfo(ext_skbinfo(data, set),
-				   ext, mext, flags);
+	if (!ip_set_match_extensions(set, ext, mext, flags, data))
+		return 0;
+	/* nomatch entries return -ENOTEMPTY */
 	return mtype_do_data_match(data);
 }
 
@@ -941,9 +938,9 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 	struct mtype_elem *data;
 #if IPSET_NET_COUNT == 2
 	struct mtype_elem orig = *d;
-	int i, j = 0, k;
+	int ret, i, j = 0, k;
 #else
-	int i, j = 0;
+	int ret, i, j = 0;
 #endif
 	u32 key, multi = 0;
 
@@ -969,18 +966,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 			data = ahash_data(n, i, set->dsize);
 			if (!mtype_data_equal(data, d, &multi))
 				continue;
-			if (SET_WITH_TIMEOUT(set)) {
-				if (!ip_set_timeout_expired(
-						ext_timeout(data, set)))
-					return mtype_data_match(data, ext,
-								mext, set,
-								flags);
+			ret = mtype_data_match(data, ext, mext, set, flags);
+			if (ret != 0)
+				return ret;
 #ifdef IP_SET_HASH_WITH_MULTI
-				multi = 0;
+			/* No match, reset multiple match flag */
+			multi = 0;
 #endif
-			} else
-				return mtype_data_match(data, ext,
-							mext, set, flags);
 		}
 #if IPSET_NET_COUNT == 2
 		}
@@ -1027,12 +1019,11 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (!test_bit(i, n->used))
 			continue;
 		data = ahash_data(n, i, set->dsize);
-		if (mtype_data_equal(data, d, &multi) &&
-		    !(SET_WITH_TIMEOUT(set) &&
-		      ip_set_timeout_expired(ext_timeout(data, set)))) {
-			ret = mtype_data_match(data, ext, mext, set, flags);
+		if (!mtype_data_equal(data, d, &multi))
+			continue;
+		ret = mtype_data_match(data, ext, mext, set, flags);
+		if (ret != 0)
 			goto out;
-		}
 	}
 out:
 	return ret;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e864681b8dc5..072a658fde04 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -55,8 +55,9 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
 	       struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
 {
 	struct list_set *map = set->data;
+	struct ip_set_ext *mext = &opt->ext;
 	struct set_elem *e;
-	u32 cmdflags = opt->cmdflags;
+	u32 flags = opt->cmdflags;
 	int ret;
 
 	/* Don't lookup sub-counters at all */
@@ -64,21 +65,11 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
 	if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
 		opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
 	list_for_each_entry_rcu(e, &map->members, list) {
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(e, set)))
-			continue;
 		ret = ip_set_test(e->id, skb, par, opt);
-		if (ret > 0) {
-			if (SET_WITH_COUNTER(set))
-				ip_set_update_counter(ext_counter(e, set),
-						      ext, &opt->ext,
-						      cmdflags);
-			if (SET_WITH_SKBINFO(set))
-				ip_set_get_skbinfo(ext_skbinfo(e, set),
-						   ext, &opt->ext,
-						   cmdflags);
-			return ret;
-		}
+		if (ret <= 0)
+			continue;
+		if (ip_set_match_extensions(set, ext, mext, flags, e))
+			return 1;
 	}
 	return 0;
 }
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 64285702afd5..16b6b11ee83f 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -39,13 +39,17 @@ match_set(ip_set_id_t index, const struct sk_buff *skb,
 	return inv;
 }
 
-#define ADT_OPT(n, f, d, fs, cfs, t)	\
-struct ip_set_adt_opt n = {		\
-	.family	= f,			\
-	.dim = d,			\
-	.flags = fs,			\
-	.cmdflags = cfs,		\
-	.ext.timeout = t,		\
+#define ADT_OPT(n, f, d, fs, cfs, t, p, b, po, bo)	\
+struct ip_set_adt_opt n = {				\
+	.family	= f,					\
+	.dim = d,					\
+	.flags = fs,					\
+	.cmdflags = cfs,				\
+	.ext.timeout = t,				\
+	.ext.packets = p,				\
+	.ext.bytes = b,					\
+	.ext.packets_op = po,				\
+	.ext.bytes_op = bo,				\
 }
 
 /* Revision 0 interface: backward compatible with netfilter/iptables */
@@ -56,7 +60,8 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_set_info_match_v0 *info = par->matchinfo;
 
 	ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim,
-		info->match_set.u.compat.flags, 0, UINT_MAX);
+		info->match_set.u.compat.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	return match_set(info->match_set.index, skb, par, &opt,
 			 info->match_set.u.compat.flags & IPSET_INV_MATCH);
@@ -119,7 +124,8 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_set_info_match_v1 *info = par->matchinfo;
 
 	ADT_OPT(opt, xt_family(par), info->match_set.dim,
-		info->match_set.flags, 0, UINT_MAX);
+		info->match_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	if (opt.flags & IPSET_RETURN_NOMATCH)
 		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
@@ -160,46 +166,22 @@ set_match_v1_destroy(const struct xt_mtdtor_param *par)
 
 /* Revision 3 match */
 
-static bool
-match_counter0(u64 counter, const struct ip_set_counter_match0 *info)
-{
-	switch (info->op) {
-	case IPSET_COUNTER_NONE:
-		return true;
-	case IPSET_COUNTER_EQ:
-		return counter == info->value;
-	case IPSET_COUNTER_NE:
-		return counter != info->value;
-	case IPSET_COUNTER_LT:
-		return counter < info->value;
-	case IPSET_COUNTER_GT:
-		return counter > info->value;
-	}
-	return false;
-}
-
 static bool
 set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v3 *info = par->matchinfo;
-	int ret;
 
 	ADT_OPT(opt, xt_family(par), info->match_set.dim,
-		info->match_set.flags, info->flags, UINT_MAX);
+		info->match_set.flags, info->flags, UINT_MAX,
+		info->packets.value, info->bytes.value,
+		info->packets.op, info->bytes.op);
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
 	    info->bytes.op != IPSET_COUNTER_NONE)
 		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
 
-	ret = match_set(info->match_set.index, skb, par, &opt,
-			info->match_set.flags & IPSET_INV_MATCH);
-
-	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
-		return ret;
-
-	if (!match_counter0(opt.ext.packets, &info->packets))
-		return false;
-	return match_counter0(opt.ext.bytes, &info->bytes);
+	return match_set(info->match_set.index, skb, par, &opt,
+			 info->match_set.flags & IPSET_INV_MATCH);
 }
 
 #define set_match_v3_checkentry	set_match_v1_checkentry
@@ -207,46 +189,22 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
 
 /* Revision 4 match */
 
-static bool
-match_counter(u64 counter, const struct ip_set_counter_match *info)
-{
-	switch (info->op) {
-	case IPSET_COUNTER_NONE:
-		return true;
-	case IPSET_COUNTER_EQ:
-		return counter == info->value;
-	case IPSET_COUNTER_NE:
-		return counter != info->value;
-	case IPSET_COUNTER_LT:
-		return counter < info->value;
-	case IPSET_COUNTER_GT:
-		return counter > info->value;
-	}
-	return false;
-}
-
 static bool
 set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v4 *info = par->matchinfo;
-	int ret;
 
 	ADT_OPT(opt, xt_family(par), info->match_set.dim,
-		info->match_set.flags, info->flags, UINT_MAX);
+		info->match_set.flags, info->flags, UINT_MAX,
+		info->packets.value, info->bytes.value,
+		info->packets.op, info->bytes.op);
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
 	    info->bytes.op != IPSET_COUNTER_NONE)
 		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
 
-	ret = match_set(info->match_set.index, skb, par, &opt,
-			info->match_set.flags & IPSET_INV_MATCH);
-
-	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
-		return ret;
-
-	if (!match_counter(opt.ext.packets, &info->packets))
-		return false;
-	return match_counter(opt.ext.bytes, &info->bytes);
+	return match_set(info->match_set.index, skb, par, &opt,
+			 info->match_set.flags & IPSET_INV_MATCH);
 }
 
 #define set_match_v4_checkentry	set_match_v1_checkentry
@@ -260,9 +218,11 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_set_info_target_v0 *info = par->targinfo;
 
 	ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim,
-		info->add_set.u.compat.flags, 0, UINT_MAX);
+		info->add_set.u.compat.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 	ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim,
-		info->del_set.u.compat.flags, 0, UINT_MAX);
+		info->del_set.u.compat.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
@@ -333,9 +293,11 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_set_info_target_v1 *info = par->targinfo;
 
 	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
-		info->add_set.flags, 0, UINT_MAX);
+		info->add_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
-		info->del_set.flags, 0, UINT_MAX);
+		info->del_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
@@ -402,9 +364,11 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_set_info_target_v2 *info = par->targinfo;
 
 	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
-		info->add_set.flags, info->flags, info->timeout);
+		info->add_set.flags, info->flags, info->timeout,
+		0, 0, 0, 0);
 	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
-		info->del_set.flags, 0, UINT_MAX);
+		info->del_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
@@ -432,11 +396,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 	int ret;
 
 	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
-		info->add_set.flags, info->flags, info->timeout);
+		info->add_set.flags, info->flags, info->timeout,
+		0, 0, 0, 0);
 	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
-		info->del_set.flags, 0, UINT_MAX);
+		info->del_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 	ADT_OPT(map_opt, xt_family(par), info->map_set.dim,
-		info->map_set.flags, 0, UINT_MAX);
+		info->map_set.flags, 0, UINT_MAX,
+		0, 0, 0, 0);
 
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
-- 
cgit v1.2.3


From f998b6b10144cd9809da6af02758615f789e8aa1 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 6 Jan 2018 15:24:18 +0100
Subject: netfilter: ipset: Missing nfnl_lock()/nfnl_unlock() is added to
 ip_set_net_exit()

Patch "netfilter: ipset: use nfnl_mutex_is_locked" added the real
mutex locking check, which revealed the missing locking in ip_set_net_exit().

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Reported-by: syzbot+36b06f219f2439fe62e1@syzkaller.appspotmail.com
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index e00299051e79..728bf31bb386 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -2078,6 +2078,7 @@ ip_set_net_exit(struct net *net)
 
 	inst->is_deleted = true; /* flag for ip_set_nfnl_put */
 
+	nfnl_lock(NFNL_SUBSYS_IPSET);
 	for (i = 0; i < inst->ip_set_max; i++) {
 		set = ip_set(inst, i);
 		if (set) {
@@ -2085,6 +2086,7 @@ ip_set_net_exit(struct net *net)
 			ip_set_destroy_set(set);
 		}
 	}
+	nfnl_unlock(NFNL_SUBSYS_IPSET);
 	kfree(rcu_dereference_protected(inst->ip_set_list, 1));
 }
 
-- 
cgit v1.2.3


From 54dc3e3324829d346c959ff774626d9c6c9a65b5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 4 Jan 2018 14:03:54 -0800
Subject: net: ipv6: Allow connect to linklocal address from socket bound to
 vrf

Allow a process bound to a VRF to connect to a linklocal address.
Currently, this fails because of a mismatch between the scope of the
linklocal address and the sk_bound_dev_if inherited by the VRF binding:
    $ ssh -6 fe80::70b8:cff:fedd:ead8%eth1
    ssh: connect to host fe80::70b8:cff:fedd:ead8%eth1 port 22: Invalid argument

Relax the scope check to allow the socket to be bound to the same L3
device as the scope id.

This makes ipv6 linklocal consistent with other relaxed checks enabled
by commits 1ff23beebdd3 ("net: l3mdev: Allow send on enslaved interface")
and 7bb387c5ab12a ("net: Allow IP_MULTICAST_IF to set index to L3 slave").

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  | 20 ++++++++++++++++++++
 net/ipv6/datagram.c |  3 +--
 net/ipv6/tcp_ipv6.c |  3 +--
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 66fd3951e6f3..73b7830b0bb8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -72,6 +72,7 @@
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
 #include <net/smc.h>
+#include <net/l3mdev.h>
 
 /*
  * This structure really needs to be cleaned up.
@@ -2399,4 +2400,23 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val)
 	sk->sk_pacing_shift = val;
 }
 
+/* A socket that is not bound to a device accepts any device index.
+ * A bound socket accepts @dif if it is the bound device itself, or
+ * if the socket is bound to an L3 master device and @dif is enslaved
+ * to that same master.
+ */
+static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
+{
+	int master;
+
+	if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) {
+		/* bound to another device: dif's L3 master must be it */
+		master = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
+		return master && master == sk->sk_bound_dev_if;
+	}
+
+	/* unbound, or bound to dif itself */
+	return true;
+}
+
 #endif	/* _SOCK_H */
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a1f918713006..fbf08ce3f5ab 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -221,8 +221,7 @@ ipv4_connected:
 	if (__ipv6_addr_needs_scope_id(addr_type)) {
 		if (addr_len >= sizeof(struct sockaddr_in6) &&
 		    usin->sin6_scope_id) {
-			if (sk->sk_bound_dev_if &&
-			    sk->sk_bound_dev_if != usin->sin6_scope_id) {
+			if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) {
 				err = -EINVAL;
 				goto out;
 			}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aa12a26a96c6..c0f7e69f2e6c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -176,8 +176,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			/* If interface is set while binding, indices
 			 * must coincide.
 			 */
-			if (sk->sk_bound_dev_if &&
-			    sk->sk_bound_dev_if != usin->sin6_scope_id)
+			if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id))
 				return -EINVAL;
 
 			sk->sk_bound_dev_if = usin->sin6_scope_id;
-- 
cgit v1.2.3


From c8c9aeb519496f403563c715616dfc2a921a7eae Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 5 Jan 2018 00:38:05 +0100
Subject: tcp: Split BUG_ON() in tcp_tso_should_defer() into two assertions

The two conditions triggering BUG_ON() are somewhat unrelated:
the tcp_skb_pcount() check is meant to catch TSO flaws, the
second one checks sanity of congestion window bookkeeping.

Split them into two separate BUG_ON() assertions on two lines,
so that we know which one actually triggers, when they do.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 04be9f833927..95461f02ac9a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1944,7 +1944,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 
 	in_flight = tcp_packets_in_flight(tp);
 
-	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
+	BUG_ON(tcp_skb_pcount(skb) <= 1);
+	BUG_ON(tp->snd_cwnd <= in_flight);
 
 	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
 
-- 
cgit v1.2.3


From 23fe846f9a48d5375722b3bd060e0a02ad1ca7f1 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 5 Jan 2018 19:47:14 +0100
Subject: l2tp: adjust comments about L2TPv3 offsets

The "offset" option has been removed by
commit 900631ee6a26 ("l2tp: remove configurable payload offset").

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Acked-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 2 +-
 net/l2tp/l2tp_core.c      | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index f78eef4cc56a..71e62795104d 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -65,7 +65,7 @@ struct sockaddr_l2tpip6 {
  * TUNNEL_MODIFY	- CONN_ID, udpcsum
  * TUNNEL_GETSTATS	- CONN_ID, (stats)
  * TUNNEL_GET		- CONN_ID, (...)
- * SESSION_CREATE	- SESSION_ID, PW_TYPE, offset, data_seq, cookie, peer_cookie, offset, l2spec
+ * SESSION_CREATE	- SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, l2spec
  * SESSION_DELETE	- SESSION_ID
  * SESSION_MODIFY	- SESSION_ID, data_seq
  * SESSION_GET		- SESSION_ID, (...)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 786cd7f6a5e8..62285fc6eb59 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -662,10 +662,9 @@ discard:
  * |x|S|x|x|x|x|x|x|              Sequence Number                  |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
- * Cookie value, sublayer format and offset (pad) are negotiated with
- * the peer when the session is set up. Unlike L2TPv2, we do not need
- * to parse the packet header to determine if optional fields are
- * present.
+ * Cookie value and sublayer format are negotiated with the peer when
+ * the session is set up. Unlike L2TPv2, we do not need to parse the
+ * packet header to determine if optional fields are present.
  *
  * Caller must already have parsed the frame and determined that it is
  * a data (not control) frame before coming here. Fields up to the
-- 
cgit v1.2.3


From 373372b31b8f3bd4b6751d422c00f50832a62a89 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.s@alibaba-inc.com>
Date: Tue, 9 Jan 2018 03:52:52 +0800
Subject: net: caif: remove unused hardirq.h

Preempt counter APIs have been split out; currently, hardirq.h only
provides the irq_enter/exit APIs, which are not used by caif at all.

So, remove the unused hardirq.h.

Signed-off-by: Yang Shi <yang.s@alibaba-inc.com>
Cc: Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/cfpkt_skbuff.c | 1 -
 net/caif/chnl_net.c     | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 71b6ab240dea..38c2b7a890dd 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -8,7 +8,6 @@
 
 #include <linux/string.h>
 #include <linux/skbuff.h>
-#include <linux/hardirq.h>
 #include <linux/export.h>
 #include <net/caif/cfpkt.h>
 
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 922ac1d605b3..53ecda10b790 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -8,7 +8,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
 
 #include <linux/fs.h>
-#include <linux/hardirq.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
-- 
cgit v1.2.3


From 419091f1cc8afce943fd12af0df26201ee20c1c0 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.s@alibaba-inc.com>
Date: Tue, 9 Jan 2018 03:52:53 +0800
Subject: net: ovs: remove unused hardirq.h

Preempt counter APIs have been split out; currently, hardirq.h only
provides the irq_enter/exit APIs, which are not used by openvswitch at all.

So, remove the unused hardirq.h.

Signed-off-by: Yang Shi <yang.s@alibaba-inc.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: dev@openvswitch.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport-internal_dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 3e7747549f90..bb95c43aae76 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -16,7 +16,6 @@
  * 02110-1301, USA
  */
 
-#include <linux/hardirq.h>
 #include <linux/if_vlan.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
-- 
cgit v1.2.3


From f4803f1b73f877a571be4c8e531dfcf190acc691 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.s@alibaba-inc.com>
Date: Tue, 9 Jan 2018 03:52:54 +0800
Subject: net: tipc: remove unused hardirq.h

Preempt counter APIs have been split out; currently, hardirq.h only
provides the irq_enter/exit APIs, which are not used by TIPC at all.

So, remove the unused hardirq.h.

Signed-off-by: Yang Shi <yang.s@alibaba-inc.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Tested-by: Ying Xue <ying.xue@windriver.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/core.h b/net/tipc/core.h
index 964342689f2c..20b21af2ff14 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -49,7 +49,6 @@
 #include <linux/uaccess.h>
 #include <linux/interrupt.h>
 #include <linux/atomic.h>
-#include <asm/hardirq.h>
 #include <linux/netdevice.h>
 #include <linux/in.h>
 #include <linux/list.h>
-- 
cgit v1.2.3


From c5a9f6f0ab4054082dd5ce9bbdaa8e8ff05cf365 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 17 Jul 2017 13:47:07 +0300
Subject: net/core: Add drop counters to VF statistics

Modern hardware can decide to drop packets going to/from a VF.
Add receive and transmit drop counters to be displayed at hypervisor
layer in iproute2 per VF statistics.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/if_link.h      |  2 ++
 include/uapi/linux/if_link.h |  2 ++
 net/core/rtnetlink.c         | 10 +++++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 4c54611e03e9..622658dfbf0a 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -13,6 +13,8 @@ struct ifla_vf_stats {
 	__u64 tx_bytes;
 	__u64 broadcast;
 	__u64 multicast;
+	__u64 rx_dropped;
+	__u64 tx_dropped;
 };
 
 struct ifla_vf_info {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 19fc02660e0c..f8f04fed6186 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -732,6 +732,8 @@ enum {
 	IFLA_VF_STATS_BROADCAST,
 	IFLA_VF_STATS_MULTICAST,
 	IFLA_VF_STATS_PAD,
+	IFLA_VF_STATS_RX_DROPPED,
+	IFLA_VF_STATS_TX_DROPPED,
 	__IFLA_VF_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c688dc564b11..5421a3fd3ba1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size_64bit(sizeof(__u64)) +
 			 /* IFLA_VF_STATS_MULTICAST */
 			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_RX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
 			 nla_total_size(sizeof(struct ifla_vf_trust)));
 		return size;
 	} else
@@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
 			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+			      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
 		nla_nest_cancel(skb, vfstats);
 		goto nla_put_vf_failure;
 	}
-- 
cgit v1.2.3


From 709af180eed51042eeae6c232d109d4c18e88c8e Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sun, 7 Jan 2018 23:50:26 +0000
Subject: ipv6: use ARRAY_SIZE for array sizing calculation on array
 seg6_action_table

Use the ARRAY_SIZE macro on array seg6_action_table to determine size of
the array. Improvement suggested by coccinelle.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 825b8e01f947..ba3767ef5e93 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -501,7 +501,7 @@ static struct seg6_action_desc *__get_action_desc(int action)
 	struct seg6_action_desc *desc;
 	int i, count;
 
-	count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc);
+	count = ARRAY_SIZE(seg6_action_table);
 	for (i = 0; i < count; i++) {
 		desc = &seg6_action_table[i];
 		if (desc->action == action)
-- 
cgit v1.2.3


From 37e2d99b59c4765112533a1d38174fea58d28a51 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 8 Jan 2018 16:00:24 +0200
Subject: ethtool: Ensure new ring parameters are within bounds during
 SRINGPARAM

Add a sanity check to ensure that all requested ring parameters
are within bounds, which should reduce errors in driver implementation.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fff6314f4c5e..107b122c8969 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1693,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
 
 static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_ringparam ringparam;
+	struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
 
-	if (!dev->ethtool_ops->set_ringparam)
+	if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
 		return -EFAULT;
 
+	dev->ethtool_ops->get_ringparam(dev, &max);
+
+	/* ensure new ring parameters are within the maximums */
+	if (ringparam.rx_pending > max.rx_max_pending ||
+	    ringparam.rx_mini_pending > max.rx_mini_max_pending ||
+	    ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
+	    ringparam.tx_pending > max.tx_max_pending)
+		return -EINVAL;
+
 	return dev->ethtool_ops->set_ringparam(dev, &ringparam);
 }
 
-- 
cgit v1.2.3


From 8d5dee21f6f01f4632c10b750709a1383eefc7aa Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:23 +0100
Subject: tipc: a couple of cleanups

- We remove the 'reclaiming' member list in struct tipc_group, since
  it doesn't serve any purpose.

- We simplify the GRP_REMIT_MSG branch of tipc_group_protocol_rcv().

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 3e8268d966fa..e5daeb093879 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -85,7 +85,6 @@ struct tipc_group {
 	struct list_head small_win;
 	struct list_head pending;
 	struct list_head active;
-	struct list_head reclaiming;
 	struct tipc_nlist dests;
 	struct net *net;
 	int subid;
@@ -172,7 +171,6 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	INIT_LIST_HEAD(&grp->small_win);
 	INIT_LIST_HEAD(&grp->active);
 	INIT_LIST_HEAD(&grp->pending);
-	INIT_LIST_HEAD(&grp->reclaiming);
 	grp->members = RB_ROOT;
 	grp->net = net;
 	grp->portid = portid;
@@ -575,7 +573,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 		if (!list_empty(active) && active_cnt >= reclaim_limit) {
 			rm = list_first_entry(active, struct tipc_member, list);
 			rm->state = MBR_RECLAIMING;
-			list_move_tail(&rm->list, &grp->reclaiming);
+			list_del_init(&rm->list);
 			tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq);
 		}
 		/* If max active, become pending and wait for reclaimed space */
@@ -600,12 +598,12 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 		if (m->advertised > ADV_IDLE)
 			break;
 		m->state = MBR_JOINED;
+		grp->active_cnt--;
 		if (m->advertised < ADV_IDLE) {
 			pr_warn_ratelimited("Rcv unexpected msg after REMIT\n");
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 		}
-		grp->active_cnt--;
-		list_del_init(&m->list);
+
 		if (list_empty(&grp->pending))
 			return;
 
@@ -761,18 +759,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			m->advertised = ADV_IDLE + in_flight;
 			return;
 		}
-		/* All messages preceding the REMIT have been read */
-		if (m->advertised <= remitted) {
-			m->state = MBR_JOINED;
-			in_flight = 0;
-		}
-		/* ..and the REMIT overtaken by more messages => re-advertise */
+		/* This should never happen */
 		if (m->advertised < remitted)
-			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
+			pr_warn_ratelimited("Unexpected REMIT msg\n");
 
-		m->advertised = ADV_IDLE + in_flight;
+		/* All messages preceding the REMIT have been read */
+		m->state = MBR_JOINED;
 		grp->active_cnt--;
-		list_del_init(&m->list);
+		m->advertised = ADV_IDLE;
 
 		/* Set oldest pending member to active and advertise */
 		if (list_empty(&grp->pending))
-- 
cgit v1.2.3


From 4ea5dab541717fc55cad609360b100857af770b0 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:24 +0100
Subject: tipc: let group member stay in JOINED mode if unable to reclaim

We handle a corner case in the function tipc_group_update_rcv_win().
During extreme pressure it might happen that a message receiver has all
its active senders in RECLAIMING or REMITTED mode, meaning that there
is nobody to reclaim advertisements from if an additional sender tries
to go active.

Currently we just set the new sender to ACTIVE anyway, hence at least
theoretically opening up for a receiver queue overflow by exceeding the
MAX_ACTIVE limit. The correct solution to this is to instead add the
member to the pending queue, while letting the oldest member in that
queue revert to JOINED state.

In this commit we refactor the code for handling message arrival from
a JOINED member, both to make it more comprehensible and to cover the
case described above.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index e5daeb093879..652fa66a87f6 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -569,24 +569,34 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 
 	switch (m->state) {
 	case MBR_JOINED:
-		/* Reclaim advertised space from least active member */
-		if (!list_empty(active) && active_cnt >= reclaim_limit) {
+		/* First, decide if member can go active */
+		if (active_cnt <= max_active) {
+			m->state = MBR_ACTIVE;
+			list_add_tail(&m->list, active);
+			grp->active_cnt++;
+			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
+		} else {
+			m->state = MBR_PENDING;
+			list_add_tail(&m->list, &grp->pending);
+		}
+
+		if (active_cnt < reclaim_limit)
+			break;
+
+		/* Reclaim from oldest active member, if possible */
+		if (!list_empty(active)) {
 			rm = list_first_entry(active, struct tipc_member, list);
 			rm->state = MBR_RECLAIMING;
 			list_del_init(&rm->list);
 			tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq);
-		}
-		/* If max active, become pending and wait for reclaimed space */
-		if (active_cnt >= max_active) {
-			m->state = MBR_PENDING;
-			list_add_tail(&m->list, &grp->pending);
 			break;
 		}
-		/* Otherwise become active */
-		m->state = MBR_ACTIVE;
-		list_add_tail(&m->list, &grp->active);
-		grp->active_cnt++;
-		/* Fall through */
+		/* Nobody to reclaim from; - revert oldest pending to JOINED */
+		pm = list_first_entry(&grp->pending, struct tipc_member, list);
+		list_del_init(&pm->list);
+		pm->state = MBR_JOINED;
+		tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq);
+		break;
 	case MBR_ACTIVE:
 		if (!list_is_last(&m->list, &grp->active))
 			list_move_tail(&m->list, &grp->active);
-- 
cgit v1.2.3


From 0233493a5fad227645f7f02539cb42db72e76030 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:25 +0100
Subject: tipc: adjustment to group member FSM

Analysis reveals that the member state MBR_QUARANTINED in reality is
unnecessary, and can be replaced by the state MBR_JOINING at all
occurrences.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 652fa66a87f6..a352e098f0e7 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -49,7 +49,6 @@
 #define ADV_ACTIVE (ADV_UNIT * 12)
 
 enum mbr_state {
-	MBR_QUARANTINED,
 	MBR_DISCOVERED,
 	MBR_JOINING,
 	MBR_PUBLISHED,
@@ -138,7 +137,7 @@ u16 tipc_group_bc_snd_nxt(struct tipc_group *grp)
 
 static bool tipc_group_is_receiver(struct tipc_member *m)
 {
-	return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING;
+	return m && m->state != MBR_JOINING && m->state != MBR_LEAVING;
 }
 
 static bool tipc_group_is_sender(struct tipc_member *m)
@@ -690,7 +689,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 	case GRP_JOIN_MSG:
 		if (!m)
 			m = tipc_group_create_member(grp, node, port,
-						     MBR_QUARANTINED);
+						     MBR_JOINING);
 		if (!m)
 			return;
 		m->bc_syncpt = msg_grp_bc_syncpt(hdr);
-- 
cgit v1.2.3


From 7ad32bcb7855ae8a60a8cf98e1b9da77cfdba4d0 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:26 +0100
Subject: tipc: create group member event messages when they are needed

In the current implementation, a group socket receiving topology
events about other members just converts the topology event message
into a group event message and stores it until it reaches the right
state to issue it to the user. This complicates the code unnecessarily,
and becomes impractical when we in the coming commits will need to
create and issue membership events independently.

In this commit, we change this so that we just notice the type and
origin of the incoming topology event, and then drop the buffer. Only
when it is time to actually send a group event to the user do we
explicitly create a new message and send it upwards.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c  | 95 +++++++++++++++++++++++++++++++------------------------
 net/tipc/group.h  |  2 +-
 net/tipc/socket.c |  3 +-
 3 files changed, 56 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index a352e098f0e7..e08b7acc7b2d 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -64,7 +64,6 @@ struct tipc_member {
 	struct rb_node tree_node;
 	struct list_head list;
 	struct list_head small_win;
-	struct sk_buff *event_msg;
 	struct sk_buff_head deferredq;
 	struct tipc_group *group;
 	u32 node;
@@ -632,6 +631,40 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 	}
 }
 
+static void tipc_group_create_event(struct tipc_group *grp,
+				    struct tipc_member *m,
+				    u32 event, u16 seqno,
+				    struct sk_buff_head *inputq)
+{	u32 dnode = tipc_own_addr(grp->net);
+	struct tipc_event evt;
+	struct sk_buff *skb;
+	struct tipc_msg *hdr;
+
+	evt.event = event;
+	evt.found_lower = m->instance;
+	evt.found_upper = m->instance;
+	evt.port.ref = m->port;
+	evt.port.node = m->node;
+	evt.s.seq.type = grp->type;
+	evt.s.seq.lower = m->instance;
+	evt.s.seq.upper = m->instance;
+
+	skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT,
+			      GROUP_H_SIZE, sizeof(evt), dnode, m->node,
+			      grp->portid, m->port, 0);
+	if (!skb)
+		return;
+
+	hdr = buf_msg(skb);
+	msg_set_nametype(hdr, grp->type);
+	msg_set_grp_evt(hdr, event);
+	msg_set_dest_droppable(hdr, true);
+	msg_set_grp_bc_seqno(hdr, seqno);
+	memcpy(msg_data(hdr), &evt, sizeof(evt));
+	TIPC_SKB_CB(skb)->orig_member = m->instance;
+	__skb_queue_tail(inputq, skb);
+}
+
 static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 				  int mtyp, struct sk_buff_head *xmitq)
 {
@@ -677,7 +710,6 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 	u32 node = msg_orignode(hdr);
 	u32 port = msg_origport(hdr);
 	struct tipc_member *m, *pm;
-	struct tipc_msg *ehdr;
 	u16 remitted, in_flight;
 
 	if (!grp)
@@ -704,9 +736,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			*usr_wakeup = true;
 			m->usr_pending = false;
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
-			ehdr = buf_msg(m->event_msg);
-			msg_set_grp_bc_seqno(ehdr, m->bc_syncpt);
-			__skb_queue_tail(inputq, m->event_msg);
+			tipc_group_create_event(grp, m, TIPC_PUBLISHED,
+						m->bc_syncpt, inputq);
 		}
 		list_del_init(&m->small_win);
 		tipc_group_update_member(m, 0);
@@ -725,10 +756,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			m->state = MBR_LEAVING;
 			return;
 		}
-		/* Otherwise deliver already received WITHDRAW event */
-		ehdr = buf_msg(m->event_msg);
-		msg_set_grp_bc_seqno(ehdr, m->bc_syncpt);
-		__skb_queue_tail(inputq, m->event_msg);
+		/* Otherwise deliver member WITHDRAW event */
+		tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
+					m->bc_syncpt, inputq);
 		return;
 	case GRP_ADV_MSG:
 		if (!m)
@@ -797,11 +827,10 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 void tipc_group_member_evt(struct tipc_group *grp,
 			   bool *usr_wakeup,
 			   int *sk_rcvbuf,
-			   struct sk_buff *skb,
+			   struct tipc_msg *hdr,
 			   struct sk_buff_head *inputq,
 			   struct sk_buff_head *xmitq)
 {
-	struct tipc_msg *hdr = buf_msg(skb);
 	struct tipc_event *evt = (void *)msg_data(hdr);
 	u32 instance = evt->found_lower;
 	u32 node = evt->port.node;
@@ -813,21 +842,12 @@ void tipc_group_member_evt(struct tipc_group *grp,
 	u32 self;
 
 	if (!grp)
-		goto drop;
+		return;
 
 	net = grp->net;
 	self = tipc_own_addr(net);
 	if (!grp->loopback && node == self && port == grp->portid)
-		goto drop;
-
-	/* Convert message before delivery to user */
-	msg_set_hdr_sz(hdr, GROUP_H_SIZE);
-	msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE);
-	msg_set_type(hdr, TIPC_GRP_MEMBER_EVT);
-	msg_set_origport(hdr, port);
-	msg_set_orignode(hdr, node);
-	msg_set_nametype(hdr, grp->type);
-	msg_set_grp_evt(hdr, event);
+		return;
 
 	m = tipc_group_find_member(grp, node, port);
 
@@ -836,59 +856,52 @@ void tipc_group_member_evt(struct tipc_group *grp,
 			m = tipc_group_create_member(grp, node, port,
 						     MBR_DISCOVERED);
 		if (!m)
-			goto drop;
+			return;
+
+		m->instance = instance;
 
 		/* Hold back event if JOIN message not yet received */
 		if (m->state == MBR_DISCOVERED) {
-			m->event_msg = skb;
 			m->state = MBR_PUBLISHED;
 		} else {
-			msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
-			__skb_queue_tail(inputq, skb);
+			tipc_group_create_event(grp, m, TIPC_PUBLISHED,
+						m->bc_syncpt, inputq);
 			m->state = MBR_JOINED;
 			*usr_wakeup = true;
 			m->usr_pending = false;
 		}
-		m->instance = instance;
-		TIPC_SKB_CB(skb)->orig_member = m->instance;
 		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
 		tipc_group_update_member(m, 0);
 	} else if (event == TIPC_WITHDRAWN) {
 		if (!m)
-			goto drop;
-
-		TIPC_SKB_CB(skb)->orig_member = m->instance;
+			return;
 
 		*usr_wakeup = true;
 		m->usr_pending = false;
 		node_up = tipc_node_is_up(net, node);
-		m->event_msg = NULL;
 
 		if (node_up) {
 			/* Hold back event if a LEAVE msg should be expected */
 			if (m->state != MBR_LEAVING) {
-				m->event_msg = skb;
 				tipc_group_decr_active(grp, m);
 				m->state = MBR_LEAVING;
 			} else {
-				msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
-				__skb_queue_tail(inputq, skb);
+				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
+							m->bc_syncpt, inputq);
 			}
 		} else {
 			if (m->state != MBR_LEAVING) {
 				tipc_group_decr_active(grp, m);
 				m->state = MBR_LEAVING;
-				msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt);
+				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
+							m->bc_rcv_nxt, inputq);
 			} else {
-				msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
+				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
+							m->bc_syncpt, inputq);
 			}
-			__skb_queue_tail(inputq, skb);
 		}
 		list_del_init(&m->list);
 		list_del_init(&m->small_win);
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
-	return;
-drop:
-	kfree_skb(skb);
 }
diff --git a/net/tipc/group.h b/net/tipc/group.h
index d525e1cd7de5..5ffffd0121a2 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -54,7 +54,7 @@ void tipc_group_filter_msg(struct tipc_group *grp,
 			   struct sk_buff_head *inputq,
 			   struct sk_buff_head *xmitq);
 void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup,
-			   int *sk_rcvbuf, struct sk_buff *skb,
+			   int *sk_rcvbuf, struct tipc_msg *hdr,
 			   struct sk_buff_head *inputq,
 			   struct sk_buff_head *xmitq);
 void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index b51d5cba5094..36744ebef74f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1933,8 +1933,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 		break;
 	case TOP_SRV:
 		tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf,
-				      skb, inputq, xmitq);
-		skb = NULL;
+				      hdr, inputq, xmitq);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From c2b22bcf2e18a279afd80a8c57e936014acf3348 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:27 +0100
Subject: tipc: simplify group LEAVE sequence

After the changes in the previous commit the group LEAVE sequence
can be simplified.

We now let the arrival of a LEAVE message unconditionally issue a group
DOWN event to the user. When a topology WITHDRAW event is received, the
member, if it is still there, is set to state LEAVING, but we only issue a
group DOWN event when the link to the peer node is gone, so that no
LEAVE message is to be expected.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 40 +++++++++-------------------------------
 1 file changed, 9 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index e08b7acc7b2d..bdc54be9c07e 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -749,14 +749,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		list_del_init(&m->list);
 		list_del_init(&m->small_win);
 		*usr_wakeup = true;
-
-		/* Wait until WITHDRAW event is received */
-		if (m->state != MBR_LEAVING) {
-			tipc_group_decr_active(grp, m);
-			m->state = MBR_LEAVING;
-			return;
-		}
-		/* Otherwise deliver member WITHDRAW event */
+		tipc_group_decr_active(grp, m);
+		m->state = MBR_LEAVING;
 		tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
 					m->bc_syncpt, inputq);
 		return;
@@ -838,7 +832,6 @@ void tipc_group_member_evt(struct tipc_group *grp,
 	int event = evt->event;
 	struct tipc_member *m;
 	struct net *net;
-	bool node_up;
 	u32 self;
 
 	if (!grp)
@@ -878,30 +871,15 @@ void tipc_group_member_evt(struct tipc_group *grp,
 
 		*usr_wakeup = true;
 		m->usr_pending = false;
-		node_up = tipc_node_is_up(net, node);
-
-		if (node_up) {
-			/* Hold back event if a LEAVE msg should be expected */
-			if (m->state != MBR_LEAVING) {
-				tipc_group_decr_active(grp, m);
-				m->state = MBR_LEAVING;
-			} else {
-				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
-							m->bc_syncpt, inputq);
-			}
-		} else {
-			if (m->state != MBR_LEAVING) {
-				tipc_group_decr_active(grp, m);
-				m->state = MBR_LEAVING;
-				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
-							m->bc_rcv_nxt, inputq);
-			} else {
-				tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
-							m->bc_syncpt, inputq);
-			}
-		}
+		tipc_group_decr_active(grp, m);
+		m->state = MBR_LEAVING;
 		list_del_init(&m->list);
 		list_del_init(&m->small_win);
+
+		/* Only send event if no LEAVE message can be expected */
+		if (!tipc_node_is_up(net, node))
+			tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
+						m->bc_rcv_nxt, inputq);
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
 }
-- 
cgit v1.2.3


From d12d2e12cec2d66eab6cd58f592dad9fd386b97d Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:28 +0100
Subject: tipc: send out join messages as soon as new member is discovered

When a socket is joining a group, we look up in the binding table to
find if there are already other members of the group present. This is
used for being able to return EAGAIN instead of EHOSTUNREACH if the
user proceeds directly to a send attempt.

However, the information in the binding table can be used to directly
set the created member in state MBR_PUBLISHED and send a JOIN message
to the peer, instead of waiting for a topology PUBLISH event to do this.
When there are many members in a group, the propagation time for such
events can be significant, and we can save time during the join
operation if we use the initial lookup result fully.

In this commit, we eliminate the member state MBR_DISCOVERED, which has
been the result of the initial lookup, and instead go directly to
MBR_PUBLISHED, which initiates the setup.

After this change, the tipc_member FSM looks as follows:

     +-----------+
---->| PUBLISHED |-----------------------------------------------+
PUB- +-----------+                                LEAVE/WITHDRAW |
LISH       |JOIN                                                 |
           |     +-------------------------------------------+   |
           |     |                            LEAVE/WITHDRAW |   |
           |     |                +------------+             |   |
           |     |   +----------->|  PENDING   |---------+   |   |
           |     |   |msg/maxactv +-+---+------+  LEAVE/ |   |   |
           |     |   |              |   |       WITHDRAW |   |   |
           |     |   |   +----------+   |                |   |   |
           |     |   |   |revert/maxactv|                |   |   |
           |     |   |   V              V                V   V   V
           |   +----------+  msg  +------------+       +-----------+
           +-->|  JOINED  |------>|   ACTIVE   |------>|  LEAVING  |--->
           |   +----------+       +--- -+------+ LEAVE/+-----------+DOWN
           |        A   A               |      WITHDRAW A   A    A   EVT
           |        |   |               |RECLAIM        |   |    |
           |        |   |REMIT          V               |   |    |
           |        |   |== adv   +------------+        |   |    |
           |        |   +---------| RECLAIMING |--------+   |    |
           |        |             +-----+------+  LEAVE/    |    |
           |        |                   |REMIT   WITHDRAW   |    |
           |        |                   |< adv              |    |
           |        |msg/               V            LEAVE/ |    |
           |        |adv==ADV_IDLE+------------+   WITHDRAW |    |
           |        +-------------|  REMITTED  |------------+    |
           |                      +------------+                 |
           |PUBLISH                                              |
JOIN +-----------+                                LEAVE/WITHDRAW |
---->|  JOINING  |-----------------------------------------------+
     +-----------+

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c      | 100 +++++++++++++++++++++++++++++++-------------------
 net/tipc/group.h      |   4 +-
 net/tipc/name_table.c |   2 +-
 net/tipc/socket.c     |   4 +-
 4 files changed, 68 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index bdc54be9c07e..6ca07f0da60c 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -49,7 +49,6 @@
 #define ADV_ACTIVE (ADV_UNIT * 12)
 
 enum mbr_state {
-	MBR_DISCOVERED,
 	MBR_JOINING,
 	MBR_PUBLISHED,
 	MBR_JOINED,
@@ -141,7 +140,7 @@ static bool tipc_group_is_receiver(struct tipc_member *m)
 
 static bool tipc_group_is_sender(struct tipc_member *m)
 {
-	return m && m->state >= MBR_JOINED;
+	return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED;
 }
 
 u32 tipc_group_exclude(struct tipc_group *grp)
@@ -184,6 +183,21 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	return NULL;
 }
 
+void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf)
+{
+	struct rb_root *tree = &grp->members;
+	struct tipc_member *m, *tmp;
+	struct sk_buff_head xmitq;
+
+	skb_queue_head_init(&xmitq);
+	rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) {
+		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq);
+		tipc_group_update_member(m, 0);
+	}
+	tipc_node_distr_xmit(net, &xmitq);
+	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
+}
+
 void tipc_group_delete(struct net *net, struct tipc_group *grp)
 {
 	struct rb_root *tree = &grp->members;
@@ -274,7 +288,7 @@ static void tipc_group_add_to_tree(struct tipc_group *grp,
 
 static struct tipc_member *tipc_group_create_member(struct tipc_group *grp,
 						    u32 node, u32 port,
-						    int state)
+						    u32 instance, int state)
 {
 	struct tipc_member *m;
 
@@ -287,6 +301,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp,
 	m->group = grp;
 	m->node = node;
 	m->port = port;
+	m->instance = instance;
 	m->bc_acked = grp->bc_snd_nxt - 1;
 	grp->member_cnt++;
 	tipc_group_add_to_tree(grp, m);
@@ -295,9 +310,10 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp,
 	return m;
 }
 
-void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port)
+void tipc_group_add_member(struct tipc_group *grp, u32 node,
+			   u32 port, u32 instance)
 {
-	tipc_group_create_member(grp, node, port, MBR_DISCOVERED);
+	tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED);
 }
 
 static void tipc_group_delete_member(struct tipc_group *grp,
@@ -623,7 +639,6 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 		tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq);
 		break;
 	case MBR_RECLAIMING:
-	case MBR_DISCOVERED:
 	case MBR_JOINING:
 	case MBR_LEAVING:
 	default:
@@ -721,26 +736,26 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 	case GRP_JOIN_MSG:
 		if (!m)
 			m = tipc_group_create_member(grp, node, port,
-						     MBR_JOINING);
+						     0, MBR_JOINING);
 		if (!m)
 			return;
 		m->bc_syncpt = msg_grp_bc_syncpt(hdr);
 		m->bc_rcv_nxt = m->bc_syncpt;
 		m->window += msg_adv_win(hdr);
 
-		/* Wait until PUBLISH event is received */
-		if (m->state == MBR_DISCOVERED) {
-			m->state = MBR_JOINING;
-		} else if (m->state == MBR_PUBLISHED) {
-			m->state = MBR_JOINED;
-			*usr_wakeup = true;
-			m->usr_pending = false;
-			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
-			tipc_group_create_event(grp, m, TIPC_PUBLISHED,
-						m->bc_syncpt, inputq);
-		}
+		/* Wait until PUBLISH event is received if necessary */
+		if (m->state != MBR_PUBLISHED)
+			return;
+
+		/* Member can be taken into service */
+		m->state = MBR_JOINED;
+		*usr_wakeup = true;
+		m->usr_pending = false;
 		list_del_init(&m->small_win);
 		tipc_group_update_member(m, 0);
+		tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
+		tipc_group_create_event(grp, m, TIPC_PUBLISHED,
+					m->bc_syncpt, inputq);
 		return;
 	case GRP_LEAVE_MSG:
 		if (!m)
@@ -844,30 +859,36 @@ void tipc_group_member_evt(struct tipc_group *grp,
 
 	m = tipc_group_find_member(grp, node, port);
 
-	if (event == TIPC_PUBLISHED) {
-		if (!m)
-			m = tipc_group_create_member(grp, node, port,
-						     MBR_DISCOVERED);
-		if (!m)
-			return;
+	switch (event) {
+	case TIPC_PUBLISHED:
+		/* Send and wait for arrival of JOIN message if necessary */
+		if (!m) {
+			m = tipc_group_create_member(grp, node, port, instance,
+						     MBR_PUBLISHED);
+			if (!m)
+				break;
+			tipc_group_update_member(m, 0);
+			tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
+			break;
+		}
 
-		m->instance = instance;
+		if (m->state != MBR_JOINING)
+			break;
 
-		/* Hold back event if JOIN message not yet received */
-		if (m->state == MBR_DISCOVERED) {
-			m->state = MBR_PUBLISHED;
-		} else {
-			tipc_group_create_event(grp, m, TIPC_PUBLISHED,
-						m->bc_syncpt, inputq);
-			m->state = MBR_JOINED;
-			*usr_wakeup = true;
-			m->usr_pending = false;
-		}
-		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
+		/* Member can be taken into service */
+		m->instance = instance;
+		m->state = MBR_JOINED;
+		*usr_wakeup = true;
+		m->usr_pending = false;
+		list_del_init(&m->small_win);
 		tipc_group_update_member(m, 0);
-	} else if (event == TIPC_WITHDRAWN) {
+		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
+		tipc_group_create_event(grp, m, TIPC_PUBLISHED,
+					m->bc_syncpt, inputq);
+		break;
+	case TIPC_WITHDRAWN:
 		if (!m)
-			return;
+			break;
 
 		*usr_wakeup = true;
 		m->usr_pending = false;
@@ -880,6 +901,9 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		if (!tipc_node_is_up(net, node))
 			tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
 						m->bc_rcv_nxt, inputq);
+		break;
+	default:
+		break;
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
 }
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 5ffffd0121a2..dee79477d499 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -44,8 +44,10 @@ struct tipc_msg;
 
 struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 				     struct tipc_group_req *mreq);
+void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcv_buf);
 void tipc_group_delete(struct net *net, struct tipc_group *grp);
-void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port);
+void tipc_group_add_member(struct tipc_group *grp, u32 node,
+			   u32 port, u32 instance);
 struct tipc_nlist *tipc_group_dests(struct tipc_group *grp);
 void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq,
 		     int *scope);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index b3829bcf63c7..e04ab72f313c 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -732,7 +732,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 		list_for_each_entry(p, &info->zone_list, zone_list) {
 			if (!tipc_in_scope(domain, p->node))
 				continue;
-			tipc_group_add_member(grp, p->node, p->ref);
+			tipc_group_add_member(grp, p->node, p->ref, p->lower);
 		}
 	}
 	spin_unlock_bh(&seq->lock);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 36744ebef74f..e3a02f1fcab5 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2757,10 +2757,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 		tipc_group_delete(net, grp);
 		tsk->group = NULL;
 	}
-
-	/* Eliminate any risk that a broadcast overtakes the sent JOIN */
+	/* Eliminate any risk that a broadcast overtakes sent JOINs */
 	tsk->mc_method.rcast = true;
 	tsk->mc_method.mandatory = true;
+	tipc_group_join(net, grp, &tsk->sk.sk_rcvbuf);
 	return rc;
 }
 
-- 
cgit v1.2.3


From 8348500f80d5660af29c475e1f15d412d83564c9 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:29 +0100
Subject: tipc: add option to suppress PUBLISH events for pre-existing
 publications

Currently, when a user is subscribing for binding table publications,
he will receive a PUBLISH event for all already existing matching items
in the binding table.

However, a group socket making a subscription doesn't need this initial
status update from the binding table, because it has already scanned it
during the join operation. Worse, the multiplicative effect of issuing
mutual events for dozens or hundreds of group members within a short time
frame puts a heavy load on the topology server, with the end result that
scale out operations on a big group tend to take much longer than needed.

We now add a new filter option, TIPC_SUB_NO_STATUS, for topology server
subscriptions, so that this initial avalanche of events is suppressed.
This change, along with the previous commit, significantly improves the
range and speed of group scale out operations.

We keep the new option internal for the tipc driver, at least for now.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c      |  4 +++-
 net/tipc/name_table.c | 13 +++++++------
 net/tipc/name_table.h |  2 +-
 net/tipc/server.c     |  4 ++--
 net/tipc/server.h     |  3 ++-
 net/tipc/subscr.c     | 10 ++++++----
 6 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 6ca07f0da60c..cf996bd6ec98 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -177,7 +177,9 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	grp->scope = mreq->scope;
 	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
 	grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS;
-	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid))
+	if (tipc_topsrv_kern_subscr(net, portid, type,
+				    TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS,
+				    0, ~0, &grp->subid))
 		return grp;
 	kfree(grp);
 	return NULL;
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e04ab72f313c..60af9885f160 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -405,12 +405,13 @@ found:
 }
 
 /**
- * tipc_nameseq_subscribe - attach a subscription, and issue
- * the prescribed number of events if there is any sub-
+ * tipc_nameseq_subscribe - attach a subscription, and optionally
+ * issue the prescribed number of events if there is any sub-
  * sequence overlapping with the requested sequence
  */
 static void tipc_nameseq_subscribe(struct name_seq *nseq,
-				   struct tipc_subscription *s)
+				   struct tipc_subscription *s,
+				   bool status)
 {
 	struct sub_seq *sseq = nseq->sseqs;
 	struct tipc_name_seq ns;
@@ -420,7 +421,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 	tipc_subscrp_get(s);
 	list_add(&s->nameseq_list, &nseq->subscriptions);
 
-	if (!sseq)
+	if (!status || !sseq)
 		return;
 
 	while (sseq != &nseq->sseqs[nseq->first_free]) {
@@ -811,7 +812,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
 /**
  * tipc_nametbl_subscribe - add a subscription object to the name table
  */
-void tipc_nametbl_subscribe(struct tipc_subscription *s)
+void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
 {
 	struct tipc_net *tn = net_generic(s->net, tipc_net_id);
 	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
@@ -825,7 +826,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s)
 		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
 	if (seq) {
 		spin_lock_bh(&seq->lock);
-		tipc_nameseq_subscribe(seq, s);
+		tipc_nameseq_subscribe(seq, s, status);
 		spin_unlock_bh(&seq->lock);
 	} else {
 		tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 71926e429446..73a148c85c15 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -121,7 +121,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 lower, u32 node, u32 ref,
 					     u32 key);
-void tipc_nametbl_subscribe(struct tipc_subscription *s);
+void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status);
 void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index d60c30342327..950c54cbcf3a 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -490,7 +490,7 @@ void tipc_conn_terminate(struct tipc_server *s, int conid)
 }
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 lower, u32 upper, int *conid)
+			     u32 filter, u32 lower, u32 upper, int *conid)
 {
 	struct tipc_subscriber *scbr;
 	struct tipc_subscr sub;
@@ -501,7 +501,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
 	sub.seq.lower = lower;
 	sub.seq.upper = upper;
 	sub.timeout = TIPC_WAIT_FOREVER;
-	sub.filter = TIPC_SUB_PORTS;
+	sub.filter = filter;
 	*(u32 *)&sub.usr_handle = port;
 
 	con = tipc_alloc_conn(tipc_topsrv(net));
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 2113c9192633..ea1effbff23e 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -41,6 +41,7 @@
 #include <net/net_namespace.h>
 
 #define TIPC_SERVER_NAME_LEN	32
+#define TIPC_SUB_NO_STATUS      0x80
 
 /**
  * struct tipc_server - TIPC server structure
@@ -84,7 +85,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 		      struct sockaddr_tipc *addr, void *data, size_t len);
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 lower, u32 upper, int *conid);
+			     u32 filter, u32 lower, u32 upper, int *conid);
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
 
 /**
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 251065dfd8df..1052341a0ea9 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -286,7 +286,8 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 }
 
 static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
-				   struct tipc_subscriber *subscriber, int swap)
+				   struct tipc_subscriber *subscriber, int swap,
+				   bool status)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub = NULL;
@@ -299,7 +300,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 	spin_lock_bh(&subscriber->lock);
 	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
 	sub->subscriber = subscriber;
-	tipc_nametbl_subscribe(sub);
+	tipc_nametbl_subscribe(sub, status);
 	tipc_subscrb_get(subscriber);
 	spin_unlock_bh(&subscriber->lock);
 
@@ -323,6 +324,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 {
 	struct tipc_subscriber *subscriber = usr_data;
 	struct tipc_subscr *s = (struct tipc_subscr *)buf;
+	bool status;
 	int swap;
 
 	/* Determine subscriber's endianness */
@@ -334,8 +336,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
 		return tipc_subscrp_cancel(s, subscriber);
 	}
-
-	tipc_subscrp_subscribe(net, s, subscriber, swap);
+	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
+	tipc_subscrp_subscribe(net, s, subscriber, swap, status);
 }
 
 /* Handle one request to establish a new subscriber */
-- 
cgit v1.2.3


From 232d07b74a33b9f5d48516dc1d8ce41723ada593 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:30 +0100
Subject: tipc: improve groupcast scope handling

When a member joins a group, it also indicates a binding scope. This
makes it possible to create both node local groups, invisible to other
nodes, as well as cluster global groups, visible everywhere.

To prevent different members from ending up with permanently differing
views of group size and membership, we must inhibit locally and globally
bound members from joining the same group.

We do this by using the binding scope as an additional separator between
groups. I.e., a member must ignore all membership events from sockets
using a different scope than itself, and all lookups for message
destinations must require an exact match between the message's lookup
scope and the potential target's binding scope.

Apart from making it possible to create local groups using the same
identity on different nodes, a side effect of this is that it now also
becomes possible to create a cluster global group with the same identity
across the same nodes, without interfering with the local groups.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |  7 ++--
 net/tipc/group.c          | 13 ++++---
 net/tipc/name_table.c     | 40 ++++++++++-----------
 net/tipc/name_table.h     |  4 +--
 net/tipc/server.c         |  4 +--
 net/tipc/server.h         |  6 ++--
 net/tipc/socket.c         | 88 ++++++++++++++++++++++++++++-------------------
 net/tipc/subscr.c         | 10 ++++--
 net/tipc/subscr.h         |  2 +-
 9 files changed, 99 insertions(+), 75 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 35f79d1f8c3a..14bacc7e6cef 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -117,10 +117,9 @@ static inline unsigned int tipc_node(__u32 addr)
 /*
  * Publication scopes when binding port names and port name sequences
  */
-
-#define TIPC_ZONE_SCOPE		1
-#define TIPC_CLUSTER_SCOPE	2
-#define TIPC_NODE_SCOPE		3
+#define TIPC_ZONE_SCOPE         1
+#define TIPC_CLUSTER_SCOPE      2
+#define TIPC_NODE_SCOPE         3
 
 /*
  * Limiting values for messages
diff --git a/net/tipc/group.c b/net/tipc/group.c
index cf996bd6ec98..1908773c9fca 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -87,7 +87,6 @@ struct tipc_group {
 	int subid;
 	u32 type;
 	u32 instance;
-	u32 domain;
 	u32 scope;
 	u32 portid;
 	u16 member_cnt;
@@ -158,6 +157,8 @@ int tipc_group_size(struct tipc_group *grp)
 struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 				     struct tipc_group_req *mreq)
 {
+	u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS;
+	bool global = mreq->scope != TIPC_NODE_SCOPE;
 	struct tipc_group *grp;
 	u32 type = mreq->type;
 
@@ -171,15 +172,14 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	grp->members = RB_ROOT;
 	grp->net = net;
 	grp->portid = portid;
-	grp->domain = addr_domain(net, mreq->scope);
 	grp->type = type;
 	grp->instance = mreq->instance;
 	grp->scope = mreq->scope;
 	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
 	grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS;
-	if (tipc_topsrv_kern_subscr(net, portid, type,
-				    TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS,
-				    0, ~0, &grp->subid))
+	filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE;
+	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0,
+				    filter, &grp->subid))
 		return grp;
 	kfree(grp);
 	return NULL;
@@ -732,6 +732,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 	if (!grp)
 		return;
 
+	if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net))
+		return;
+
 	m = tipc_group_find_member(grp, node, port);
 
 	switch (msg_type(hdr)) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 60af9885f160..64cdd3c302b0 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -328,7 +328,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
 					    TIPC_PUBLISHED, publ->ref,
-					    publ->node, created_subseq);
+					    publ->node, publ->scope,
+					    created_subseq);
 	}
 	return publ;
 }
@@ -398,7 +399,8 @@ found:
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
 					    TIPC_WITHDRAWN, publ->ref,
-					    publ->node, removed_subseq);
+					    publ->node, publ->scope,
+					    removed_subseq);
 	}
 
 	return publ;
@@ -435,6 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 							    sseq->upper,
 							    TIPC_PUBLISHED,
 							    crs->ref, crs->node,
+							    crs->scope,
 							    must_report);
 				must_report = 0;
 			}
@@ -598,7 +601,7 @@ not_found:
 	return ref;
 }
 
-bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
+bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 			 struct list_head *dsts, int *dstcnt, u32 exclude,
 			 bool all)
 {
@@ -608,9 +611,6 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 	struct name_seq *seq;
 	struct sub_seq *sseq;
 
-	if (!tipc_in_scope(domain, self))
-		return false;
-
 	*dstcnt = 0;
 	rcu_read_lock();
 	seq = nametbl_find_seq(net, type);
@@ -621,7 +621,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 	if (likely(sseq)) {
 		info = sseq->info;
 		list_for_each_entry(publ, &info->zone_list, zone_list) {
-			if (!tipc_in_scope(domain, publ->node))
+			if (publ->scope != scope)
 				continue;
 			if (publ->ref == exclude && publ->node == self)
 				continue;
@@ -639,13 +639,14 @@ exit:
 	return !list_empty(dsts);
 }
 
-int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct list_head *dports)
+int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			   u32 scope, bool exact, struct list_head *dports)
 {
-	struct name_seq *seq;
-	struct sub_seq *sseq;
 	struct sub_seq *sseq_stop;
 	struct name_info *info;
+	struct publication *p;
+	struct name_seq *seq;
+	struct sub_seq *sseq;
 	int res = 0;
 
 	rcu_read_lock();
@@ -657,15 +658,12 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
 	sseq_stop = seq->sseqs + seq->first_free;
 	for (; sseq != sseq_stop; sseq++) {
-		struct publication *publ;
-
 		if (sseq->lower > upper)
 			break;
-
 		info = sseq->info;
-		list_for_each_entry(publ, &info->node_list, node_list) {
-			if (publ->scope <= limit)
-				tipc_dest_push(dports, 0, publ->ref);
+		list_for_each_entry(p, &info->node_list, node_list) {
+			if (p->scope == scope || (!exact && p->scope < scope))
+				tipc_dest_push(dports, 0, p->ref);
 		}
 
 		if (info->cluster_list_size != info->node_list_size)
@@ -682,7 +680,7 @@ exit:
  * - Determines if any node local ports overlap
  */
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
-				   u32 upper, u32 domain,
+				   u32 upper, u32 scope,
 				   struct tipc_nlist *nodes)
 {
 	struct sub_seq *sseq, *stop;
@@ -701,7 +699,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 	for (; sseq != stop && sseq->lower <= upper; sseq++) {
 		info = sseq->info;
 		list_for_each_entry(publ, &info->zone_list, zone_list) {
-			if (tipc_in_scope(domain, publ->node))
+			if (publ->scope == scope)
 				tipc_nlist_add(nodes, publ->node);
 		}
 	}
@@ -713,7 +711,7 @@ exit:
 /* tipc_nametbl_build_group - build list of communication group members
  */
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
-			      u32 type, u32 domain)
+			      u32 type, u32 scope)
 {
 	struct sub_seq *sseq, *stop;
 	struct name_info *info;
@@ -731,7 +729,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 	for (; sseq != stop; sseq++) {
 		info = sseq->info;
 		list_for_each_entry(p, &info->zone_list, zone_list) {
-			if (!tipc_in_scope(domain, p->node))
+			if (p->scope != scope)
 				continue;
 			tipc_group_add_member(grp, p->node, p->ref, p->lower);
 		}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 73a148c85c15..b595d8aa00f0 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -100,8 +100,8 @@ struct name_table {
 int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
-int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct list_head *dports);
+int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			   u32 scope, bool exact, struct list_head *dports);
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 			      u32 type, u32 domain);
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 950c54cbcf3a..8ee5e86b7870 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -489,8 +489,8 @@ void tipc_conn_terminate(struct tipc_server *s, int conid)
 	}
 }
 
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 filter, u32 lower, u32 upper, int *conid)
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid)
 {
 	struct tipc_subscriber *scbr;
 	struct tipc_subscr sub;
diff --git a/net/tipc/server.h b/net/tipc/server.h
index ea1effbff23e..17f49ee44cfd 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -41,6 +41,8 @@
 #include <net/net_namespace.h>
 
 #define TIPC_SERVER_NAME_LEN	32
+#define TIPC_SUB_CLUSTER_SCOPE  0x20
+#define TIPC_SUB_NODE_SCOPE     0x40
 #define TIPC_SUB_NO_STATUS      0x80
 
 /**
@@ -84,8 +86,8 @@ struct tipc_server {
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 		      struct sockaddr_tipc *addr, void *data, size_t len);
 
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 filter, u32 lower, u32 upper, int *conid);
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid);
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
 
 /**
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e3a02f1fcab5..b24dab3996c9 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -928,21 +928,22 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
 	struct list_head *cong_links = &tsk->cong_links;
 	int blks = tsk_blocks(GROUP_H_SIZE + dlen);
 	struct tipc_group *grp = tsk->group;
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_member *first = NULL;
 	struct tipc_member *mbr = NULL;
 	struct net *net = sock_net(sk);
 	u32 node, port, exclude;
-	u32 type, inst, domain;
 	struct list_head dsts;
+	u32 type, inst, scope;
 	int lookups = 0;
 	int dstcnt, rc;
 	bool cong;
 
 	INIT_LIST_HEAD(&dsts);
 
-	type = dest->addr.name.name.type;
+	type = msg_nametype(hdr);
 	inst = dest->addr.name.name.instance;
-	domain = addr_domain(net, dest->scope);
+	scope = msg_lookup_scope(hdr);
 	exclude = tipc_group_exclude(grp);
 
 	while (++lookups < 4) {
@@ -950,7 +951,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
 
 		/* Look for a non-congested destination member, if any */
 		while (1) {
-			if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts,
+			if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
 						 &dstcnt, exclude, false))
 				return -EHOSTUNREACH;
 			tipc_dest_pop(&dsts, &node, &port);
@@ -1079,22 +1080,23 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
 {
 	struct sock *sk = sock->sk;
 	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
-	struct tipc_name_seq *seq = &dest->addr.nameseq;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct tipc_group *grp = tsk->group;
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	u32 domain, exclude, dstcnt;
+	u32 type, inst, scope, exclude;
 	struct list_head dsts;
+	u32 dstcnt;
 
 	INIT_LIST_HEAD(&dsts);
 
-	if (seq->lower != seq->upper)
-		return -ENOTSUPP;
-
-	domain = addr_domain(net, dest->scope);
+	type = msg_nametype(hdr);
+	inst = dest->addr.name.name.instance;
+	scope = msg_lookup_scope(hdr);
 	exclude = tipc_group_exclude(grp);
-	if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain,
-				 &dsts, &dstcnt, exclude, true))
+
+	if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
+				 &dstcnt, exclude, true))
 		return -EHOSTUNREACH;
 
 	if (dstcnt == 1) {
@@ -1116,24 +1118,29 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
 void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		       struct sk_buff_head *inputq)
 {
-	u32 scope = TIPC_CLUSTER_SCOPE;
 	u32 self = tipc_own_addr(net);
+	u32 type, lower, upper, scope;
 	struct sk_buff *skb, *_skb;
-	u32 lower = 0, upper = ~0;
-	struct sk_buff_head tmpq;
 	u32 portid, oport, onode;
+	struct sk_buff_head tmpq;
 	struct list_head dports;
-	struct tipc_msg *msg;
-	int user, mtyp, hsz;
+	struct tipc_msg *hdr;
+	int user, mtyp, hlen;
+	bool exact;
 
 	__skb_queue_head_init(&tmpq);
 	INIT_LIST_HEAD(&dports);
 
 	skb = tipc_skb_peek(arrvq, &inputq->lock);
 	for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
-		msg = buf_msg(skb);
-		user = msg_user(msg);
-		mtyp = msg_type(msg);
+		hdr = buf_msg(skb);
+		user = msg_user(hdr);
+		mtyp = msg_type(hdr);
+		hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
+		oport = msg_origport(hdr);
+		onode = msg_orignode(hdr);
+		type = msg_nametype(hdr);
+
 		if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) {
 			spin_lock_bh(&inputq->lock);
 			if (skb_peek(arrvq) == skb) {
@@ -1144,21 +1151,31 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 			spin_unlock_bh(&inputq->lock);
 			continue;
 		}
-		hsz = skb_headroom(skb) + msg_hdr_sz(msg);
-		oport = msg_origport(msg);
-		onode = msg_orignode(msg);
-		if (onode == self)
-			scope = TIPC_NODE_SCOPE;
-
-		/* Create destination port list and message clones: */
-		if (!msg_in_group(msg)) {
-			lower = msg_namelower(msg);
-			upper = msg_nameupper(msg);
+
+		/* Group messages require exact scope match */
+		if (msg_in_group(hdr)) {
+			lower = 0;
+			upper = ~0;
+			scope = msg_lookup_scope(hdr);
+			exact = true;
+		} else {
+			/* TIPC_NODE_SCOPE means "any scope" in this context */
+			if (onode == self)
+				scope = TIPC_NODE_SCOPE;
+			else
+				scope = TIPC_CLUSTER_SCOPE;
+			exact = false;
+			lower = msg_namelower(hdr);
+			upper = msg_nameupper(hdr);
 		}
-		tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper,
-					  scope, &dports);
+
+		/* Create destination port list: */
+		tipc_nametbl_mc_lookup(net, type, lower, upper,
+				       scope, exact, &dports);
+
+		/* Clone message per destination */
 		while (tipc_dest_pop(&dports, NULL, &portid)) {
-			_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
+			_skb = __pskb_copy(skb, hlen, GFP_ATOMIC);
 			if (_skb) {
 				msg_set_destport(buf_msg(_skb), portid);
 				__skb_queue_tail(&tmpq, _skb);
@@ -2731,7 +2748,6 @@ void tipc_sk_rht_destroy(struct net *net)
 static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 {
 	struct net *net = sock_net(&tsk->sk);
-	u32 domain = addr_domain(net, mreq->scope);
 	struct tipc_group *grp = tsk->group;
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq seq;
@@ -2739,6 +2755,8 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 
 	if (mreq->type < TIPC_RESERVED_TYPES)
 		return -EACCES;
+	if (mreq->scope > TIPC_NODE_SCOPE)
+		return -EINVAL;
 	if (grp)
 		return -EACCES;
 	grp = tipc_group_create(net, tsk->portid, mreq);
@@ -2751,7 +2769,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 	seq.type = mreq->type;
 	seq.lower = mreq->instance;
 	seq.upper = seq.lower;
-	tipc_nametbl_build_group(net, grp, mreq->type, domain);
+	tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope);
 	rc = tipc_sk_publish(tsk, mreq->scope, &seq);
 	if (rc) {
 		tipc_group_delete(net, grp);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 1052341a0ea9..44df528ed6ab 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -118,15 +118,19 @@ void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
 
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 				 u32 found_upper, u32 event, u32 port_ref,
-				 u32 node, int must)
+				 u32 node, u32 scope, int must)
 {
+	u32 filter = htohl(sub->evt.s.filter, sub->swap);
 	struct tipc_name_seq seq;
 
 	tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
 	if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
 		return;
-	if (!must &&
-	    !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS))
+	if (!must && !(filter & TIPC_SUB_PORTS))
+		return;
+	if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE)
+		return;
+	if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE)
 		return;
 
 	tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index ee52957dc952..f3edca775d9f 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -71,7 +71,7 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
 				 u32 found_lower, u32 found_upper, u32 event,
-				 u32 port_ref, u32 node, int must);
+				 u32 port_ref, u32 node, u32 scope, int must);
 void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
 			      struct tipc_name_seq *out);
 u32 tipc_subscrp_convert_seq_type(u32 type, int swap);
-- 
cgit v1.2.3


From eb929a91b213d2a72c5a8b4af9a1acf63bfb8287 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:31 +0100
Subject: tipc: improve poll() for group member socket

The current criterion for returning POLLOUT from a group member socket is
too simplistic. It basically returns POLLOUT as soon as the group has
external destinations, something obviously leading to a lot of spinning
during destination congestion situations. At the same time, the internal
congestion handling is unnecessarily complex.

We now change this as follows.

- We introduce an 'open' flag in struct tipc_group. This flag is used
  only to help poll() get the setting of POLLOUT right, and *not* for
  congestion handling as such. This means that a user can choose to
  ignore an EAGAIN for a destination and go on sending messages to
  other destinations in the group if he wants to.

- The flag is set to false every time we return EAGAIN on a send call.

- The flag is set to true every time any member, i.e., not necessarily
  the member that caused EAGAIN, is removed from the small_win list.

- We remove the group member 'usr_pending' flag. The size of the send
  window and presence in the 'small_win' list are sufficient criteria
  for recognizing congestion.

This solution seems to be a reasonable compromise between 'anycast',
which is normally not waiting for POLLOUT for a specific destination,
and the other three send modes, which are.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c  | 64 +++++++++++++++++++++++++++++++------------------------
 net/tipc/group.h  |  2 +-
 net/tipc/socket.c |  8 +++----
 3 files changed, 41 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 1908773c9fca..497ee34bfab9 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -74,7 +74,6 @@ struct tipc_member {
 	u16 bc_rcv_nxt;
 	u16 bc_syncpt;
 	u16 bc_acked;
-	bool usr_pending;
 };
 
 struct tipc_group {
@@ -96,11 +95,27 @@ struct tipc_group {
 	u16 bc_ackers;
 	bool loopback;
 	bool events;
+	bool open;
 };
 
 static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 				  int mtyp, struct sk_buff_head *xmitq);
 
+bool tipc_group_is_open(struct tipc_group *grp)
+{
+	return grp->open;
+}
+
+static void tipc_group_open(struct tipc_member *m, bool *wakeup)
+{
+	*wakeup = false;
+	if (list_empty(&m->small_win))
+		return;
+	list_del_init(&m->small_win);
+	m->group->open = true;
+	*wakeup = true;
+}
+
 static void tipc_group_decr_active(struct tipc_group *grp,
 				   struct tipc_member *m)
 {
@@ -406,20 +421,20 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport,
 	int adv, state;
 
 	m = tipc_group_find_dest(grp, dnode, dport);
-	*mbr = m;
-	if (!m)
+	if (!tipc_group_is_receiver(m)) {
+		*mbr = NULL;
 		return false;
-	if (m->usr_pending)
-		return true;
+	}
+	*mbr = m;
+
 	if (m->window >= len)
 		return false;
-	m->usr_pending = true;
+
+	grp->open = false;
 
 	/* If not fully advertised, do it now to prevent mutual blocking */
 	adv = m->advertised;
 	state = m->state;
-	if (state < MBR_JOINED)
-		return true;
 	if (state == MBR_JOINED && adv == ADV_IDLE)
 		return true;
 	if (state == MBR_ACTIVE && adv == ADV_ACTIVE)
@@ -437,9 +452,10 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len)
 	struct tipc_member *m = NULL;
 
 	/* If prev bcast was replicast, reject until all receivers have acked */
-	if (grp->bc_ackers)
+	if (grp->bc_ackers) {
+		grp->open = false;
 		return true;
-
+	}
 	if (list_empty(&grp->small_win))
 		return false;
 
@@ -754,9 +770,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 
 		/* Member can be taken into service */
 		m->state = MBR_JOINED;
-		*usr_wakeup = true;
-		m->usr_pending = false;
-		list_del_init(&m->small_win);
+		tipc_group_open(m, usr_wakeup);
 		tipc_group_update_member(m, 0);
 		tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 		tipc_group_create_event(grp, m, TIPC_PUBLISHED,
@@ -767,8 +781,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			return;
 		m->bc_syncpt = msg_grp_bc_syncpt(hdr);
 		list_del_init(&m->list);
-		list_del_init(&m->small_win);
-		*usr_wakeup = true;
+		tipc_group_open(m, usr_wakeup);
 		tipc_group_decr_active(grp, m);
 		m->state = MBR_LEAVING;
 		tipc_group_create_event(grp, m, TIPC_WITHDRAWN,
@@ -778,26 +791,25 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		if (!m)
 			return;
 		m->window += msg_adv_win(hdr);
-		*usr_wakeup = m->usr_pending;
-		m->usr_pending = false;
-		list_del_init(&m->small_win);
+		tipc_group_open(m, usr_wakeup);
 		return;
 	case GRP_ACK_MSG:
 		if (!m)
 			return;
 		m->bc_acked = msg_grp_bc_acked(hdr);
 		if (--grp->bc_ackers)
-			break;
+			return;
+		list_del_init(&m->small_win);
+		m->group->open = true;
 		*usr_wakeup = true;
-		m->usr_pending = false;
+		tipc_group_update_member(m, 0);
 		return;
 	case GRP_RECLAIM_MSG:
 		if (!m)
 			return;
-		*usr_wakeup = m->usr_pending;
-		m->usr_pending = false;
 		tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq);
 		m->window = ADV_IDLE;
+		tipc_group_open(m, usr_wakeup);
 		return;
 	case GRP_REMIT_MSG:
 		if (!m || m->state != MBR_RECLAIMING)
@@ -883,9 +895,7 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		/* Member can be taken into service */
 		m->instance = instance;
 		m->state = MBR_JOINED;
-		*usr_wakeup = true;
-		m->usr_pending = false;
-		list_del_init(&m->small_win);
+		tipc_group_open(m, usr_wakeup);
 		tipc_group_update_member(m, 0);
 		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
 		tipc_group_create_event(grp, m, TIPC_PUBLISHED,
@@ -895,12 +905,10 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		if (!m)
 			break;
 
-		*usr_wakeup = true;
-		m->usr_pending = false;
 		tipc_group_decr_active(grp, m);
 		m->state = MBR_LEAVING;
 		list_del_init(&m->list);
-		list_del_init(&m->small_win);
+		tipc_group_open(m, usr_wakeup);
 
 		/* Only send event if no LEAVE message can be expected */
 		if (!tipc_node_is_up(net, node))
diff --git a/net/tipc/group.h b/net/tipc/group.h
index dee79477d499..f4a596ed9848 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -67,9 +67,9 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack);
 bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport,
 		     int len, struct tipc_member **m);
 bool tipc_group_bc_cong(struct tipc_group *grp, int len);
+bool tipc_group_is_open(struct tipc_group *grp);
 void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 			       u32 port, struct sk_buff_head *xmitq);
 u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
 void tipc_group_update_member(struct tipc_member *m, int len);
-int tipc_group_size(struct tipc_group *grp);
 #endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index b24dab3996c9..1f236271766c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -715,7 +715,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	struct tipc_group *grp = tsk->group;
+	struct tipc_group *grp;
 	u32 revents = 0;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
@@ -736,9 +736,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			revents |= POLLIN | POLLRDNORM;
 		break;
 	case TIPC_OPEN:
-		if (!grp || tipc_group_size(grp))
-			if (!tsk->cong_link_cnt)
-				revents |= POLLOUT;
+		grp = tsk->group;
+		if ((!grp || tipc_group_is_open(grp)) && !tsk->cong_link_cnt)
+			revents |= POLLOUT;
 		if (!tipc_sk_type_connectionless(sk))
 			break;
 		if (skb_queue_empty(&sk->sk_receive_queue))
-- 
cgit v1.2.3


From 141b52a98ab45a835ff1ea869414faccdc255a72 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 10 Jan 2018 01:20:01 -0800
Subject: net: use the right variant of kfree

kvzalloc'ed memory should be kvfree'd.

Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index d7925ef8743d..852a54c769a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7645,7 +7645,7 @@ err_rxq_info:
 	/* Rollback successful reg's and free other resources */
 	while (i--)
 		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
-	kfree(dev->_rx);
+	kvfree(dev->_rx);
 	dev->_rx = NULL;
 	return err;
 }
-- 
cgit v1.2.3


From 82aaff2f63443e1d6cc4a186ed9c2a5718123906 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 10 Jan 2018 01:20:02 -0800
Subject: net: free RX queue structures

Looks like commit e817f85652c1 ("xdp: generic XDP handling of
xdp_rxq_info") replaced kvfree(dev->_rx) in free_netdev() with
a call to netif_free_rx_queues() which doesn't actually free
the rings?

While at it remove the unnecessary temporary variable.

Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 852a54c769a3..74e1e5d31337 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7653,16 +7653,15 @@ err_rxq_info:
 static void netif_free_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
-	struct netdev_rx_queue *rx;
 
 	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 	if (!dev->_rx)
 		return;
 
-	rx = dev->_rx;
-
 	for (i = 0; i < count; i++)
-		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
+
+	kvfree(dev->_rx);
 }
 
 static void netdev_init_one_queue(struct net_device *dev,
-- 
cgit v1.2.3


From fe19c04ca13737a48277fad28d912efbd72c1772 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 13:53:45 +0100
Subject: netfilter: nf_tables: remove nhooks field from struct nft_af_info

We already validate the hook through the hook bitmask, so this check is
superfluous. Removing it also fixes a bug in the new flowtable codebase:
ctx->afi points to the table family instead of the netdev family, which
is where the flowtable is really hooked in.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       | 2 --
 net/bridge/netfilter/nf_tables_bridge.c | 1 -
 net/ipv4/netfilter/nf_tables_arp.c      | 1 -
 net/ipv4/netfilter/nf_tables_ipv4.c     | 1 -
 net/ipv6/netfilter/nf_tables_ipv6.c     | 1 -
 net/netfilter/nf_tables_api.c           | 5 +----
 net/netfilter/nf_tables_inet.c          | 1 -
 net/netfilter/nf_tables_netdev.c        | 1 -
 8 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index dd238950df81..536aaec96a07 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -973,7 +973,6 @@ enum nft_af_flags {
  *
  *	@list: used internally
  *	@family: address family
- *	@nhooks: number of hooks in this family
  *	@owner: module owner
  *	@tables: used internally
  *	@flags: family flags
@@ -981,7 +980,6 @@ enum nft_af_flags {
 struct nft_af_info {
 	struct list_head		list;
 	int				family;
-	unsigned int			nhooks;
 	struct module			*owner;
 	struct list_head		tables;
 	u32				flags;
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 86774b5c3b73..66c97b1e3303 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -44,7 +44,6 @@ nft_do_chain_bridge(void *priv,
 
 static struct nft_af_info nft_af_bridge __read_mostly = {
 	.family		= NFPROTO_BRIDGE,
-	.nhooks		= NF_BR_NUMHOOKS,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index f84c17763f6f..f9089b2ad905 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -29,7 +29,6 @@ nft_do_chain_arp(void *priv,
 
 static struct nft_af_info nft_af_arp __read_mostly = {
 	.family		= NFPROTO_ARP,
-	.nhooks		= NF_ARP_NUMHOOKS,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index f4675253f1e6..a98f2de63771 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -32,7 +32,6 @@ static unsigned int nft_do_chain_ipv4(void *priv,
 
 static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.family		= NFPROTO_IPV4,
-	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 9cd45b964123..bddd39dc1cf3 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -30,7 +30,6 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 
 static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.family		= NFPROTO_IPV6,
-	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 336b81689ac9..93e4e67e4b4d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1328,9 +1328,6 @@ static int nft_chain_parse_hook(struct net *net,
 		return -EINVAL;
 
 	hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
-	if (hook->num >= afi->nhooks)
-		return -EINVAL;
-
 	hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 
 	type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
@@ -4993,7 +4990,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		return -EINVAL;
 
 	hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
-	if (hooknum >= ctx->afi->nhooks)
+	if (hooknum != NF_NETDEV_INGRESS)
 		return -EINVAL;
 
 	priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 58b9be7480bb..00b1fc9cea2e 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -40,7 +40,6 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
 
 static struct nft_af_info nft_af_inet __read_mostly = {
 	.family		= NFPROTO_INET,
-	.nhooks		= NF_INET_NUMHOOKS,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 42f6f6d42a6d..3da3dc7de945 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -40,7 +40,6 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 
 static struct nft_af_info nft_af_netdev __read_mostly = {
 	.family		= NFPROTO_NETDEV,
-	.nhooks		= NF_NETDEV_NUMHOOKS,
 	.owner		= THIS_MODULE,
 	.flags		= NFT_AF_NEEDS_DEV,
 };
-- 
cgit v1.2.3


From e7bb5c714020a2dce85b12766899f528883585ac Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 14:07:52 +0100
Subject: netfilter: nf_tables: remove flag field from struct nft_af_info

Replace it by a direct check for the netdev protocol family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 6 ------
 net/netfilter/nf_tables_api.c     | 2 +-
 net/netfilter/nf_tables_netdev.c  | 1 -
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 536aaec96a07..9a85893a5e30 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -964,10 +964,6 @@ struct nft_table {
 	char				*name;
 };
 
-enum nft_af_flags {
-	NFT_AF_NEEDS_DEV	= (1 << 0),
-};
-
 /**
  *	struct nft_af_info - nf_tables address family info
  *
@@ -975,14 +971,12 @@ enum nft_af_flags {
  *	@family: address family
  *	@owner: module owner
  *	@tables: used internally
- *	@flags: family flags
  */
 struct nft_af_info {
 	struct list_head		list;
 	int				family;
 	struct module			*owner;
 	struct list_head		tables;
-	u32				flags;
 };
 
 int nft_register_afinfo(struct net *, struct nft_af_info *);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 93e4e67e4b4d..a1b73d39dd71 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1350,7 +1350,7 @@ static int nft_chain_parse_hook(struct net *net,
 	hook->type = type;
 
 	hook->dev = NULL;
-	if (afi->flags & NFT_AF_NEEDS_DEV) {
+	if (afi->family == NFPROTO_NETDEV) {
 		char ifname[IFNAMSIZ];
 
 		if (!ha[NFTA_HOOK_DEV]) {
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 3da3dc7de945..c7f671daa7d0 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -41,7 +41,6 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 static struct nft_af_info nft_af_netdev __read_mostly = {
 	.family		= NFPROTO_NETDEV,
 	.owner		= THIS_MODULE,
-	.flags		= NFT_AF_NEEDS_DEV,
 };
 
 static int nf_tables_netdev_init_net(struct net *net)
-- 
cgit v1.2.3


From c9c17211ec2f36467369a9abf48e8322ad22e856 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 12:17:52 +0100
Subject: netfilter: nf_tables: no need for struct nft_af_info to
 enable/disable table

nf_tables_table_enable() and nf_tables_table_disable() take a pointer to
struct nft_af_info that is never used, remove it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a1b73d39dd71..64cca37018a8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -611,10 +611,7 @@ err:
 	return err;
 }
 
-static void _nf_tables_table_disable(struct net *net,
-				     const struct nft_af_info *afi,
-				     struct nft_table *table,
-				     u32 cnt)
+static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
 {
 	struct nft_chain *chain;
 	u32 i = 0;
@@ -632,9 +629,7 @@ static void _nf_tables_table_disable(struct net *net,
 	}
 }
 
-static int nf_tables_table_enable(struct net *net,
-				  const struct nft_af_info *afi,
-				  struct nft_table *table)
+static int nf_tables_table_enable(struct net *net, struct nft_table *table)
 {
 	struct nft_chain *chain;
 	int err, i = 0;
@@ -654,15 +649,13 @@ static int nf_tables_table_enable(struct net *net,
 	return 0;
 err:
 	if (i)
-		_nf_tables_table_disable(net, afi, table, i);
+		nft_table_disable(net, table, i);
 	return err;
 }
 
-static void nf_tables_table_disable(struct net *net,
-				    const struct nft_af_info *afi,
-				    struct nft_table *table)
+static void nf_tables_table_disable(struct net *net, struct nft_table *table)
 {
-	_nf_tables_table_disable(net, afi, table, 0);
+	nft_table_disable(net, table, 0);
 }
 
 static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -691,7 +684,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 		nft_trans_table_enable(trans) = false;
 	} else if (!(flags & NFT_TABLE_F_DORMANT) &&
 		   ctx->table->flags & NFT_TABLE_F_DORMANT) {
-		ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table);
+		ret = nf_tables_table_enable(ctx->net, ctx->table);
 		if (ret >= 0) {
 			ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
 			nft_trans_table_enable(trans) = true;
@@ -5795,7 +5788,6 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			if (nft_trans_table_update(trans)) {
 				if (!nft_trans_table_enable(trans)) {
 					nf_tables_table_disable(net,
-								trans->ctx.afi,
 								trans->ctx.table);
 					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 				}
@@ -5957,7 +5949,6 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			if (nft_trans_table_update(trans)) {
 				if (nft_trans_table_enable(trans)) {
 					nf_tables_table_disable(net,
-								trans->ctx.afi,
 								trans->ctx.table);
 					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 				}
-- 
cgit v1.2.3


From 1ea26cca52e46c0f29ee9fdd567312ba93a7d651 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 13:40:22 +0100
Subject: netfilter: nf_tables: remove struct nft_af_info parameter in
 nf_tables_chain_type_lookup()

Pass family number instead, this comes in preparation for the removal of
struct nft_af_info.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 64cca37018a8..9efcbe27789d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -423,7 +423,7 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
 
 static const struct nf_chain_type *
-__nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
 {
 	int i;
 
@@ -436,22 +436,20 @@ __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
 }
 
 static const struct nf_chain_type *
-nf_tables_chain_type_lookup(const struct nft_af_info *afi,
-			    const struct nlattr *nla,
-			    bool autoload)
+nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
 {
 	const struct nf_chain_type *type;
 
-	type = __nf_tables_chain_type_lookup(afi->family, nla);
+	type = __nf_tables_chain_type_lookup(nla, family);
 	if (type != NULL)
 		return type;
 #ifdef CONFIG_MODULES
 	if (autoload) {
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-chain-%u-%.*s", afi->family,
+		request_module("nft-chain-%u-%.*s", family,
 			       nla_len(nla), (const char *)nla_data(nla));
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		type = __nf_tables_chain_type_lookup(afi->family, nla);
+		type = __nf_tables_chain_type_lookup(nla, family);
 		if (type != NULL)
 			return ERR_PTR(-EAGAIN);
 	}
@@ -1325,8 +1323,8 @@ static int nft_chain_parse_hook(struct net *net,
 
 	type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
 	if (nla[NFTA_CHAIN_TYPE]) {
-		type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE],
-						   create);
+		type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+						   afi->family, create);
 		if (IS_ERR(type))
 			return PTR_ERR(type);
 	}
-- 
cgit v1.2.3


From 36596dadf54a920d26286cf9f421fb4ef648b51f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jan 2018 02:38:03 +0100
Subject: netfilter: nf_tables: add single table list for all families

Place all existing user defined tables in struct net *, instead of
having one list per family. This saves us from one level of indentation
in netlink dump functions.

Place a pointer to struct nft_af_info in struct nft_table temporarily, as
we still need it to drop the module reference counter on table
removal.

This patch comes in preparation for the removal of struct nft_af_info.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   8 +-
 include/net/netns/nftables.h      |   1 +
 net/netfilter/nf_tables_api.c     | 509 ++++++++++++++++++--------------------
 net/netfilter/nf_tables_netdev.c  |  21 +-
 net/netfilter/nft_compat.c        |  16 +-
 net/netfilter/nft_ct.c            |  16 +-
 net/netfilter/nft_flow_offload.c  |   4 +-
 net/netfilter/nft_log.c           |   4 +-
 net/netfilter/nft_masq.c          |   2 +-
 net/netfilter/nft_meta.c          |   4 +-
 net/netfilter/nft_nat.c           |   2 +-
 net/netfilter/nft_redir.c         |   2 +-
 12 files changed, 286 insertions(+), 303 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9a85893a5e30..c55e836e6a2f 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -143,22 +143,22 @@ static inline void nft_data_debug(const struct nft_data *data)
  *	struct nft_ctx - nf_tables rule/set context
  *
  *	@net: net namespace
- * 	@afi: address family info
  * 	@table: the table the chain is contained in
  * 	@chain: the chain the rule is contained in
  *	@nla: netlink attributes
  *	@portid: netlink portID of the original message
  *	@seq: netlink sequence number
+ *	@family: protocol family
  *	@report: notify via unicast netlink message
  */
 struct nft_ctx {
 	struct net			*net;
-	struct nft_af_info		*afi;
 	struct nft_table		*table;
 	struct nft_chain		*chain;
 	const struct nlattr * const 	*nla;
 	u32				portid;
 	u32				seq;
+	u8				family;
 	bool				report;
 };
 
@@ -949,6 +949,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
  *	@genmask: generation mask
+ *	@afi: address family info
  *	@name: name of the table
  */
 struct nft_table {
@@ -961,6 +962,7 @@ struct nft_table {
 	u32				use;
 	u16				flags:14,
 					genmask:2;
+	struct nft_af_info		*afi;
 	char				*name;
 };
 
@@ -970,13 +972,11 @@ struct nft_table {
  *	@list: used internally
  *	@family: address family
  *	@owner: module owner
- *	@tables: used internally
  */
 struct nft_af_info {
 	struct list_head		list;
 	int				family;
 	struct module			*owner;
-	struct list_head		tables;
 };
 
 int nft_register_afinfo(struct net *, struct nft_af_info *);
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 4109b5f3010f..7f86a63ac21f 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -8,6 +8,7 @@ struct nft_af_info;
 
 struct netns_nftables {
 	struct list_head	af_info;
+	struct list_head	tables;
 	struct list_head	commit_list;
 	struct nft_af_info	*ipv4;
 	struct nft_af_info	*ipv6;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9efcbe27789d..084d1f553c46 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -37,7 +37,6 @@ static LIST_HEAD(nf_tables_flowtables);
  */
 int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
 {
-	INIT_LIST_HEAD(&afi->tables);
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	list_add_tail_rcu(&afi->list, &net->nft.af_info);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
@@ -99,13 +98,13 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
 			 const struct sk_buff *skb,
 			 const struct nlmsghdr *nlh,
-			 struct nft_af_info *afi,
+			 u8 family,
 			 struct nft_table *table,
 			 struct nft_chain *chain,
 			 const struct nlattr * const *nla)
 {
 	ctx->net	= net;
-	ctx->afi	= afi;
+	ctx->family	= family;
 	ctx->table	= table;
 	ctx->chain	= chain;
 	ctx->nla   	= nla;
@@ -385,30 +384,31 @@ static int nft_delflowtable(struct nft_ctx *ctx,
  * Tables
  */
 
-static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+static struct nft_table *nft_table_lookup(const struct net *net,
 					  const struct nlattr *nla,
-					  u8 genmask)
+					  u8 family, u8 genmask)
 {
 	struct nft_table *table;
 
-	list_for_each_entry(table, &afi->tables, list) {
+	list_for_each_entry(table, &net->nft.tables, list) {
 		if (!nla_strcmp(nla, table->name) &&
+		    table->afi->family == family &&
 		    nft_active_genmask(table, genmask))
 			return table;
 	}
 	return NULL;
 }
 
-static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+static struct nft_table *nf_tables_table_lookup(const struct net *net,
 						const struct nlattr *nla,
-						u8 genmask)
+						u8 family, u8 genmask)
 {
 	struct nft_table *table;
 
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	table = nft_table_lookup(afi, nla, genmask);
+	table = nft_table_lookup(net, nla, family, genmask);
 	if (table != NULL)
 		return table;
 
@@ -507,7 +507,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 		goto err;
 
 	err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
-					event, 0, ctx->afi->family, ctx->table);
+					event, 0, ctx->family, ctx->table);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -524,7 +524,6 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 				 struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	unsigned int idx = 0, s_idx = cb->args[0];
 	struct net *net = sock_net(skb->sk);
@@ -533,30 +532,27 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (family != NFPROTO_UNSPEC && family != afi->family)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (family != NFPROTO_UNSPEC && family != table->afi->family)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			if (idx < s_idx)
-				goto cont;
-			if (idx > s_idx)
-				memset(&cb->args[1], 0,
-				       sizeof(cb->args) - sizeof(cb->args[0]));
-			if (!nft_is_active(net, table))
-				continue;
-			if (nf_tables_fill_table_info(skb, net,
-						      NETLINK_CB(cb->skb).portid,
-						      cb->nlh->nlmsg_seq,
-						      NFT_MSG_NEWTABLE,
-						      NLM_F_MULTI,
-						      afi->family, table) < 0)
-				goto done;
-
-			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+		if (idx < s_idx)
+			goto cont;
+		if (idx > s_idx)
+			memset(&cb->args[1], 0,
+			       sizeof(cb->args) - sizeof(cb->args[0]));
+		if (!nft_is_active(net, table))
+			continue;
+		if (nf_tables_fill_table_info(skb, net,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      NFT_MSG_NEWTABLE, NLM_F_MULTI,
+					      table->afi->family, table) < 0)
+			goto done;
+
+		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-			idx++;
-		}
+		idx++;
 	}
 done:
 	rcu_read_unlock();
@@ -588,7 +584,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -719,7 +716,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(afi);
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(afi, name, genmask);
+	table = nf_tables_table_lookup(net, name, afi->family, genmask);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -729,7 +726,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+		nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 		return nf_tables_updtable(&ctx);
 	}
 
@@ -756,14 +753,15 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
 	INIT_LIST_HEAD(&table->flowtables);
+	table->afi = afi;
 	table->flags = flags;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
 	if (err < 0)
 		goto err4;
 
-	list_add_tail_rcu(&table->list, &afi->tables);
+	list_add_tail_rcu(&table->list, &net->nft.tables);
 	return 0;
 err4:
 	kfree(table->name);
@@ -837,30 +835,28 @@ out:
 
 static int nft_flush(struct nft_ctx *ctx, int family)
 {
-	struct nft_af_info *afi;
 	struct nft_table *table, *nt;
 	const struct nlattr * const *nla = ctx->nla;
 	int err = 0;
 
-	list_for_each_entry(afi, &ctx->net->nft.af_info, list) {
-		if (family != AF_UNSPEC && afi->family != family)
+	list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) {
+		if (family != AF_UNSPEC && table->afi->family != family)
 			continue;
 
-		ctx->afi = afi;
-		list_for_each_entry_safe(table, nt, &afi->tables, list) {
-			if (!nft_is_active_next(ctx->net, table))
-				continue;
+		ctx->family = table->afi->family;
 
-			if (nla[NFTA_TABLE_NAME] &&
-			    nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
-				continue;
+		if (!nft_is_active_next(ctx->net, table))
+			continue;
 
-			ctx->table = table;
+		if (nla[NFTA_TABLE_NAME] &&
+		    nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
+			continue;
 
-			err = nft_flush_table(ctx);
-			if (err < 0)
-				goto out;
-		}
+		ctx->table = table;
+
+		err = nft_flush_table(ctx);
+		if (err < 0)
+			goto out;
 	}
 out:
 	return err;
@@ -878,7 +874,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	int family = nfmsg->nfgen_family;
 	struct nft_ctx ctx;
 
-	nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
 	if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
 		return nft_flush(&ctx, family);
 
@@ -886,7 +882,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -894,7 +891,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	    table->use > 0)
 		return -EBUSY;
 
-	ctx.afi = afi;
+	ctx.family = afi->family;
 	ctx.table = table;
 
 	return nft_flush_table(&ctx);
@@ -906,7 +903,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 
 	kfree(ctx->table->name);
 	kfree(ctx->table);
-	module_put(ctx->afi->owner);
+	module_put(ctx->table->afi->owner);
 }
 
 int nft_register_chain_type(const struct nf_chain_type *ctype)
@@ -1107,7 +1104,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 		goto err;
 
 	err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
-					event, 0, ctx->afi->family, ctx->table,
+					event, 0, ctx->family, ctx->table,
 					ctx->chain);
 	if (err < 0) {
 		kfree_skb(skb);
@@ -1125,7 +1122,6 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 				 struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	unsigned int idx = 0, s_idx = cb->args[0];
@@ -1135,31 +1131,30 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (family != NFPROTO_UNSPEC && family != afi->family)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (family != NFPROTO_UNSPEC && family != table->afi->family)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			list_for_each_entry_rcu(chain, &table->chains, list) {
-				if (idx < s_idx)
-					goto cont;
-				if (idx > s_idx)
-					memset(&cb->args[1], 0,
-					       sizeof(cb->args) - sizeof(cb->args[0]));
-				if (!nft_is_active(net, chain))
-					continue;
-				if (nf_tables_fill_chain_info(skb, net,
-							      NETLINK_CB(cb->skb).portid,
-							      cb->nlh->nlmsg_seq,
-							      NFT_MSG_NEWCHAIN,
-							      NLM_F_MULTI,
-							      afi->family, table, chain) < 0)
-					goto done;
+		list_for_each_entry_rcu(chain, &table->chains, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				memset(&cb->args[1], 0,
+				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (!nft_is_active(net, chain))
+				continue;
+			if (nf_tables_fill_chain_info(skb, net,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      NFT_MSG_NEWCHAIN,
+						      NLM_F_MULTI,
+						      table->afi->family, table,
+						      chain) < 0)
+				goto done;
 
-				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-				idx++;
-			}
+			idx++;
 		}
 	}
 done:
@@ -1193,7 +1188,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1301,8 +1297,8 @@ struct nft_chain_hook {
 
 static int nft_chain_parse_hook(struct net *net,
 				const struct nlattr * const nla[],
-				struct nft_af_info *afi,
-				struct nft_chain_hook *hook, bool create)
+				struct nft_chain_hook *hook, u8 family,
+				bool create)
 {
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
 	const struct nf_chain_type *type;
@@ -1321,10 +1317,10 @@ static int nft_chain_parse_hook(struct net *net,
 	hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
 	hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 
-	type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
+	type = chain_type[family][NFT_CHAIN_T_DEFAULT];
 	if (nla[NFTA_CHAIN_TYPE]) {
 		type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
-						   afi->family, create);
+						   family, create);
 		if (IS_ERR(type))
 			return PTR_ERR(type);
 	}
@@ -1341,7 +1337,7 @@ static int nft_chain_parse_hook(struct net *net,
 	hook->type = type;
 
 	hook->dev = NULL;
-	if (afi->family == NFPROTO_NETDEV) {
+	if (family == NFPROTO_NETDEV) {
 		char ifname[IFNAMSIZ];
 
 		if (!ha[NFTA_HOOK_DEV]) {
@@ -1376,7 +1372,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 {
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_table *table = ctx->table;
-	struct nft_af_info *afi = ctx->afi;
 	struct nft_base_chain *basechain;
 	struct nft_stats __percpu *stats;
 	struct net *net = ctx->net;
@@ -1390,7 +1385,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		struct nft_chain_hook hook;
 		struct nf_hook_ops *ops;
 
-		err = nft_chain_parse_hook(net, nla, afi, &hook, create);
+		err = nft_chain_parse_hook(net, nla, &hook, family, create);
 		if (err < 0)
 			return err;
 
@@ -1483,7 +1478,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		if (!nft_is_base_chain(chain))
 			return -EBUSY;
 
-		err = nft_chain_parse_hook(ctx->net, nla, ctx->afi, &hook,
+		err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
 					   create);
 		if (err < 0)
 			return err;
@@ -1576,7 +1571,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1616,7 +1612,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		}
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
 
 	if (chain != NULL) {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
@@ -1650,7 +1646,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1662,7 +1659,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	    chain->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
 
 	use = chain->use;
 	list_for_each_entry(rule, &chain->rules, list) {
@@ -1827,7 +1824,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
+	type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
 	if (IS_ERR(type))
 		return PTR_ERR(type);
 
@@ -2050,7 +2047,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
 		goto err;
 
 	err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
-				       event, 0, ctx->afi->family, ctx->table,
+				       event, 0, ctx->family, ctx->table,
 				       ctx->chain, rule);
 	if (err < 0) {
 		kfree_skb(skb);
@@ -2074,7 +2071,6 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	const struct nft_rule_dump_ctx *ctx = cb->data;
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
@@ -2085,39 +2081,37 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (family != NFPROTO_UNSPEC && family != afi->family)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+			continue;
+
+		if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			if (ctx && ctx->table &&
-			    strcmp(ctx->table, table->name) != 0)
+		list_for_each_entry_rcu(chain, &table->chains, list) {
+			if (ctx && ctx->chain &&
+			    strcmp(ctx->chain, chain->name) != 0)
 				continue;
 
-			list_for_each_entry_rcu(chain, &table->chains, list) {
-				if (ctx && ctx->chain &&
-				    strcmp(ctx->chain, chain->name) != 0)
-					continue;
-
-				list_for_each_entry_rcu(rule, &chain->rules, list) {
-					if (!nft_is_active(net, rule))
-						goto cont;
-					if (idx < s_idx)
-						goto cont;
-					if (idx > s_idx)
-						memset(&cb->args[1], 0,
-						       sizeof(cb->args) - sizeof(cb->args[0]));
-					if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
-								      cb->nlh->nlmsg_seq,
-								      NFT_MSG_NEWRULE,
-								      NLM_F_MULTI | NLM_F_APPEND,
-								      afi->family, table, chain, rule) < 0)
-						goto done;
-
-					nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+			list_for_each_entry_rcu(rule, &chain->rules, list) {
+				if (!nft_is_active(net, rule))
+					goto cont;
+				if (idx < s_idx)
+					goto cont;
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
+							      cb->nlh->nlmsg_seq,
+							      NFT_MSG_NEWRULE,
+							      NLM_F_MULTI | NLM_F_APPEND,
+							      table->afi->family,
+							      table, chain, rule) < 0)
+					goto done;
+
+				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-					idx++;
-				}
+				idx++;
 			}
 		}
 	}
@@ -2195,7 +2189,8 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2272,7 +2267,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2311,7 +2307,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			return PTR_ERR(old_rule);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
 
 	n = 0;
 	size = 0;
@@ -2446,7 +2442,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2457,7 +2454,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			return PTR_ERR(chain);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
 
 	if (chain) {
 		if (nla[NFTA_RULE_HANDLE]) {
@@ -2650,13 +2647,13 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 		if (afi == NULL)
 			return -EAFNOSUPPORT;
 
-		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE],
-					       genmask);
+		table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE],
+					       afi->family, genmask);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
 
-	nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(ctx, net, skb, nlh, afi->family, table, NULL, nla);
 	return 0;
 }
 
@@ -2783,7 +2780,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		goto nla_put_failure;
 
 	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= ctx->afi->family;
+	nfmsg->nfgen_family	= ctx->family;
 	nfmsg->version		= NFNETLINK_V0;
 	nfmsg->res_id		= htons(ctx->net->nft.base_seq & 0xffff);
 
@@ -2875,10 +2872,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nft_set *set;
 	unsigned int idx, s_idx = cb->args[0];
-	struct nft_af_info *afi;
 	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
 	struct net *net = sock_net(skb->sk);
-	int cur_family = cb->args[3];
 	struct nft_ctx *ctx = cb->data, ctx_set;
 
 	if (cb->args[1])
@@ -2887,51 +2882,44 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (ctx->afi && ctx->afi != afi)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (ctx->family != NFPROTO_UNSPEC &&
+		    ctx->family != table->afi->family)
+			continue;
+
+		if (ctx->table && ctx->table != table)
 			continue;
 
-		if (cur_family) {
-			if (afi->family != cur_family)
+		if (cur_table) {
+			if (cur_table != table)
 				continue;
 
-			cur_family = 0;
+			cur_table = NULL;
 		}
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			if (ctx->table && ctx->table != table)
-				continue;
+		idx = 0;
+		list_for_each_entry_rcu(set, &table->sets, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (!nft_is_active(net, set))
+				goto cont;
 
-			if (cur_table) {
-				if (cur_table != table)
-					continue;
+			ctx_set = *ctx;
+			ctx_set.table = table;
+			ctx_set.family = table->afi->family;
 
-				cur_table = NULL;
+			if (nf_tables_fill_set(skb, &ctx_set, set,
+					       NFT_MSG_NEWSET,
+					       NLM_F_MULTI) < 0) {
+				cb->args[0] = idx;
+				cb->args[2] = (unsigned long) table;
+				goto done;
 			}
-			idx = 0;
-			list_for_each_entry_rcu(set, &table->sets, list) {
-				if (idx < s_idx)
-					goto cont;
-				if (!nft_is_active(net, set))
-					goto cont;
-
-				ctx_set = *ctx;
-				ctx_set.table = table;
-				ctx_set.afi = afi;
-				if (nf_tables_fill_set(skb, &ctx_set, set,
-						       NFT_MSG_NEWSET,
-						       NLM_F_MULTI) < 0) {
-					cb->args[0] = idx;
-					cb->args[2] = (unsigned long) table;
-					cb->args[3] = afi->family;
-					goto done;
-				}
-				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-				idx++;
-			}
-			if (s_idx)
-				s_idx = 0;
+			idx++;
 		}
+		if (s_idx)
+			s_idx = 0;
 	}
 	cb->args[1] = 1;
 done:
@@ -3141,11 +3129,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 
 	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
@@ -3410,12 +3399,12 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE],
-				       genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE],
+				       afi->family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(ctx, net, skb, nlh, afi->family, table, NULL, nla);
 	return 0;
 }
 
@@ -3520,7 +3509,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct nft_set_dump_ctx *dump_ctx = cb->data;
 	struct net *net = sock_net(skb->sk);
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nft_set_dump_args args;
@@ -3532,21 +3520,19 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	int event;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (afi != dump_ctx->ctx.afi)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
+		    dump_ctx->ctx.family != table->afi->family)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			if (table != dump_ctx->ctx.table)
-				continue;
+		if (table != dump_ctx->ctx.table)
+			continue;
 
-			list_for_each_entry_rcu(set, &table->sets, list) {
-				if (set == dump_ctx->set) {
-					set_found = true;
-					break;
-				}
+		list_for_each_entry_rcu(set, &table->sets, list) {
+			if (set == dump_ctx->set) {
+				set_found = true;
+				break;
 			}
-			break;
 		}
 		break;
 	}
@@ -3566,7 +3552,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 		goto nla_put_failure;
 
 	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = afi->family;
+	nfmsg->nfgen_family = table->afi->family;
 	nfmsg->version      = NFNETLINK_V0;
 	nfmsg->res_id	    = htons(net->nft.base_seq & 0xffff);
 
@@ -3629,7 +3615,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
 		goto nla_put_failure;
 
 	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= ctx->afi->family;
+	nfmsg->nfgen_family	= ctx->family;
 	nfmsg->version		= NFNETLINK_V0;
 	nfmsg->res_id		= htons(ctx->net->nft.base_seq & 0xffff);
 
@@ -3986,7 +3972,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		list_for_each_entry(binding, &set->bindings, list) {
 			struct nft_ctx bind_ctx = {
 				.net	= ctx->net,
-				.afi	= ctx->afi,
+				.family	= ctx->family,
 				.table	= ctx->table,
 				.chain	= (struct nft_chain *)binding->chain,
 			};
@@ -4533,7 +4519,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -4551,7 +4538,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		return 0;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 
 	type = nft_obj_type_get(objtype);
 	if (IS_ERR(type))
@@ -4628,7 +4615,6 @@ struct nft_obj_filter {
 static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	unsigned int idx = 0, s_idx = cb->args[0];
 	struct nft_obj_filter *filter = cb->data;
@@ -4643,38 +4629,37 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (family != NFPROTO_UNSPEC && family != afi->family)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (family != NFPROTO_UNSPEC && family != table->afi->family)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			list_for_each_entry_rcu(obj, &table->objects, list) {
-				if (!nft_is_active(net, obj))
-					goto cont;
-				if (idx < s_idx)
-					goto cont;
-				if (idx > s_idx)
-					memset(&cb->args[1], 0,
-					       sizeof(cb->args) - sizeof(cb->args[0]));
-				if (filter && filter->table[0] &&
-				    strcmp(filter->table, table->name))
-					goto cont;
-				if (filter &&
-				    filter->type != NFT_OBJECT_UNSPEC &&
-				    obj->ops->type->type != filter->type)
-					goto cont;
+		list_for_each_entry_rcu(obj, &table->objects, list) {
+			if (!nft_is_active(net, obj))
+				goto cont;
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				memset(&cb->args[1], 0,
+				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (filter && filter->table[0] &&
+			    strcmp(filter->table, table->name))
+				goto cont;
+			if (filter &&
+			    filter->type != NFT_OBJECT_UNSPEC &&
+			    obj->ops->type->type != filter->type)
+				goto cont;
 
-				if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
-							    cb->nlh->nlmsg_seq,
-							    NFT_MSG_NEWOBJ,
-							    NLM_F_MULTI | NLM_F_APPEND,
-							    afi->family, table, obj, reset) < 0)
-					goto done;
+			if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
+						    cb->nlh->nlmsg_seq,
+						    NFT_MSG_NEWOBJ,
+						    NLM_F_MULTI | NLM_F_APPEND,
+						    table->afi->family, table,
+						    obj, reset) < 0)
+				goto done;
 
-				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-				idx++;
-			}
+			idx++;
 		}
 	}
 done:
@@ -4761,7 +4746,8 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -4821,7 +4807,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -4832,7 +4819,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	if (obj->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 
 	return nft_delobj(&ctx, obj);
 }
@@ -4870,7 +4857,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx,
 				 struct nft_object *obj, int event)
 {
 	nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
-		       ctx->afi->family, ctx->report, GFP_KERNEL);
+		       ctx->family, ctx->report, GFP_KERNEL);
 }
 
 /*
@@ -5060,7 +5047,7 @@ void nft_flow_table_iterate(struct net *net,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		list_for_each_entry_rcu(table, &afi->tables, list) {
+		list_for_each_entry_rcu(table, &net->nft.tables, list) {
 			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
 				iter(&flowtable->data, data);
 			}
@@ -5108,7 +5095,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
+				       afi->family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -5125,7 +5113,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		return 0;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 
 	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
 	if (!flowtable)
@@ -5206,7 +5194,8 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
+				       afi->family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -5217,7 +5206,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	if (flowtable->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
 
 	return nft_delflowtable(&ctx, flowtable);
 }
@@ -5286,40 +5275,37 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 
 	rcu_read_lock();
 	cb->seq = net->nft.base_seq;
 
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		if (family != NFPROTO_UNSPEC && family != afi->family)
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (family != NFPROTO_UNSPEC && family != table->afi->family)
 			continue;
 
-		list_for_each_entry_rcu(table, &afi->tables, list) {
-			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
-				if (!nft_is_active(net, flowtable))
-					goto cont;
-				if (idx < s_idx)
-					goto cont;
-				if (idx > s_idx)
-					memset(&cb->args[1], 0,
-					       sizeof(cb->args) - sizeof(cb->args[0]));
-				if (filter && filter->table[0] &&
-				    strcmp(filter->table, table->name))
-					goto cont;
+		list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+			if (!nft_is_active(net, flowtable))
+				goto cont;
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				memset(&cb->args[1], 0,
+				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (filter && filter->table[0] &&
+			    strcmp(filter->table, table->name))
+				goto cont;
 
-				if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
-								  cb->nlh->nlmsg_seq,
-								  NFT_MSG_NEWFLOWTABLE,
-								  NLM_F_MULTI | NLM_F_APPEND,
-								  afi->family, flowtable) < 0)
-					goto done;
+			if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
+							  cb->nlh->nlmsg_seq,
+							  NFT_MSG_NEWFLOWTABLE,
+							  NLM_F_MULTI | NLM_F_APPEND,
+							  table->afi->family, flowtable) < 0)
+				goto done;
 
-				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
-				idx++;
-			}
+			idx++;
 		}
 	}
 done:
@@ -5402,7 +5388,8 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
+				       afi->family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -5445,7 +5432,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 	err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
 					    ctx->seq, event, 0,
-					    ctx->afi->family, flowtable);
+					    ctx->family, flowtable);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -5523,17 +5510,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct nft_flowtable *flowtable;
 	struct nft_table *table;
-	struct nft_af_info *afi;
 
 	if (event != NETDEV_UNREGISTER)
 		return 0;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
-		list_for_each_entry(table, &afi->tables, list) {
-			list_for_each_entry(flowtable, &table->flowtables, list) {
-				nft_flowtable_event(event, dev, flowtable);
-			}
+	list_for_each_entry(table, &dev_net(dev)->nft.tables, list) {
+		list_for_each_entry(flowtable, &table->flowtables, list) {
+			nft_flowtable_event(event, dev, flowtable);
 		}
 	}
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
@@ -6552,6 +6536,7 @@ EXPORT_SYMBOL_GPL(nft_data_dump);
 static int __net_init nf_tables_init_net(struct net *net)
 {
 	INIT_LIST_HEAD(&net->nft.af_info);
+	INIT_LIST_HEAD(&net->nft.tables);
 	INIT_LIST_HEAD(&net->nft.commit_list);
 	net->nft.base_seq = 1;
 	return 0;
@@ -6594,10 +6579,10 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 	struct nft_set *set, *ns;
 	struct nft_ctx ctx = {
 		.net	= net,
-		.afi	= afi,
+		.family	= afi->family,
 	};
 
-	list_for_each_entry_safe(table, nt, &afi->tables, list) {
+	list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
 		list_for_each_entry(chain, &table->chains, list)
 			nf_tables_unregister_hook(net, table, chain);
 		list_for_each_entry(flowtable, &table->flowtables, list)
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index c7f671daa7d0..01b61a67a2ac 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -107,7 +107,6 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 				  unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain, *nr;
 	struct nft_ctx ctx = {
@@ -119,20 +118,18 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
-		ctx.afi = afi;
-		if (afi->family != NFPROTO_NETDEV)
+	list_for_each_entry(table, &ctx.net->nft.tables, list) {
+		if (table->afi->family != NFPROTO_NETDEV)
 			continue;
 
-		list_for_each_entry(table, &afi->tables, list) {
-			ctx.table = table;
-			list_for_each_entry_safe(chain, nr, &table->chains, list) {
-				if (!nft_is_base_chain(chain))
-					continue;
+		ctx.family = table->afi->family;
+		ctx.table = table;
+		list_for_each_entry_safe(chain, nr, &table->chains, list) {
+			if (!nft_is_base_chain(chain))
+				continue;
 
-				ctx.chain = chain;
-				nft_netdev_event(event, dev, &ctx);
-			}
+			ctx.chain = chain;
+			nft_netdev_event(event, dev, &ctx);
 		}
 	}
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index dcff0dc8d28b..7fa17e241c14 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -144,7 +144,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 {
 	par->net	= ctx->net;
 	par->table	= ctx->table->name;
-	switch (ctx->afi->family) {
+	switch (ctx->family) {
 	case AF_INET:
 		entry->e4.ip.proto = proto;
 		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
@@ -175,7 +175,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 	} else {
 		par->hook_mask = 0;
 	}
-	par->family	= ctx->afi->family;
+	par->family	= ctx->family;
 	par->nft_compat = true;
 }
 
@@ -267,7 +267,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	par.net = ctx->net;
 	par.target = target;
 	par.targinfo = info;
-	par.family = ctx->afi->family;
+	par.family = ctx->family;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 
@@ -358,7 +358,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 {
 	par->net	= ctx->net;
 	par->table	= ctx->table->name;
-	switch (ctx->afi->family) {
+	switch (ctx->family) {
 	case AF_INET:
 		entry->e4.ip.proto = proto;
 		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
@@ -389,7 +389,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 	} else {
 		par->hook_mask = 0;
 	}
-	par->family	= ctx->afi->family;
+	par->family	= ctx->family;
 	par->nft_compat = true;
 }
 
@@ -446,7 +446,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	par.net = ctx->net;
 	par.match = match;
 	par.matchinfo = info;
-	par.family = ctx->afi->family;
+	par.family = ctx->family;
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 
@@ -648,7 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	mt_name = nla_data(tb[NFTA_MATCH_NAME]);
 	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
-	family = ctx->afi->family;
+	family = ctx->family;
 
 	/* Re-use the existing match if it's already loaded. */
 	list_for_each_entry(nft_match, &nft_match_list, head) {
@@ -733,7 +733,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 
 	tg_name = nla_data(tb[NFTA_TARGET_NAME]);
 	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
-	family = ctx->afi->family;
+	family = ctx->family;
 
 	/* Re-use the existing target if it's already loaded. */
 	list_for_each_entry(nft_target, &nft_target_list, head) {
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 2647b895f4b0..6ab274b14484 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -405,7 +405,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 		if (tb[NFTA_CT_DIRECTION] == NULL)
 			return -EINVAL;
 
-		switch (ctx->afi->family) {
+		switch (ctx->family) {
 		case NFPROTO_IPV4:
 			len = FIELD_SIZEOF(struct nf_conntrack_tuple,
 					   src.u3.ip);
@@ -456,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	err = nf_ct_netns_get(ctx->net, ctx->afi->family);
+	err = nf_ct_netns_get(ctx->net, ctx->family);
 	if (err < 0)
 		return err;
 
@@ -550,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		goto err1;
 
-	err = nf_ct_netns_get(ctx->net, ctx->afi->family);
+	err = nf_ct_netns_get(ctx->net, ctx->family);
 	if (err < 0)
 		goto err1;
 
@@ -564,7 +564,7 @@ err1:
 static void nft_ct_get_destroy(const struct nft_ctx *ctx,
 			       const struct nft_expr *expr)
 {
-	nf_ct_netns_put(ctx->net, ctx->afi->family);
+	nf_ct_netns_put(ctx->net, ctx->family);
 }
 
 static void nft_ct_set_destroy(const struct nft_ctx *ctx,
@@ -573,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
 	struct nft_ct *priv = nft_expr_priv(expr);
 
 	__nft_ct_set_destroy(ctx, priv);
-	nf_ct_netns_put(ctx->net, ctx->afi->family);
+	nf_ct_netns_put(ctx->net, ctx->family);
 }
 
 static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -734,7 +734,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
 	struct nft_ct_helper_obj *priv = nft_obj_data(obj);
 	struct nf_conntrack_helper *help4, *help6;
 	char name[NF_CT_HELPER_NAME_LEN];
-	int family = ctx->afi->family;
+	int family = ctx->family;
 
 	if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO])
 		return -EINVAL;
@@ -753,14 +753,14 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		if (ctx->afi->family == NFPROTO_IPV6)
+		if (ctx->family == NFPROTO_IPV6)
 			return -EINVAL;
 
 		help4 = nf_conntrack_helper_try_module_get(name, family,
 							   priv->l4proto);
 		break;
 	case NFPROTO_IPV6:
-		if (ctx->afi->family == NFPROTO_IPV4)
+		if (ctx->family == NFPROTO_IPV4)
 			return -EINVAL;
 
 		help6 = nf_conntrack_helper_try_module_get(name, family,
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index dd38785dfed9..4503b8dcf9c0 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -151,7 +151,7 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
 	priv->flowtable = flowtable;
 	flowtable->use++;
 
-	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+	return nf_ct_netns_get(ctx->net, ctx->family);
 }
 
 static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
@@ -160,7 +160,7 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
 	struct nft_flow_offload *priv = nft_expr_priv(expr);
 
 	priv->flowtable->use--;
-	nf_ct_netns_put(ctx->net, ctx->afi->family);
+	nf_ct_netns_put(ctx->net, ctx->family);
 }
 
 static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 6f6e64423643..a27be36dc0af 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -112,7 +112,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		break;
 	}
 
-	err = nf_logger_find_get(ctx->afi->family, li->type);
+	err = nf_logger_find_get(ctx->family, li->type);
 	if (err < 0)
 		goto err1;
 
@@ -133,7 +133,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
 	if (priv->prefix != nft_log_null_prefix)
 		kfree(priv->prefix);
 
-	nf_logger_put(ctx->afi->family, li->type);
+	nf_logger_put(ctx->family, li->type);
 }
 
 static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 6ac03d4266c9..9d8655bc1bea 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -73,7 +73,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		}
 	}
 
-	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+	return nf_ct_netns_get(ctx->net, ctx->family);
 }
 EXPORT_SYMBOL_GPL(nft_masq_init);
 
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 1a91e676f13e..8fb91940e2e7 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -339,7 +339,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
 	if (priv->key != NFT_META_SECPATH)
 		return 0;
 
-	switch (ctx->afi->family) {
+	switch (ctx->family) {
 	case NFPROTO_NETDEV:
 		hooks = 1 << NF_NETDEV_INGRESS;
 		break;
@@ -370,7 +370,7 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
 	if (priv->key != NFT_META_PKTTYPE)
 		return 0;
 
-	switch (ctx->afi->family) {
+	switch (ctx->family) {
 	case NFPROTO_BRIDGE:
 		hooks = 1 << NF_BR_PRE_ROUTING;
 		break;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ed548d06b6dd..1f36954c2ba9 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -142,7 +142,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;
 
 	family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
-	if (family != ctx->afi->family)
+	if (family != ctx->family)
 		return -EOPNOTSUPP;
 
 	switch (family) {
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 1e66538bf0ff..c64cbe78dee7 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -75,7 +75,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
 			return -EINVAL;
 	}
 
-	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+	return nf_ct_netns_get(ctx->net, ctx->family);
 }
 EXPORT_SYMBOL_GPL(nft_redir_init);
 
-- 
cgit v1.2.3


From dd4cbef7235154f163501ffbf396c0dadd830c9c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jan 2018 02:42:11 +0100
Subject: netfilter: nf_tables: get rid of pernet families

Now that we have a single table list for each netns, we can get rid of
the one pointer per family and the per-netns afinfo list (replaced by a
single global list), shrinking struct netns_nftables by 64 bytes.

Accordingly, call __nft_release_afinfo() from the __net_exit path to
release the network namespace's objects on removal.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       |  4 +--
 include/net/netns/nftables.h            |  7 ----
 net/bridge/netfilter/nf_tables_bridge.c | 38 +++-------------------
 net/ipv4/netfilter/nf_tables_arp.c      | 41 ++++++------------------
 net/ipv4/netfilter/nf_tables_ipv4.c     | 40 +++++------------------
 net/ipv6/netfilter/nf_tables_ipv6.c     | 40 +++++------------------
 net/netfilter/nf_tables_api.c           | 57 +++++++++++++++------------------
 net/netfilter/nf_tables_inet.c          | 41 +++++-------------------
 net/netfilter/nf_tables_netdev.c        | 46 ++++++--------------------
 9 files changed, 75 insertions(+), 239 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index c55e836e6a2f..12f83d223caa 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -979,8 +979,8 @@ struct nft_af_info {
 	struct module			*owner;
 };
 
-int nft_register_afinfo(struct net *, struct nft_af_info *);
-void nft_unregister_afinfo(struct net *, struct nft_af_info *);
+int nft_register_afinfo(struct nft_af_info *);
+void nft_unregister_afinfo(struct nft_af_info *);
 
 int nft_register_chain_type(const struct nf_chain_type *);
 void nft_unregister_chain_type(const struct nf_chain_type *);
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 7f86a63ac21f..48134353411d 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -7,15 +7,8 @@
 struct nft_af_info;
 
 struct netns_nftables {
-	struct list_head	af_info;
 	struct list_head	tables;
 	struct list_head	commit_list;
-	struct nft_af_info	*ipv4;
-	struct nft_af_info	*ipv6;
-	struct nft_af_info	*inet;
-	struct nft_af_info	*arp;
-	struct nft_af_info	*bridge;
-	struct nft_af_info	*netdev;
 	unsigned int		base_seq;
 	u8			gencursor;
 };
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 66c97b1e3303..dbf7195f059c 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -47,34 +47,6 @@ static struct nft_af_info nft_af_bridge __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int nf_tables_bridge_init_net(struct net *net)
-{
-	net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.bridge == NULL)
-		return -ENOMEM;
-
-	memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge));
-
-	if (nft_register_afinfo(net, net->nft.bridge) < 0)
-		goto err;
-
-	return 0;
-err:
-	kfree(net->nft.bridge);
-	return -ENOMEM;
-}
-
-static void nf_tables_bridge_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.bridge);
-	kfree(net->nft.bridge);
-}
-
-static struct pernet_operations nf_tables_bridge_net_ops = {
-	.init	= nf_tables_bridge_init_net,
-	.exit	= nf_tables_bridge_exit_net,
-};
-
 static const struct nf_chain_type filter_bridge = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -98,17 +70,17 @@ static int __init nf_tables_bridge_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&filter_bridge);
+	ret = nft_register_afinfo(&nft_af_bridge);
 	if (ret < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
+	ret = nft_register_chain_type(&filter_bridge);
 	if (ret < 0)
-		goto err_register_subsys;
+		goto err_register_chain;
 
 	return ret;
 
-err_register_subsys:
+err_register_chain:
 	nft_unregister_chain_type(&filter_bridge);
 
 	return ret;
@@ -116,8 +88,8 @@ err_register_subsys:
 
 static void __exit nf_tables_bridge_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_bridge_net_ops);
 	nft_unregister_chain_type(&filter_bridge);
+	nft_unregister_afinfo(&nft_af_bridge);
 }
 
 module_init(nf_tables_bridge_init);
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index f9089b2ad905..07667388ceb5 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -32,34 +32,6 @@ static struct nft_af_info nft_af_arp __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int nf_tables_arp_init_net(struct net *net)
-{
-	net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.arp== NULL)
-		return -ENOMEM;
-
-	memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
-
-	if (nft_register_afinfo(net, net->nft.arp) < 0)
-		goto err;
-
-	return 0;
-err:
-	kfree(net->nft.arp);
-	return -ENOMEM;
-}
-
-static void nf_tables_arp_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.arp);
-	kfree(net->nft.arp);
-}
-
-static struct pernet_operations nf_tables_arp_net_ops = {
-	.init   = nf_tables_arp_init_net,
-	.exit   = nf_tables_arp_exit_net,
-};
-
 static const struct nf_chain_type filter_arp = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -77,21 +49,26 @@ static int __init nf_tables_arp_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&filter_arp);
+	ret = nft_register_afinfo(&nft_af_arp);
 	if (ret < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+	ret = nft_register_chain_type(&filter_arp);
 	if (ret < 0)
-		nft_unregister_chain_type(&filter_arp);
+		goto err_register_chain;
+
+	return 0;
+
+err_register_chain:
+	nft_unregister_chain_type(&filter_arp);
 
 	return ret;
 }
 
 static void __exit nf_tables_arp_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_arp_net_ops);
 	nft_unregister_chain_type(&filter_arp);
+	nft_unregister_afinfo(&nft_af_arp);
 }
 
 module_init(nf_tables_arp_init);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index a98f2de63771..e1441738acb4 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -35,34 +35,6 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int nf_tables_ipv4_init_net(struct net *net)
-{
-	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.ipv4 == NULL)
-		return -ENOMEM;
-
-	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
-
-	if (nft_register_afinfo(net, net->nft.ipv4) < 0)
-		goto err;
-
-	return 0;
-err:
-	kfree(net->nft.ipv4);
-	return -ENOMEM;
-}
-
-static void nf_tables_ipv4_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.ipv4);
-	kfree(net->nft.ipv4);
-}
-
-static struct pernet_operations nf_tables_ipv4_net_ops = {
-	.init	= nf_tables_ipv4_init_net,
-	.exit	= nf_tables_ipv4_exit_net,
-};
-
 static const struct nf_chain_type filter_ipv4 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -86,21 +58,25 @@ static int __init nf_tables_ipv4_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&filter_ipv4);
+	ret = nft_register_afinfo(&nft_af_ipv4);
 	if (ret < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+	ret = nft_register_chain_type(&filter_ipv4);
 	if (ret < 0)
-		nft_unregister_chain_type(&filter_ipv4);
+		goto err_register_chain;
+
+	return 0;
 
+err_register_chain:
+	nft_unregister_afinfo(&nft_af_ipv4);
 	return ret;
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
 	nft_unregister_chain_type(&filter_ipv4);
+	nft_unregister_afinfo(&nft_af_ipv4);
 }
 
 module_init(nf_tables_ipv4_init);
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index bddd39dc1cf3..912d0e5516b0 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -33,34 +33,6 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int nf_tables_ipv6_init_net(struct net *net)
-{
-	net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.ipv6 == NULL)
-		return -ENOMEM;
-
-	memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6));
-
-	if (nft_register_afinfo(net, net->nft.ipv6) < 0)
-		goto err;
-
-	return 0;
-err:
-	kfree(net->nft.ipv6);
-	return -ENOMEM;
-}
-
-static void nf_tables_ipv6_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.ipv6);
-	kfree(net->nft.ipv6);
-}
-
-static struct pernet_operations nf_tables_ipv6_net_ops = {
-	.init	= nf_tables_ipv6_init_net,
-	.exit	= nf_tables_ipv6_exit_net,
-};
-
 static const struct nf_chain_type filter_ipv6 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -84,20 +56,24 @@ static int __init nf_tables_ipv6_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&filter_ipv6);
+	ret = nft_register_afinfo(&nft_af_ipv6);
 	if (ret < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
+	ret = nft_register_chain_type(&filter_ipv6);
 	if (ret < 0)
-		nft_unregister_chain_type(&filter_ipv6);
+		goto err_register_chain;
+
+	return 0;
 
+err_register_chain:
+	nft_unregister_afinfo(&nft_af_ipv6);
 	return ret;
 }
 
 static void __exit nf_tables_ipv6_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_ipv6_net_ops);
+	nft_unregister_afinfo(&nft_af_ipv6);
 	nft_unregister_chain_type(&filter_ipv6);
 }
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 084d1f553c46..b0ff26beec80 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,6 +26,7 @@
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
+static LIST_HEAD(nf_tables_af_info);
 
 /**
  *	nft_register_afinfo - register nf_tables address family info
@@ -35,17 +36,15 @@ static LIST_HEAD(nf_tables_flowtables);
  *	Register the address family for use with nf_tables. Returns zero on
  *	success or a negative errno code otherwise.
  */
-int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+int nft_register_afinfo(struct nft_af_info *afi)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_add_tail_rcu(&afi->list, &net->nft.af_info);
+	list_add_tail_rcu(&afi->list, &nf_tables_af_info);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nft_register_afinfo);
 
-static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi);
-
 /**
  *	nft_unregister_afinfo - unregister nf_tables address family info
  *
@@ -53,10 +52,9 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi);
  *
  *	Unregister the address family for use with nf_tables.
  */
-void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi)
+void nft_unregister_afinfo(struct nft_af_info *afi)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	__nft_release_afinfo(net, afi);
 	list_del_rcu(&afi->list);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
@@ -66,7 +64,7 @@ static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
 {
 	struct nft_af_info *afi;
 
-	list_for_each_entry(afi, &net->nft.af_info, list) {
+	list_for_each_entry(afi, &nf_tables_af_info, list) {
 		if (afi->family == family)
 			return afi;
 	}
@@ -5042,15 +5040,12 @@ void nft_flow_table_iterate(struct net *net,
 			    void *data)
 {
 	struct nft_flowtable *flowtable;
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
-		list_for_each_entry_rcu(table, &net->nft.tables, list) {
-			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
-				iter(&flowtable->data, data);
-			}
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+			iter(&flowtable->data, data);
 		}
 	}
 	rcu_read_unlock();
@@ -6533,21 +6528,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
 }
 EXPORT_SYMBOL_GPL(nft_data_dump);
 
-static int __net_init nf_tables_init_net(struct net *net)
-{
-	INIT_LIST_HEAD(&net->nft.af_info);
-	INIT_LIST_HEAD(&net->nft.tables);
-	INIT_LIST_HEAD(&net->nft.commit_list);
-	net->nft.base_seq = 1;
-	return 0;
-}
-
-static void __net_exit nf_tables_exit_net(struct net *net)
-{
-	WARN_ON_ONCE(!list_empty(&net->nft.af_info));
-	WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
-}
-
 int __nft_release_basechain(struct nft_ctx *ctx)
 {
 	struct nft_rule *rule, *nr;
@@ -6568,8 +6548,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(__nft_release_basechain);
 
-/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */
-static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
+static void __nft_release_afinfo(struct net *net)
 {
 	struct nft_flowtable *flowtable, *nf;
 	struct nft_table *table, *nt;
@@ -6579,10 +6558,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 	struct nft_set *set, *ns;
 	struct nft_ctx ctx = {
 		.net	= net,
-		.family	= afi->family,
 	};
 
 	list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
+		ctx.family = table->afi->family;
+
 		list_for_each_entry(chain, &table->chains, list)
 			nf_tables_unregister_hook(net, table, chain);
 		list_for_each_entry(flowtable, &table->flowtables, list)
@@ -6623,6 +6603,21 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 	}
 }
 
+static int __net_init nf_tables_init_net(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nft.tables);
+	INIT_LIST_HEAD(&net->nft.commit_list);
+	net->nft.base_seq = 1;
+	return 0;
+}
+
+static void __net_exit nf_tables_exit_net(struct net *net)
+{
+	__nft_release_afinfo(net);
+	WARN_ON_ONCE(!list_empty(&net->nft.tables));
+	WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
+}
+
 static struct pernet_operations nf_tables_net_ops = {
 	.init	= nf_tables_init_net,
 	.exit	= nf_tables_exit_net,
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 00b1fc9cea2e..d486ced4de84 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -43,34 +43,6 @@ static struct nft_af_info nft_af_inet __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int __net_init nf_tables_inet_init_net(struct net *net)
-{
-	net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.inet == NULL)
-		return -ENOMEM;
-	memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet));
-
-	if (nft_register_afinfo(net, net->nft.inet) < 0)
-		goto err;
-
-	return 0;
-
-err:
-	kfree(net->nft.inet);
-	return -ENOMEM;
-}
-
-static void __net_exit nf_tables_inet_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.inet);
-	kfree(net->nft.inet);
-}
-
-static struct pernet_operations nf_tables_inet_net_ops = {
-	.init	= nf_tables_inet_init_net,
-	.exit	= nf_tables_inet_exit_net,
-};
-
 static const struct nf_chain_type filter_inet = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -94,21 +66,24 @@ static int __init nf_tables_inet_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&filter_inet);
-	if (ret < 0)
+	if (nft_register_afinfo(&nft_af_inet) < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+	ret = nft_register_chain_type(&filter_inet);
 	if (ret < 0)
-		nft_unregister_chain_type(&filter_inet);
+		goto err_register_chain;
+
+	return ret;
 
+err_register_chain:
+	nft_unregister_afinfo(&nft_af_inet);
 	return ret;
 }
 
 static void __exit nf_tables_inet_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_inet_net_ops);
 	nft_unregister_chain_type(&filter_inet);
+	nft_unregister_afinfo(&nft_af_inet);
 }
 
 module_init(nf_tables_inet_init);
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 01b61a67a2ac..404b49acb125 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -43,34 +43,6 @@ static struct nft_af_info nft_af_netdev __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int nf_tables_netdev_init_net(struct net *net)
-{
-	net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
-	if (net->nft.netdev == NULL)
-		return -ENOMEM;
-
-	memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev));
-
-	if (nft_register_afinfo(net, net->nft.netdev) < 0)
-		goto err;
-
-	return 0;
-err:
-	kfree(net->nft.netdev);
-	return -ENOMEM;
-}
-
-static void nf_tables_netdev_exit_net(struct net *net)
-{
-	nft_unregister_afinfo(net, net->nft.netdev);
-	kfree(net->nft.netdev);
-}
-
-static struct pernet_operations nf_tables_netdev_net_ops = {
-	.init	= nf_tables_netdev_init_net,
-	.exit	= nf_tables_netdev_exit_net,
-};
-
 static const struct nf_chain_type nft_filter_chain_netdev = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -145,32 +117,32 @@ static int __init nf_tables_netdev_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&nft_filter_chain_netdev);
-	if (ret)
+	if (nft_register_afinfo(&nft_af_netdev) < 0)
 		return ret;
 
-	ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+	ret = nft_register_chain_type(&nft_filter_chain_netdev);
 	if (ret)
-		goto err1;
+		goto err_register_chain_type;
 
 	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
 	if (ret)
-		goto err2;
+		goto err_register_netdevice_notifier;
 
 	return 0;
 
-err2:
-	unregister_pernet_subsys(&nf_tables_netdev_net_ops);
-err1:
+err_register_netdevice_notifier:
 	nft_unregister_chain_type(&nft_filter_chain_netdev);
+err_register_chain_type:
+	nft_unregister_afinfo(&nft_af_netdev);
+
 	return ret;
 }
 
 static void __exit nf_tables_netdev_exit(void)
 {
 	unregister_netdevice_notifier(&nf_tables_netdev_notifier);
-	unregister_pernet_subsys(&nf_tables_netdev_net_ops);
 	nft_unregister_chain_type(&nft_filter_chain_netdev);
+	nft_unregister_afinfo(&nft_af_netdev);
 }
 
 module_init(nf_tables_netdev_init);
-- 
cgit v1.2.3


From 98319cb9089844d76e65a6cce5bfbd165e698735 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jan 2018 02:48:47 +0100
Subject: netfilter: nf_tables: get rid of struct nft_af_info abstraction

Remove the infrastructure to register/unregister the nft_af_info
structure; this structure no longer stores any useful information.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       |  23 +--
 net/bridge/netfilter/nf_tables_bridge.c |  25 +--
 net/ipv4/netfilter/nf_tables_arp.c      |  25 +--
 net/ipv4/netfilter/nf_tables_ipv4.c     |  24 +--
 net/ipv6/netfilter/nf_tables_ipv6.c     |  24 +--
 net/netfilter/nf_tables_api.c           | 305 ++++++++------------------------
 net/netfilter/nf_tables_inet.c          |  23 +--
 net/netfilter/nf_tables_netdev.c        |  19 +-
 8 files changed, 86 insertions(+), 382 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 12f83d223caa..4aca413367ee 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -960,28 +960,12 @@ struct nft_table {
 	struct list_head		flowtables;
 	u64				hgenerator;
 	u32				use;
-	u16				flags:14,
+	u16				family:6,
+					flags:8,
 					genmask:2;
-	struct nft_af_info		*afi;
 	char				*name;
 };
 
-/**
- *	struct nft_af_info - nf_tables address family info
- *
- *	@list: used internally
- *	@family: address family
- *	@owner: module owner
- */
-struct nft_af_info {
-	struct list_head		list;
-	int				family;
-	struct module			*owner;
-};
-
-int nft_register_afinfo(struct nft_af_info *);
-void nft_unregister_afinfo(struct nft_af_info *);
-
 int nft_register_chain_type(const struct nf_chain_type *);
 void nft_unregister_chain_type(const struct nf_chain_type *);
 
@@ -1146,9 +1130,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 
 void nft_trace_notify(struct nft_traceinfo *info);
 
-#define MODULE_ALIAS_NFT_FAMILY(family)	\
-	MODULE_ALIAS("nft-afinfo-" __stringify(family))
-
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
 	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
 
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index dbf7195f059c..5160cf614176 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -42,11 +42,6 @@ nft_do_chain_bridge(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_bridge __read_mostly = {
-	.family		= NFPROTO_BRIDGE,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type filter_bridge = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -68,28 +63,12 @@ static const struct nf_chain_type filter_bridge = {
 
 static int __init nf_tables_bridge_init(void)
 {
-	int ret;
-
-	ret = nft_register_afinfo(&nft_af_bridge);
-	if (ret < 0)
-		return ret;
-
-	ret = nft_register_chain_type(&filter_bridge);
-	if (ret < 0)
-		goto err_register_chain;
-
-	return ret;
-
-err_register_chain:
-	nft_unregister_chain_type(&filter_bridge);
-
-	return ret;
+	return nft_register_chain_type(&filter_bridge);
 }
 
 static void __exit nf_tables_bridge_exit(void)
 {
 	nft_unregister_chain_type(&filter_bridge);
-	nft_unregister_afinfo(&nft_af_bridge);
 }
 
 module_init(nf_tables_bridge_init);
@@ -97,4 +76,4 @@ module_exit(nf_tables_bridge_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE);
+MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter");
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 07667388ceb5..036c074736b0 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -27,11 +27,6 @@ nft_do_chain_arp(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_arp __read_mostly = {
-	.family		= NFPROTO_ARP,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type filter_arp = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -47,28 +42,12 @@ static const struct nf_chain_type filter_arp = {
 
 static int __init nf_tables_arp_init(void)
 {
-	int ret;
-
-	ret = nft_register_afinfo(&nft_af_arp);
-	if (ret < 0)
-		return ret;
-
-	ret = nft_register_chain_type(&filter_arp);
-	if (ret < 0)
-		goto err_register_chain;
-
-	return 0;
-
-err_register_chain:
-	nft_unregister_chain_type(&filter_arp);
-
-	return ret;
+	return nft_register_chain_type(&filter_arp);
 }
 
 static void __exit nf_tables_arp_exit(void)
 {
 	nft_unregister_chain_type(&filter_arp);
-	nft_unregister_afinfo(&nft_af_arp);
 }
 
 module_init(nf_tables_arp_init);
@@ -76,4 +55,4 @@ module_exit(nf_tables_arp_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
+MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index e1441738acb4..96f955496d5f 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -30,11 +30,6 @@ static unsigned int nft_do_chain_ipv4(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_ipv4 __read_mostly = {
-	.family		= NFPROTO_IPV4,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type filter_ipv4 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -56,27 +51,12 @@ static const struct nf_chain_type filter_ipv4 = {
 
 static int __init nf_tables_ipv4_init(void)
 {
-	int ret;
-
-	ret = nft_register_afinfo(&nft_af_ipv4);
-	if (ret < 0)
-		return ret;
-
-	ret = nft_register_chain_type(&filter_ipv4);
-	if (ret < 0)
-		goto err_register_chain;
-
-	return 0;
-
-err_register_chain:
-	nft_unregister_afinfo(&nft_af_ipv4);
-	return ret;
+	return nft_register_chain_type(&filter_ipv4);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
 	nft_unregister_chain_type(&filter_ipv4);
-	nft_unregister_afinfo(&nft_af_ipv4);
 }
 
 module_init(nf_tables_ipv4_init);
@@ -84,4 +64,4 @@ module_exit(nf_tables_ipv4_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(AF_INET);
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 912d0e5516b0..17e03589331c 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -28,11 +28,6 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_ipv6 __read_mostly = {
-	.family		= NFPROTO_IPV6,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type filter_ipv6 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -54,26 +49,11 @@ static const struct nf_chain_type filter_ipv6 = {
 
 static int __init nf_tables_ipv6_init(void)
 {
-	int ret;
-
-	ret = nft_register_afinfo(&nft_af_ipv6);
-	if (ret < 0)
-		return ret;
-
-	ret = nft_register_chain_type(&filter_ipv6);
-	if (ret < 0)
-		goto err_register_chain;
-
-	return 0;
-
-err_register_chain:
-	nft_unregister_afinfo(&nft_af_ipv6);
-	return ret;
+	return nft_register_chain_type(&filter_ipv6);
 }
 
 static void __exit nf_tables_ipv6_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_ipv6);
 	nft_unregister_chain_type(&filter_ipv6);
 }
 
@@ -82,4 +62,4 @@ module_exit(nf_tables_ipv6_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(AF_INET6);
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b0ff26beec80..0b814cbcd45e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,71 +26,6 @@
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_af_info);
-
-/**
- *	nft_register_afinfo - register nf_tables address family info
- *
- *	@afi: address family info to register
- *
- *	Register the address family for use with nf_tables. Returns zero on
- *	success or a negative errno code otherwise.
- */
-int nft_register_afinfo(struct nft_af_info *afi)
-{
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_add_tail_rcu(&afi->list, &nf_tables_af_info);
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_afinfo);
-
-/**
- *	nft_unregister_afinfo - unregister nf_tables address family info
- *
- *	@afi: address family info to unregister
- *
- *	Unregister the address family for use with nf_tables.
- */
-void nft_unregister_afinfo(struct nft_af_info *afi)
-{
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_del_rcu(&afi->list);
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
-
-static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
-{
-	struct nft_af_info *afi;
-
-	list_for_each_entry(afi, &nf_tables_af_info, list) {
-		if (afi->family == family)
-			return afi;
-	}
-	return NULL;
-}
-
-static struct nft_af_info *
-nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
-{
-	struct nft_af_info *afi;
-
-	afi = nft_afinfo_lookup(net, family);
-	if (afi != NULL)
-		return afi;
-#ifdef CONFIG_MODULES
-	if (autoload) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-afinfo-%u", family);
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		afi = nft_afinfo_lookup(net, family);
-		if (afi != NULL)
-			return ERR_PTR(-EAGAIN);
-	}
-#endif
-	return ERR_PTR(-EAFNOSUPPORT);
-}
 
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
@@ -390,7 +325,7 @@ static struct nft_table *nft_table_lookup(const struct net *net,
 
 	list_for_each_entry(table, &net->nft.tables, list) {
 		if (!nla_strcmp(nla, table->name) &&
-		    table->afi->family == family &&
+		    table->family == family &&
 		    nft_active_genmask(table, genmask))
 			return table;
 	}
@@ -531,7 +466,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 	cb->seq = net->nft.base_seq;
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
 		if (idx < s_idx)
@@ -545,7 +480,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      NFT_MSG_NEWTABLE, NLM_F_MULTI,
-					      table->afi->family, table) < 0)
+					      table->family, table) < 0)
 			goto done;
 
 		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -565,7 +500,6 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct sk_buff *skb2;
 	int family = nfmsg->nfgen_family;
@@ -578,11 +512,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -702,19 +632,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
 	const struct nlattr *name;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	int family = nfmsg->nfgen_family;
 	u32 flags = 0;
 	struct nft_ctx ctx;
 	int err;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(net, name, afi->family, genmask);
+	table = nf_tables_table_lookup(net, name, family, genmask);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -724,7 +649,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+		nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 		return nf_tables_updtable(&ctx);
 	}
 
@@ -734,40 +659,34 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 			return -EINVAL;
 	}
 
-	err = -EAFNOSUPPORT;
-	if (!try_module_get(afi->owner))
-		goto err1;
-
 	err = -ENOMEM;
 	table = kzalloc(sizeof(*table), GFP_KERNEL);
 	if (table == NULL)
-		goto err2;
+		goto err_kzalloc;
 
 	table->name = nla_strdup(name, GFP_KERNEL);
 	if (table->name == NULL)
-		goto err3;
+		goto err_strdup;
 
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
 	INIT_LIST_HEAD(&table->flowtables);
-	table->afi = afi;
+	table->family = family;
 	table->flags = flags;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
 	if (err < 0)
-		goto err4;
+		goto err_trans;
 
 	list_add_tail_rcu(&table->list, &net->nft.tables);
 	return 0;
-err4:
+err_trans:
 	kfree(table->name);
-err3:
+err_strdup:
 	kfree(table);
-err2:
-	module_put(afi->owner);
-err1:
+err_kzalloc:
 	return err;
 }
 
@@ -838,10 +757,10 @@ static int nft_flush(struct nft_ctx *ctx, int family)
 	int err = 0;
 
 	list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) {
-		if (family != AF_UNSPEC && table->afi->family != family)
+		if (family != AF_UNSPEC && table->family != family)
 			continue;
 
-		ctx->family = table->afi->family;
+		ctx->family = table->family;
 
 		if (!nft_is_active_next(ctx->net, table))
 			continue;
@@ -867,7 +786,6 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	int family = nfmsg->nfgen_family;
 	struct nft_ctx ctx;
@@ -876,11 +794,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
 		return nft_flush(&ctx, family);
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -889,7 +803,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	    table->use > 0)
 		return -EBUSY;
 
-	ctx.family = afi->family;
+	ctx.family = family;
 	ctx.table = table;
 
 	return nft_flush_table(&ctx);
@@ -901,7 +815,6 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 
 	kfree(ctx->table->name);
 	kfree(ctx->table);
-	module_put(ctx->table->afi->owner);
 }
 
 int nft_register_chain_type(const struct nf_chain_type *ctype)
@@ -1130,7 +1043,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 	cb->seq = net->nft.base_seq;
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
 		list_for_each_entry_rcu(chain, &table->chains, list) {
@@ -1146,7 +1059,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      NFT_MSG_NEWCHAIN,
 						      NLM_F_MULTI,
-						      table->afi->family, table,
+						      table->family, table,
 						      chain) < 0)
 				goto done;
 
@@ -1168,7 +1081,6 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	struct sk_buff *skb2;
@@ -1182,11 +1094,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -1555,7 +1463,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	const struct nlattr * uninitialized_var(name);
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	u8 policy = NF_ACCEPT;
@@ -1565,11 +1472,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -1610,7 +1513,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		}
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	if (chain != NULL) {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
@@ -1631,7 +1534,6 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule;
@@ -1640,11 +1542,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	u32 use;
 	int err;
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -1657,7 +1555,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	    chain->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	use = chain->use;
 	list_for_each_entry(rule, &chain->rules, list) {
@@ -2080,7 +1978,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	cb->seq = net->nft.base_seq;
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
 		if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
@@ -2103,7 +2001,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 							      cb->nlh->nlmsg_seq,
 							      NFT_MSG_NEWRULE,
 							      NLM_F_MULTI | NLM_F_APPEND,
-							      table->afi->family,
+							      table->family,
 							      table, chain, rule) < 0)
 					goto done;
 
@@ -2139,7 +2037,6 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
@@ -2183,11 +2080,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -2245,7 +2138,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_af_info *afi;
+	int family = nfmsg->nfgen_family;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
@@ -2261,11 +2154,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -2305,7 +2194,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			return PTR_ERR(old_rule);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	n = 0;
 	size = 0;
@@ -2429,18 +2318,13 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain = NULL;
 	struct nft_rule *rule;
 	int family = nfmsg->nfgen_family, err = 0;
 	struct nft_ctx ctx;
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -2452,7 +2336,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			return PTR_ERR(chain);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	if (chain) {
 		if (nla[NFTA_RULE_HANDLE]) {
@@ -2632,26 +2516,17 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 				     u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	struct nft_af_info *afi = NULL;
+	int family = nfmsg->nfgen_family;
 	struct nft_table *table = NULL;
 
-	if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
-		afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
-		if (IS_ERR(afi))
-			return PTR_ERR(afi);
-	}
-
 	if (nla[NFTA_SET_TABLE] != NULL) {
-		if (afi == NULL)
-			return -EAFNOSUPPORT;
-
 		table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE],
-					       afi->family, genmask);
+					       family, genmask);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
 
-	nft_ctx_init(ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
 	return 0;
 }
 
@@ -2882,7 +2757,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
 		if (ctx->family != NFPROTO_UNSPEC &&
-		    ctx->family != table->afi->family)
+		    ctx->family != table->family)
 			continue;
 
 		if (ctx->table && ctx->table != table)
@@ -2903,7 +2778,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
 
 			ctx_set = *ctx;
 			ctx_set.table = table;
-			ctx_set.family = table->afi->family;
+			ctx_set.family = table->family;
 
 			if (nf_tables_fill_set(skb, &ctx_set, set,
 					       NFT_MSG_NEWSET,
@@ -3015,8 +2890,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
 	const struct nft_set_ops *ops;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -3123,16 +2998,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
@@ -3390,19 +3261,15 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 				      u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	struct nft_af_info *afi;
+	int family = nfmsg->nfgen_family;
 	struct nft_table *table;
 
-	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
 	table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE],
-				       afi->family, genmask);
+				       family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
 	return 0;
 }
 
@@ -3520,7 +3387,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	rcu_read_lock();
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
 		if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
-		    dump_ctx->ctx.family != table->afi->family)
+		    dump_ctx->ctx.family != table->family)
 			continue;
 
 		if (table != dump_ctx->ctx.table)
@@ -3550,7 +3417,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 		goto nla_put_failure;
 
 	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = table->afi->family;
+	nfmsg->nfgen_family = table->family;
 	nfmsg->version      = NFNETLINK_V0;
 	nfmsg->res_id	    = htons(net->nft.base_seq & 0xffff);
 
@@ -4501,7 +4368,6 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 	const struct nft_object_type *type;
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_object *obj;
 	struct nft_ctx ctx;
@@ -4513,11 +4379,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_OBJ_DATA])
 		return -EINVAL;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -4536,7 +4398,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		return 0;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	type = nft_obj_type_get(objtype);
 	if (IS_ERR(type))
@@ -4628,7 +4490,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 	cb->seq = net->nft.base_seq;
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
 		list_for_each_entry_rcu(obj, &table->objects, list) {
@@ -4651,7 +4513,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 						    cb->nlh->nlmsg_seq,
 						    NFT_MSG_NEWOBJ,
 						    NLM_F_MULTI | NLM_F_APPEND,
-						    table->afi->family, table,
+						    table->family, table,
 						    obj, reset) < 0)
 				goto done;
 
@@ -4709,7 +4571,6 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
 	int family = nfmsg->nfgen_family;
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct nft_object *obj;
 	struct sk_buff *skb2;
@@ -4740,11 +4601,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_OBJ_TYPE])
 		return -EINVAL;
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -4791,7 +4648,6 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_object *obj;
 	struct nft_ctx ctx;
@@ -4801,11 +4657,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_OBJ_NAME])
 		return -EINVAL;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], afi->family,
+	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
 				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -4817,7 +4669,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	if (obj->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	return nft_delobj(&ctx, obj);
 }
@@ -5002,33 +4854,31 @@ err1:
 	return err;
 }
 
-static const struct nf_flowtable_type *
-__nft_flowtable_type_get(const struct nft_af_info *afi)
+static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
 {
 	const struct nf_flowtable_type *type;
 
 	list_for_each_entry(type, &nf_tables_flowtables, list) {
-		if (afi->family == type->family)
+		if (family == type->family)
 			return type;
 	}
 	return NULL;
 }
 
-static const struct nf_flowtable_type *
-nft_flowtable_type_get(const struct nft_af_info *afi)
+static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
 {
 	const struct nf_flowtable_type *type;
 
-	type = __nft_flowtable_type_get(afi);
+	type = __nft_flowtable_type_get(family);
 	if (type != NULL && try_module_get(type->owner))
 		return type;
 
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nf-flowtable-%u", afi->family);
+		request_module("nf-flowtable-%u", family);
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (__nft_flowtable_type_get(afi))
+		if (__nft_flowtable_type_get(family))
 			return ERR_PTR(-EAGAIN);
 	}
 #endif
@@ -5076,7 +4926,6 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_ctx ctx;
 	int err, i, k;
@@ -5086,12 +4935,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_FLOWTABLE_HOOK])
 		return -EINVAL;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
 	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       afi->family, genmask);
+				       family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -5108,7 +4953,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		return 0;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
 	if (!flowtable)
@@ -5121,7 +4966,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 
-	type = nft_flowtable_type_get(afi);
+	type = nft_flowtable_type_get(family);
 	if (IS_ERR(type)) {
 		err = PTR_ERR(type);
 		goto err2;
@@ -5181,16 +5026,11 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
-	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_ctx ctx;
 
-	afi = nf_tables_afinfo_lookup(net, family, true);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
 	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       afi->family, genmask);
+				       family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -5201,7 +5041,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	if (flowtable->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, afi->family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	return nft_delflowtable(&ctx, flowtable);
 }
@@ -5276,7 +5116,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 	cb->seq = net->nft.base_seq;
 
 	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		if (family != NFPROTO_UNSPEC && family != table->afi->family)
+		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
 		list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
@@ -5295,7 +5135,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 							  cb->nlh->nlmsg_seq,
 							  NFT_MSG_NEWFLOWTABLE,
 							  NLM_F_MULTI | NLM_F_APPEND,
-							  table->afi->family, flowtable) < 0)
+							  table->family, flowtable) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -5353,7 +5193,6 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 	u8 genmask = nft_genmask_cur(net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
-	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct sk_buff *skb2;
 	int err;
@@ -5379,12 +5218,8 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 	if (!nla[NFTA_FLOWTABLE_NAME])
 		return -EINVAL;
 
-	afi = nf_tables_afinfo_lookup(net, family, false);
-	if (IS_ERR(afi))
-		return PTR_ERR(afi);
-
 	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       afi->family, genmask);
+				       family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -6548,7 +6383,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(__nft_release_basechain);
 
-static void __nft_release_afinfo(struct net *net)
+static void __nft_release_tables(struct net *net)
 {
 	struct nft_flowtable *flowtable, *nf;
 	struct nft_table *table, *nt;
@@ -6561,7 +6396,7 @@ static void __nft_release_afinfo(struct net *net)
 	};
 
 	list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
-		ctx.family = table->afi->family;
+		ctx.family = table->family;
 
 		list_for_each_entry(chain, &table->chains, list)
 			nf_tables_unregister_hook(net, table, chain);
@@ -6613,7 +6448,7 @@ static int __net_init nf_tables_init_net(struct net *net)
 
 static void __net_exit nf_tables_exit_net(struct net *net)
 {
-	__nft_release_afinfo(net);
+	__nft_release_tables(net);
 	WARN_ON_ONCE(!list_empty(&net->nft.tables));
 	WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
 }
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index d486ced4de84..e30c7da09d0d 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -38,11 +38,6 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_inet __read_mostly = {
-	.family		= NFPROTO_INET,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type filter_inet = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -64,26 +59,12 @@ static const struct nf_chain_type filter_inet = {
 
 static int __init nf_tables_inet_init(void)
 {
-	int ret;
-
-	if (nft_register_afinfo(&nft_af_inet) < 0)
-		return ret;
-
-	ret = nft_register_chain_type(&filter_inet);
-	if (ret < 0)
-		goto err_register_chain;
-
-	return ret;
-
-err_register_chain:
-	nft_unregister_afinfo(&nft_af_inet);
-	return ret;
+	return nft_register_chain_type(&filter_inet);
 }
 
 static void __exit nf_tables_inet_exit(void)
 {
 	nft_unregister_chain_type(&filter_inet);
-	nft_unregister_afinfo(&nft_af_inet);
 }
 
 module_init(nf_tables_inet_init);
@@ -91,4 +72,4 @@ module_exit(nf_tables_inet_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(1);
+MODULE_ALIAS_NFT_CHAIN(1, "filter");
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 404b49acb125..4041fafca934 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -38,11 +38,6 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 	return nft_do_chain(&pkt, priv);
 }
 
-static struct nft_af_info nft_af_netdev __read_mostly = {
-	.family		= NFPROTO_NETDEV,
-	.owner		= THIS_MODULE,
-};
-
 static const struct nf_chain_type nft_filter_chain_netdev = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
@@ -91,10 +86,10 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	list_for_each_entry(table, &ctx.net->nft.tables, list) {
-		if (table->afi->family != NFPROTO_NETDEV)
+		if (table->family != NFPROTO_NETDEV)
 			continue;
 
-		ctx.family = table->afi->family;
+		ctx.family = table->family;
 		ctx.table = table;
 		list_for_each_entry_safe(chain, nr, &table->chains, list) {
 			if (!nft_is_base_chain(chain))
@@ -117,12 +112,9 @@ static int __init nf_tables_netdev_init(void)
 {
 	int ret;
 
-	if (nft_register_afinfo(&nft_af_netdev) < 0)
-		return ret;
-
 	ret = nft_register_chain_type(&nft_filter_chain_netdev);
 	if (ret)
-		goto err_register_chain_type;
+		return ret;
 
 	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
 	if (ret)
@@ -132,8 +124,6 @@ static int __init nf_tables_netdev_init(void)
 
 err_register_netdevice_notifier:
 	nft_unregister_chain_type(&nft_filter_chain_netdev);
-err_register_chain_type:
-	nft_unregister_afinfo(&nft_af_netdev);
 
 	return ret;
 }
@@ -142,7 +132,6 @@ static void __exit nf_tables_netdev_exit(void)
 {
 	unregister_netdevice_notifier(&nf_tables_netdev_notifier);
 	nft_unregister_chain_type(&nft_filter_chain_netdev);
-	nft_unregister_afinfo(&nft_af_netdev);
 }
 
 module_init(nf_tables_netdev_init);
@@ -150,4 +139,4 @@ module_exit(nf_tables_netdev_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */
+MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */
-- 
cgit v1.2.3


From 20651cefd25ffa77a15cab5853b175a6dc975ec2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Jan 2018 14:30:48 +0100
Subject: netfilter: x_tables: unbreak module auto loading

A typo (CONFIG_MODULE instead of CONFIG_MODULES) causes module auto-load
support to never be compiled in.

Fixes: 03d13b6868a2 ("netfilter: xtables: add and use xt_request_find_table_lock")
Reported-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 10c19a3f4cbd..5b8f3b7358e6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1082,7 +1082,7 @@ struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
 {
 	struct xt_table *t = xt_find_table_lock(net, af, name);
 
-#ifdef CONFIG_MODULE
+#ifdef CONFIG_MODULES
 	if (IS_ERR(t)) {
 		int err = request_module("%stable_%s", xt_prefix[af], name);
 		if (err)
-- 
cgit v1.2.3


From 03a0120f75dfb1807c0441376e26b36160087de4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 07:04:54 +0000
Subject: netfilter: nf_tables: fix a typo in nf_tables_getflowtable()

Fix a typo: we should check 'flowtable' instead of 'table'.

Fixes: 3b49e2e94e6e ("netfilter: nf_tables: add flow table netlink frontend")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0b814cbcd45e..b541e5094dce 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5225,7 +5225,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 
 	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
 					       genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(flowtable))
 		return PTR_ERR(flowtable);
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-- 
cgit v1.2.3


From 0ded1785f3c810182408f9498a726e89267b0275 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 07:05:06 +0000
Subject: netfilter: core: make local function __nf_unregister_net_hook static

Fixes the following sparse warning:

net/netfilter/core.c:380:6: warning:
 symbol '__nf_unregister_net_hook' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 997dd387d259..3f8e2d06b9cc 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -377,8 +377,8 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
 	}
 }
 
-void __nf_unregister_net_hook(struct net *net, int pf,
-			      const struct nf_hook_ops *reg)
+static void __nf_unregister_net_hook(struct net *net, int pf,
+				     const struct nf_hook_ops *reg)
 {
 	struct nf_hook_entries __rcu **pp;
 	struct nf_hook_entries *p;
-- 
cgit v1.2.3


From 99eadf67c8fe0d9ebe5f4a2b1551d8238b4a43bf Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 13:06:46 +0000
Subject: netfilter: remove duplicated include

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 1 -
 net/netfilter/nf_queue.c                | 2 --
 2 files changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 0c3b9d32f64f..fff21602875a 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -5,7 +5,6 @@
 #include <linux/rhashtable.h>
 #include <linux/ipv6.h>
 #include <linux/netdevice.h>
-#include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
 #include <net/neighbour.h>
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 7f55af5f3d1a..d67a96a25a68 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -15,8 +15,6 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
 #include <net/protocol.h>
 #include <net/netfilter/nf_queue.h>
 #include <net/dst.h>
-- 
cgit v1.2.3


From cbef426ce725d46beb5ba84b9e9eb624c25462dd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 10 Jan 2018 15:24:15 +0100
Subject: netfilter: core: return EBUSY in case NAT hook is already in use

EEXIST is used for an object that already exists, with the same
name/handle. However, there is no such object here; instead there is an
object that is using the single slot that is available for NAT hooks
since patch f92b40a8b264 ("netfilter: core: only allow one nat hook per
hook point"). Let's change this return value before this behaviour gets
exposed in the first -rc.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 3f8e2d06b9cc..0f6b8172fb9a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -140,7 +140,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
 
 		if (reg->nat_hook && orig_ops[i]->nat_hook) {
 			kvfree(new);
-			return ERR_PTR(-EEXIST);
+			return ERR_PTR(-EBUSY);
 		}
 
 		if (inserted || reg->priority > orig_ops[i]->priority) {
-- 
cgit v1.2.3


From 202a8ff545ccdaa5ac2000d9201df3453c8816be Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Sun, 7 Jan 2018 19:22:02 +0100
Subject: netfilter: add IPv6 segment routing header 'srh' match

It allows matching packets based on Segment Routing Header
(SRH) information.
The implementation considers revision 7 of the SRH draft.
https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-07

Currently supported match options include:
(1) Next Header
(2) Hdr Ext Len
(3) Segments Left
(4) Last Entry
(5) Tag value of SRH

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h |  57 ++++++++++
 net/ipv6/netfilter/Kconfig                   |   9 ++
 net/ipv6/netfilter/Makefile                  |   1 +
 net/ipv6/netfilter/ip6t_srh.c                | 161 +++++++++++++++++++++++++++
 4 files changed, 228 insertions(+)
 create mode 100644 include/uapi/linux/netfilter_ipv6/ip6t_srh.h
 create mode 100644 net/ipv6/netfilter/ip6t_srh.c

(limited to 'net')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
new file mode 100644
index 000000000000..f3cc0ef514a7
--- /dev/null
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _IP6T_SRH_H
+#define _IP6T_SRH_H
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+
+/* Values for "mt_flags" field in struct ip6t_srh */
+#define IP6T_SRH_NEXTHDR        0x0001
+#define IP6T_SRH_LEN_EQ         0x0002
+#define IP6T_SRH_LEN_GT         0x0004
+#define IP6T_SRH_LEN_LT         0x0008
+#define IP6T_SRH_SEGS_EQ        0x0010
+#define IP6T_SRH_SEGS_GT        0x0020
+#define IP6T_SRH_SEGS_LT        0x0040
+#define IP6T_SRH_LAST_EQ        0x0080
+#define IP6T_SRH_LAST_GT        0x0100
+#define IP6T_SRH_LAST_LT        0x0200
+#define IP6T_SRH_TAG            0x0400
+#define IP6T_SRH_MASK           0x07FF
+
+/* Values for "mt_invflags" field in struct ip6t_srh */
+#define IP6T_SRH_INV_NEXTHDR    0x0001
+#define IP6T_SRH_INV_LEN_EQ     0x0002
+#define IP6T_SRH_INV_LEN_GT     0x0004
+#define IP6T_SRH_INV_LEN_LT     0x0008
+#define IP6T_SRH_INV_SEGS_EQ    0x0010
+#define IP6T_SRH_INV_SEGS_GT    0x0020
+#define IP6T_SRH_INV_SEGS_LT    0x0040
+#define IP6T_SRH_INV_LAST_EQ    0x0080
+#define IP6T_SRH_INV_LAST_GT    0x0100
+#define IP6T_SRH_INV_LAST_LT    0x0200
+#define IP6T_SRH_INV_TAG        0x0400
+#define IP6T_SRH_INV_MASK       0x07FF
+
+/**
+ *      struct ip6t_srh - SRH match options
+ *      @next_hdr: Next header field of SRH
+ *      @hdr_len: Extension header length field of SRH
+ *      @segs_left: Segments left field of SRH
+ *      @last_entry: Last entry field of SRH
+ *      @tag: Tag field of SRH
+ *      @mt_flags: match options
+ *      @mt_invflags: Invert the sense of match options
+ */
+
+struct ip6t_srh {
+	__u8                    next_hdr;
+	__u8                    hdr_len;
+	__u8                    segs_left;
+	__u8                    last_entry;
+	__u16                   tag;
+	__u16                   mt_flags;
+	__u16                   mt_invflags;
+};
+
+#endif /*_IP6T_SRH_H*/
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 806e95375ec8..b6f5edf926d2 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -240,6 +240,15 @@ config IP6_NF_MATCH_RT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_SRH
+        tristate '"srh" Segment Routing header match support'
+        depends on NETFILTER_ADVANCED
+        help
+          srh matching allows you to match packets based on the segment
+	  routing header of the packet.
+
+          To compile it as a module, choose M here.  If unsure, say N.
+
 # The targets
 config IP6_NF_TARGET_HL
 	tristate '"HL" hoplimit target support'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 95611c4b39b0..d984057b8395 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o
 obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
 obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o
 obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
+obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o
 
 # targets
 obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
new file mode 100644
index 000000000000..9642164107ce
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -0,0 +1,161 @@
+/* Kernel module to match Segment Routing Header (SRH) parameters. */
+
+/* Author:
+ * Ahmed Abdelsalam <amsalam20@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version 2
+ *	of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/ipv6.h>
+#include <net/seg6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_srh.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+/* Test a struct->mt_invflags and a boolean for inequality */
+#define NF_SRH_INVF(ptr, flag, boolean)	\
+	((boolean) ^ !!((ptr)->mt_invflags & (flag)))
+
+static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_srh *srhinfo = par->matchinfo;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6_sr_hdr _srh;
+	int hdrlen, srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return false;
+	srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+	if (!srh)
+		return false;
+
+	hdrlen = ipv6_optlen(srh);
+	if (skb->len - srhoff < hdrlen)
+		return false;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4)
+		return false;
+
+	if (srh->segments_left > srh->first_segment)
+		return false;
+
+	/* Next Header matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+				!(srh->nexthdr == srhinfo->next_hdr)))
+			return false;
+
+	/* Header Extension Length matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+				!(srh->hdrlen == srhinfo->hdr_len)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+				!(srh->hdrlen > srhinfo->hdr_len)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+				!(srh->hdrlen < srhinfo->hdr_len)))
+			return false;
+
+	/* Segments Left matching */
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+				!(srh->segments_left == srhinfo->segs_left)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+				!(srh->segments_left > srhinfo->segs_left)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+				!(srh->segments_left < srhinfo->segs_left)))
+			return false;
+
+	/**
+	 * Last Entry matching
+	 * Last_Entry field was introduced in revision 6 of the SRH draft.
+	 * It was called First_Segment in the previous revision
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+				!(srh->first_segment == srhinfo->last_entry)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+				!(srh->first_segment > srhinfo->last_entry)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+				!(srh->first_segment < srhinfo->last_entry)))
+			return false;
+
+	/**
+	 * Tag matching
+	 * Tag field was introduced in revision 6 of the SRH draft.
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_TAG)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+				!(srh->tag == srhinfo->tag)))
+			return false;
+	return true;
+}
+
+static int srh_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_srh *srhinfo = par->matchinfo;
+
+	if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+		pr_err("unknown srh match flags  %X\n", srhinfo->mt_flags);
+		return -EINVAL;
+	}
+
+	if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+		pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match srh_mt6_reg __read_mostly = {
+	.name		= "srh",
+	.family		= NFPROTO_IPV6,
+	.match		= srh_mt6,
+	.matchsize	= sizeof(struct ip6t_srh),
+	.checkentry	= srh_mt6_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init srh_mt6_init(void)
+{
+	return xt_register_match(&srh_mt6_reg);
+}
+
+static void __exit srh_mt6_exit(void)
+{
+	xt_unregister_match(&srh_mt6_reg);
+}
+
+module_init(srh_mt6_init);
+module_exit(srh_mt6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match");
+MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>");
-- 
cgit v1.2.3


From a0a97f2a1a2351923ab0ab266598d2a92c65eb18 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Jan 2018 18:10:59 +0100
Subject: netfilter: improve flow table Kconfig dependencies

The newly added NF_FLOW_TABLE options cause some build failures in
randconfig kernels:

- when CONFIG_NF_CONNTRACK is disabled, or is a loadable module but
  NF_FLOW_TABLE is built-in:

  In file included from net/netfilter/nf_flow_table.c:8:0:
  include/net/netfilter/nf_conntrack.h:59:22: error: field 'ct_general' has incomplete type
    struct nf_conntrack ct_general;
  include/net/netfilter/nf_conntrack.h: In function 'nf_ct_get':
  include/net/netfilter/nf_conntrack.h:148:15: error: 'const struct sk_buff' has no member named '_nfct'
  include/net/netfilter/nf_conntrack.h: In function 'nf_ct_put':
  include/net/netfilter/nf_conntrack.h:157:2: error: implicit declaration of function 'nf_conntrack_put'; did you mean 'nf_ct_put'? [-Werror=implicit-function-declaration]

  net/netfilter/nf_flow_table.o: In function `nf_flow_offload_work_gc':
  (.text+0x1540): undefined reference to `nf_ct_delete'

- when CONFIG_NF_TABLES is disabled:

  In file included from net/ipv6/netfilter/nf_flow_table_ipv6.c:13:0:
  include/net/netfilter/nf_tables.h: In function 'nft_gencursor_next':
  include/net/netfilter/nf_tables.h:1189:14: error: 'const struct net' has no member named 'nft'; did you mean 'nf'?

- when CONFIG_NF_FLOW_TABLE_INET is enabled, but NF_FLOW_TABLE_IPV4
  or NF_FLOW_TABLE_IPV6 are not, or are loadable modules:

  net/netfilter/nf_flow_table_inet.o: In function `nf_flow_offload_inet_hook':
  nf_flow_table_inet.c:(.text+0x94): undefined reference to `nf_flow_offload_ipv6_hook'
  nf_flow_table_inet.c:(.text+0x40): undefined reference to `nf_flow_offload_ip_hook'

- when CONFIG_NF_FLOW_TABLES is disabled, but the other options are
  enabled:

  net/netfilter/nf_flow_table_inet.o: In function `nf_flow_offload_inet_hook':
  nf_flow_table_inet.c:(.text+0x6c): undefined reference to `nf_flow_offload_ipv6_hook'
  net/netfilter/nf_flow_table_inet.o: In function `nf_flow_inet_module_exit':
  nf_flow_table_inet.c:(.exit.text+0x8): undefined reference to `nft_unregister_flowtable_type'
  net/netfilter/nf_flow_table_inet.o: In function `nf_flow_inet_module_init':
  nf_flow_table_inet.c:(.init.text+0x8): undefined reference to `nft_register_flowtable_type'
  net/ipv4/netfilter/nf_flow_table_ipv4.o: In function `nf_flow_ipv4_module_exit':
  nf_flow_table_ipv4.c:(.exit.text+0x8): undefined reference to `nft_unregister_flowtable_type'
  net/ipv4/netfilter/nf_flow_table_ipv4.o: In function `nf_flow_ipv4_module_init':
  nf_flow_table_ipv4.c:(.init.text+0x8): undefined reference to `nft_register_flowtable_type'

This adds additional Kconfig dependencies to ensure that NF_CONNTRACK and NF_TABLES
are always visible from NF_FLOW_TABLE, and that the internal dependencies between
the four new modules are met.

Fixes: 7c23b629a808 ("netfilter: flow table support for the mixed IPv4/IPv6 family")
Fixes: 0995210753a2 ("netfilter: flow table support for IPv6")
Fixes: 97add9f0d66d ("netfilter: flow table support for IPv4")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig | 3 ++-
 net/ipv6/netfilter/Kconfig | 3 ++-
 net/netfilter/Kconfig      | 4 +++-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 7d5d444964aa..3ad46a90b0fc 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -79,8 +79,9 @@ config NF_TABLES_ARP
 endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV4
-	select NF_FLOW_TABLE
 	tristate "Netfilter flow table IPv4 module"
+	depends on NF_CONNTRACK && NF_TABLES
+	select NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv4 support.
 
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index b6f5edf926d2..4a634b7a2c80 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -72,8 +72,9 @@ endif # NF_TABLES_IPV6
 endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV6
-	select NF_FLOW_TABLE
 	tristate "Netfilter flow table IPv6 module"
+	depends on NF_CONNTRACK && NF_TABLES
+	select NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv6 support.
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 0ee0fcf3abbf..ea447826e127 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -665,8 +665,9 @@ endif # NF_TABLES_NETDEV
 endif # NF_TABLES
 
 config NF_FLOW_TABLE_INET
-	select NF_FLOW_TABLE
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
+	depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6
+	select NF_FLOW_TABLE
 	help
           This option adds the flow table mixed IPv4/IPv6 support.
 
@@ -674,6 +675,7 @@ config NF_FLOW_TABLE_INET
 
 config NF_FLOW_TABLE
 	tristate "Netfilter flow table module"
+	depends on NF_CONNTRACK && NF_TABLES
 	help
 	  This option adds the flow table core infrastructure.
 
-- 
cgit v1.2.3


From d7dedee184e775f77d321cfa1c660a7680cf6588 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 9 Jan 2018 16:40:25 +0200
Subject: ipv6: Calculate hash thresholds for IPv6 nexthops

Before we convert IPv6 to use hash-threshold instead of modulo-N, we
first need each nexthop to store its region boundary in the hash
function's output space.

The boundary is calculated by dividing the output space equally between
the different active nexthops. That is, nexthops that are not dead or
linkdown.

The boundaries are rebalanced whenever a nexthop is added to or removed
from a multipath route and whenever a nexthop becomes active or inactive.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |  1 +
 include/net/ip6_route.h |  7 ++++
 net/ipv6/ip6_fib.c      |  8 ++---
 net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ddf53dd1e948..97cd05d87780 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -149,6 +149,7 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
+	atomic_t			rt6i_nh_upper_bound;
 
 	atomic_t			rt6i_ref;
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 34cd3b0c6ded..27d23a65f3cd 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
+{
+	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	       RTF_GATEWAY;
+}
+
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
+void rt6_multipath_rebalance(struct rt6_info *rt);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b5f19703fca6..e31118f417b4 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -796,12 +796,6 @@ insert_above:
 	return ln;
 }
 
-static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
-{
-	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
-}
-
 static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
 {
 	int i;
@@ -991,6 +985,7 @@ next_iter:
 			rt6i_nsiblings++;
 		}
 		BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+		rt6_multipath_rebalance(temp_sibling);
 	}
 
 	/*
@@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 			sibling->rt6i_nsiblings--;
 		rt->rt6i_nsiblings = 0;
 		list_del_init(&rt->rt6i_siblings);
+		rt6_multipath_rebalance(next_sibling);
 	}
 
 	/* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1054b059747f..ced2c9bed10b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3481,6 +3481,99 @@ struct arg_netdev_event {
 	};
 };
 
+static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
+{
+	struct rt6_info *iter;
+	struct fib6_node *fn;
+
+	fn = rcu_dereference_protected(rt->rt6i_node,
+			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	iter = rcu_dereference_protected(fn->leaf,
+			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	while (iter) {
+		if (iter->rt6i_metric == rt->rt6i_metric &&
+		    rt6_qualify_for_ecmp(iter))
+			return iter;
+		iter = rcu_dereference_protected(iter->rt6_next,
+				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	}
+
+	return NULL;
+}
+
+static bool rt6_is_dead(const struct rt6_info *rt)
+{
+	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
+	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+		return true;
+
+	return false;
+}
+
+static int rt6_multipath_total_weight(const struct rt6_info *rt)
+{
+	struct rt6_info *iter;
+	int total = 0;
+
+	if (!rt6_is_dead(rt))
+		total++;
+
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+		if (!rt6_is_dead(iter))
+			total++;
+	}
+
+	return total;
+}
+
+static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
+{
+	int upper_bound = -1;
+
+	if (!rt6_is_dead(rt)) {
+		(*weight)++;
+		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
+						    total) - 1;
+	}
+	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
+}
+
+static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
+{
+	struct rt6_info *iter;
+	int weight = 0;
+
+	rt6_upper_bound_set(rt, &weight, total);
+
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		rt6_upper_bound_set(iter, &weight, total);
+}
+
+void rt6_multipath_rebalance(struct rt6_info *rt)
+{
+	struct rt6_info *first;
+	int total;
+
+	/* In case the entire multipath route was marked for flushing,
+	 * then there is no need to rebalance upon the removal of every
+	 * sibling route.
+	 */
+	if (!rt->rt6i_nsiblings || rt->should_flush)
+		return;
+
+	/* During lookup routes are evaluated in order, so we need to
+	 * make sure upper bounds are assigned from the first sibling
+	 * onwards.
+	 */
+	first = rt6_multipath_first_sibling(rt);
+	if (WARN_ON_ONCE(!first))
+		return;
+
+	total = rt6_multipath_total_weight(first);
+	rt6_multipath_upper_bound_set(first, total);
+}
+
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
@@ -3489,6 +3582,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
 		rt->rt6i_nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+		rt6_multipath_rebalance(rt);
 	}
 
 	return 0;
@@ -3588,6 +3682,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
 						   RTNH_F_LINKDOWN);
 			fib6_update_sernum(rt);
+			rt6_multipath_rebalance(rt);
 		}
 		return -2;
 	case NETDEV_CHANGE:
@@ -3595,6 +3690,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+		rt6_multipath_rebalance(rt);
 		break;
 	}
 
-- 
cgit v1.2.3


From 7696c06a189c0f1f4d0a7e49e28d10e1050ec529 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 9 Jan 2018 16:40:26 +0200
Subject: ipv6: Use a 31-bit multipath hash

The hash thresholds assigned to IPv6 nexthops are in the range of
[-1, 2^31 - 1], where a negative value is assigned to nexthops that
should not be considered during multipath selection.

Therefore, in a similar fashion to IPv4, we need to use the upper
31-bits of the multipath hash for multipath selection.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ced2c9bed10b..09e8e10b101d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1833,10 +1833,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
 
 	if (skb) {
 		ip6_multipath_l3_keys(skb, &hash_keys);
-		return flow_hash_from_keys(&hash_keys);
+		return flow_hash_from_keys(&hash_keys) >> 1;
 	}
 
-	return get_hash_from_flowi6(fl6);
+	return get_hash_from_flowi6(fl6) >> 1;
 }
 
 void ip6_route_input(struct sk_buff *skb)
-- 
cgit v1.2.3


From 3d709f69a3e749f4d1c195dab499df8ab66e25a8 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 9 Jan 2018 16:40:27 +0200
Subject: ipv6: Use hash-threshold instead of modulo-N

Now that each nexthop stores its region boundary in the multipath hash
function's output space, we can use hash-threshold instead of modulo-N
in multipath selection.

This reduces the number of checks we need to perform during lookup, as
dead and linkdown nexthops are assigned a negative region boundary. In
addition, in contrast to modulo-N, only flows near region boundaries are
affected when a nexthop is added or removed.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09e8e10b101d..7837b8c754a3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     int strict)
 {
 	struct rt6_info *sibling, *next_sibling;
-	int route_choosen;
 
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
@@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
 
-	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
-	/* Don't change the route, if route_choosen == 0
-	 * (siblings does not include ourself)
-	 */
-	if (route_choosen)
-		list_for_each_entry_safe(sibling, next_sibling,
-				&match->rt6i_siblings, rt6i_siblings) {
-			route_choosen--;
-			if (route_choosen == 0) {
-				struct inet6_dev *idev = sibling->rt6i_idev;
-
-				if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
-					break;
-				if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
-				    idev->cnf.ignore_routes_with_linkdown)
-					break;
-				if (rt6_score_route(sibling, oif, strict) < 0)
-					break;
-				match = sibling;
-				break;
-			}
-		}
+	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
+		return match;
+
+	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
+				 rt6i_siblings) {
+		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
+			continue;
+		if (rt6_score_route(sibling, oif, strict) < 0)
+			break;
+		match = sibling;
+		break;
+	}
+
 	return match;
 }
 
-- 
cgit v1.2.3


From 398958ae48f44bb036d0fa9829cd489270bf1fc2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 9 Jan 2018 16:40:28 +0200
Subject: ipv6: Add support for non-equal-cost multipath

The use of hash-threshold instead of modulo-N makes it trivial to add
support for non-equal-cost multipath.

Instead of dividing the multipath hash function's output space equally
between the nexthops, each nexthop is assigned a region size which is
proportional to its weight.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/route.c      | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 97cd05d87780..34ec321d6a03 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -171,6 +171,7 @@ struct rt6_info {
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
+	int				rt6i_nh_weight;
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
 	u8				exception_bucket_flushed:1,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7837b8c754a3..1076ae0ea9d5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2594,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 #endif
 
 	rt->rt6i_metric = cfg->fc_metric;
+	rt->rt6i_nh_weight = 1;
 
 	/* We cannot add true routes via loopback here,
 	   they would result in kernel looping; promote them to reject routes
@@ -3507,11 +3508,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total++;
+		total += rt->rt6i_nh_weight;
 
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
 		if (!rt6_is_dead(iter))
-			total++;
+			total += iter->rt6i_nh_weight;
 	}
 
 	return total;
@@ -3522,7 +3523,7 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		(*weight)++;
+		*weight += rt->rt6i_nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
@@ -4024,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
+		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
+
 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
 		if (err) {
 			dst_release_immediate(&rt->dst);
@@ -4246,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = 0;
+	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
 
 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
-- 
cgit v1.2.3


From 809a79e913eed4ca02bfe5f78d3b7a56c7d8d7a6 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 07:43:15 +0000
Subject: tcp: make local function tcp_recv_timestamp static

Fixes the following sparse warning:

net/ipv4/tcp.c:1736:6: warning:
 symbol 'tcp_recv_timestamp' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f68cb33d50d1..d7cf861bf699 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1733,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-			struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			       struct scm_timestamping *tss)
 {
 	struct timeval tv;
 	bool has_timestamping = false;
-- 
cgit v1.2.3


From f8253df553537518e9f87fa2eb599d68a2df092b Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Wed, 10 Jan 2018 14:59:59 +0100
Subject: net: sch: red: Change offloaded xstats to be incremental

Change the value of the xstats requested from the driver for offloaded RED
to be incremental, like the normal stats.
It increases consistency - if a qdisc stops being offloaded its xstats
don't change.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_qdisc.c   | 15 +++++++++++---
 net/sched/sch_red.c                                | 24 ++++++++--------------
 2 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
index 3e2841872f64..55e4e4d0dad3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
@@ -212,6 +212,7 @@ mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle,
 {
 	struct red_stats *xstats_base = &mlxsw_sp_qdisc->xstats_base;
 	struct mlxsw_sp_port_xstats *xstats;
+	int early_drops, marks, pdrops;
 
 	if (mlxsw_sp_qdisc->handle != handle ||
 	    mlxsw_sp_qdisc->type != MLXSW_SP_QDISC_RED)
@@ -219,9 +220,17 @@ mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle,
 
 	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
 
-	res->prob_drop = xstats->wred_drop[tclass_num] - xstats_base->prob_drop;
-	res->prob_mark = xstats->ecn - xstats_base->prob_mark;
-	res->pdrop = xstats->tail_drop[tclass_num] - xstats_base->pdrop;
+	early_drops = xstats->wred_drop[tclass_num] - xstats_base->prob_drop;
+	marks = xstats->ecn - xstats_base->prob_mark;
+	pdrops = xstats->tail_drop[tclass_num] - xstats_base->pdrop;
+
+	res->pdrop += pdrops;
+	res->prob_drop += early_drops;
+	res->prob_mark += marks;
+
+	xstats_base->pdrop += pdrops;
+	xstats_base->prob_drop += early_drops;
+	xstats_base->prob_mark += marks;
 	return 0;
 }
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a392eaa4a0b4..0af1c1254e0b 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -344,32 +344,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
-	struct tc_red_xstats st = {
-		.early	= q->stats.prob_drop + q->stats.forced_drop,
-		.pdrop	= q->stats.pdrop,
-		.other	= q->stats.other,
-		.marked	= q->stats.prob_mark + q->stats.forced_mark,
-	};
+	struct tc_red_xstats st = {0};
 
 	if (sch->flags & TCQ_F_OFFLOADED) {
-		struct red_stats hw_stats = {0};
 		struct tc_red_qopt_offload hw_stats_request = {
 			.command = TC_RED_XSTATS,
 			.handle = sch->handle,
 			.parent = sch->parent,
 			{
-				.xstats = &hw_stats,
+				.xstats = &q->stats,
 			},
 		};
-		if (!dev->netdev_ops->ndo_setup_tc(dev,
-						   TC_SETUP_QDISC_RED,
-						   &hw_stats_request)) {
-			st.early += hw_stats.prob_drop + hw_stats.forced_drop;
-			st.pdrop += hw_stats.pdrop;
-			st.other += hw_stats.other;
-			st.marked += hw_stats.prob_mark + hw_stats.forced_mark;
-		}
+		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					      &hw_stats_request);
 	}
+	st.early = q->stats.prob_drop + q->stats.forced_drop;
+	st.pdrop = q->stats.pdrop;
+	st.other = q->stats.other;
+	st.marked = q->stats.prob_mark + q->stats.forced_mark;
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
-- 
cgit v1.2.3


From 5ed001baeeffccaa63d13a433e8dc99ae6c017ad Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Jan 2018 09:21:29 +0100
Subject: netfilter: clusterip: make sure arp hooks are available

The clusterip target needs to register an arp mangling hook,
so make sure NF_ARP hooks are available.

Fixes: 2a95183a5e ("netfilter: don't allocate space for arp/bridge hooks unless needed")
Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 3ad46a90b0fc..f28b08819f89 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -343,6 +343,7 @@ config IP_NF_TARGET_CLUSTERIP
 	depends on NF_CONNTRACK_IPV4
 	depends on NETFILTER_ADVANCED
 	select NF_CONNTRACK_MARK
+	select NETFILTER_FAMILY_ARP
 	help
 	  The CLUSTERIP target allows you to build load-balancing clusters of
 	  network servers without having a dedicated load-balancing
-- 
cgit v1.2.3


From 902d6a4c2a4f411582689e53fb101895ffe99028 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 10 Jan 2018 20:51:57 -0700
Subject: netfilter: nf_defrag: Skip defrag if NOTRACK is set

conntrack defrag is needed only if some module like CONNTRACK or NAT
explicitly requests it. For plain forwarding scenarios, defrag is
not needed and can be skipped if NOTRACK is set in a rule.

Since conntrack defrag is currently higher priority than raw table,
setting NOTRACK is not sufficient. We need to move raw to a higher
priority for iptables only.

This is achieved by introducing a module parameter "raw_before_defrag"
which allows changing the priority of the raw table to place it before
defrag. By default, the parameter is disabled and the priority of raw
table is NF_IP_PRI_RAW to support legacy behavior. If the module
parameter is enabled, then the priority of the raw table is set to
NF_IP_PRI_RAW_BEFORE_DEFRAG.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv4.h       |  1 +
 include/uapi/linux/netfilter_ipv6.h       |  1 +
 net/ipv4/netfilter/iptable_raw.c          | 13 ++++++++++++-
 net/ipv4/netfilter/nf_defrag_ipv4.c       |  2 +-
 net/ipv6/netfilter/ip6table_raw.c         | 13 ++++++++++++-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c |  3 +++
 6 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h
index e6b1a84f5dd3..c3b060775e13 100644
--- a/include/uapi/linux/netfilter_ipv4.h
+++ b/include/uapi/linux/netfilter_ipv4.h
@@ -57,6 +57,7 @@
 
 enum nf_ip_hook_priorities {
 	NF_IP_PRI_FIRST = INT_MIN,
+	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP_PRI_RAW = -300,
 	NF_IP_PRI_SELINUX_FIRST = -225,
diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h
index 2f9724611cc2..dc624fd24d25 100644
--- a/include/uapi/linux/netfilter_ipv6.h
+++ b/include/uapi/linux/netfilter_ipv6.h
@@ -62,6 +62,7 @@
 
 enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_FIRST = INT_MIN,
+	NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP6_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP6_PRI_RAW = -300,
 	NF_IP6_PRI_SELINUX_FIRST = -225,
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index a869d1fea7d9..29b64d3024e0 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/slab.h>
@@ -12,7 +13,11 @@
 
 static int __net_init iptable_raw_table_init(struct net *net);
 
-static const struct xt_table packet_raw = {
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -64,6 +69,12 @@ static int __init iptable_raw_init(void)
 {
 	int ret;
 
+	if (raw_before_defrag) {
+		packet_raw.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG;
+
+		pr_info("Enabling raw table before defrag\n");
+	}
+
 	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
 	if (IS_ERR(rawtable_ops))
 		return PTR_ERR(rawtable_ops);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 37fe1616ca0b..cbd987f6b1f8 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -80,7 +80,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 #endif
 #endif
 	/* Gather fragments. */
-	if (ip_is_fragment(ip_hdr(skb))) {
+	if (skb->_nfct != IP_CT_UNTRACKED && ip_is_fragment(ip_hdr(skb))) {
 		enum ip_defrag_users user =
 			nf_ct_defrag_user(state->hook, skb);
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index d4bc56443dc1..3df7383f96d0 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/slab.h>
@@ -11,7 +12,11 @@
 
 static int __net_init ip6table_raw_table_init(struct net *net);
 
-static const struct xt_table packet_raw = {
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -63,6 +68,12 @@ static int __init ip6table_raw_init(void)
 {
 	int ret;
 
+	if (raw_before_defrag) {
+		packet_raw.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG;
+
+		pr_info("Enabling raw table before defrag\n");
+	}
+
 	/* Register hooks */
 	rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
 	if (IS_ERR(rawtable_ops))
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index b326da59257f..87b503a8f5ef 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -65,6 +65,9 @@ static unsigned int ipv6_defrag(void *priv,
 		return NF_ACCEPT;
 #endif
 
+	if (skb->_nfct == IP_CT_UNTRACKED)
+		return NF_ACCEPT;
+
 	err = nf_ct_frag6_gather(state->net, skb,
 				 nf_ct6_defrag_user(state->hook, skb));
 	/* queued */
-- 
cgit v1.2.3


From d584527c70399cf0d095396d696029f54a10cfd3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 22 Nov 2017 10:57:41 -0800
Subject: net: Cap number of queues even with accel_priv

With the recent fix to ixgbe we can always cap the number of queues,
regardless of whether accel_priv is being used, since the actual number of
queues is being reported via real_num_tx_queues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 3d24d9a59086..94435cd09072 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3420,8 +3420,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 		else
 			queue_index = __netdev_pick_tx(dev, skb);
 
-		if (!accel_priv)
-			queue_index = netdev_cap_txqueue(dev, queue_index);
+		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
 
 	skb_set_queue_mapping(skb, queue_index);
-- 
cgit v1.2.3


From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 11 Jan 2018 17:39:09 +0100
Subject: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info

As pointed out by Daniel Borkmann, using bpf_target_off() is not
necessary for xdp_rxq_info when extracting queue_index and
ifindex, as these members are u32 like BPF_W.

Also fix trivial spelling mistake introduced in same commit.

Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 2 +-
 net/core/filter.c        | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 405317f9c064..395d261948de 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -899,7 +899,7 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
-	/* Below access go though struct xdp_rxq_info */
+	/* Below access go through struct xdp_rxq_info */
 	__u32 ingress_ifindex; /* rxq->dev->ifindex */
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index d4b190e63b79..db2ee8c7e1bd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->dst_reg,
 				      offsetof(struct xdp_rxq_info, dev));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct net_device,
-						     ifindex, 4, target_size));
+				      offsetof(struct net_device, ifindex));
 		break;
 	case offsetof(struct xdp_md, rx_queue_index):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, rxq));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct xdp_rxq_info,
-						queue_index, 4, target_size));
+				      offsetof(struct xdp_rxq_info,
+					       queue_index));
 		break;
 	}
 
-- 
cgit v1.2.3


From 7fdb61b44c0c95d00f6c856d9fb61a9f647bc85f Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Sun, 14 Jan 2018 12:33:15 +0100
Subject: net: sch: prio: Add offload ability to PRIO qdisc

Add the ability to offload PRIO qdisc by using ndo_setup_tc.
There are three commands for PRIO offloading:
* TC_PRIO_REPLACE: handles set and tune
* TC_PRIO_DESTROY: handles qdisc destroy
* TC_PRIO_STATS: updates the qdiscs counters (given as reference)

Like RED qdisc, the indication of whether PRIO is being offloaded is being
set and updated as part of the dump function. It is so because the driver
could decide to offload or not based on the qdisc parent, which could
change without notifying the qdisc.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 include/net/pkt_cls.h     | 25 ++++++++++++++++++++
 net/sched/sch_prio.c      | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef7b348e8498..6d95477b962c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -780,6 +780,7 @@ enum tc_setup_type {
 	TC_SETUP_BLOCK,
 	TC_SETUP_QDISC_CBS,
 	TC_SETUP_QDISC_RED,
+	TC_SETUP_QDISC_PRIO,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0d1343cba84c..9c341f003091 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -761,4 +761,29 @@ struct tc_red_qopt_offload {
 	};
 };
 
+enum tc_prio_command {
+	TC_PRIO_REPLACE,
+	TC_PRIO_DESTROY,
+	TC_PRIO_STATS,
+};
+
+struct tc_prio_qopt_offload_params {
+	int bands;
+	u8 priomap[TC_PRIO_MAX + 1];
+	/* In case that a prio qdisc is offloaded and now is changed to a
+	 * non-offloadedable config, it needs to update the backlog & qlen
+	 * values to negate the HW backlog & qlen values (and only them).
+	 */
+	struct gnet_stats_queue *qstats;
+};
+
+struct tc_prio_qopt_offload {
+	enum tc_prio_command command;
+	u32 handle;
+	u32 parent;
+	union {
+		struct tc_prio_qopt_offload_params replace_params;
+		struct tc_qopt_offload_stats stats;
+	};
+};
 #endif
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fe1510eb111f..a398502899a9 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -142,6 +142,31 @@ prio_reset(struct Qdisc *sch)
 	sch->q.qlen = 0;
 }
 
+static int prio_offload(struct Qdisc *sch, bool enable)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_prio_qopt_offload opt = {
+		.handle = sch->handle,
+		.parent = sch->parent,
+	};
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	if (enable) {
+		opt.command = TC_PRIO_REPLACE;
+		opt.replace_params.bands = q->bands;
+		memcpy(&opt.replace_params.priomap, q->prio2band,
+		       TC_PRIO_MAX + 1);
+		opt.replace_params.qstats = &sch->qstats;
+	} else {
+		opt.command = TC_PRIO_DESTROY;
+	}
+
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt);
+}
+
 static void
 prio_destroy(struct Qdisc *sch)
 {
@@ -149,6 +174,7 @@ prio_destroy(struct Qdisc *sch)
 	struct prio_sched_data *q = qdisc_priv(sch);
 
 	tcf_block_put(q->block);
+	prio_offload(sch, false);
 	for (prio = 0; prio < q->bands; prio++)
 		qdisc_destroy(q->queues[prio]);
 }
@@ -204,6 +230,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	sch_tree_unlock(sch);
+	prio_offload(sch, true);
 	return 0;
 }
 
@@ -223,15 +250,47 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
 	return prio_tune(sch, opt, extack);
 }
 
+static int prio_dump_offload(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_prio_qopt_offload hw_stats = {
+		.handle = sch->handle,
+		.parent = sch->parent,
+		.command = TC_PRIO_STATS,
+		.stats.bstats = &sch->bstats,
+		.stats.qstats = &sch->qstats,
+	};
+	int err;
+
+	sch->flags &= ~TCQ_F_OFFLOADED;
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
+					    &hw_stats);
+	if (err == -EOPNOTSUPP)
+		return 0;
+
+	if (!err)
+		sch->flags |= TCQ_F_OFFLOADED;
+
+	return err;
+}
+
 static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_prio_qopt opt;
+	int err;
 
 	opt.bands = q->bands;
 	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
 
+	err = prio_dump_offload(sch);
+	if (err)
+		goto nla_put_failure;
+
 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From febafc8455fdbb0ba53d596075068a683b75f355 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 10 Jan 2018 21:08:50 +0100
Subject: tipc: fix a potential access after delete in tipc_sk_join()

In commit d12d2e12cec2 ("tipc: send out join messages as soon as new
member is discovered") we added a call to the function tipc_group_join()
without considering the case that the preceding tipc_sk_publish() might
have failed, and the group item already been deleted.

We fix this by returning from tipc_sk_join() directly after the
failed tipc_sk_publish.

Reported-by: syzbot+e3eeae78ea88b8d6d858@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1f236271766c..f38264db45ae 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2774,6 +2774,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 	if (rc) {
 		tipc_group_delete(net, grp);
 		tsk->group = NULL;
+		return rc;
 	}
 	/* Eliminate any risk that a broadcast overtakes sent JOINs */
 	tsk->mc_method.rcast = true;
-- 
cgit v1.2.3


From 594831a8aba3fd045c3212a3e3bb9788c77b989d Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 11 Jan 2018 14:22:07 -0200
Subject: sctp: removed unused var from sctp_make_auth

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index b9b269cf615e..793b05ec692b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1276,7 +1276,6 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 	struct sctp_authhdr auth_hdr;
 	struct sctp_hmac *hmac_desc;
 	struct sctp_chunk *retval;
-	__u8 *hmac;
 
 	/* Get the first hmac that the peer told us to use */
 	hmac_desc = sctp_auth_asoc_get_hmac(asoc);
@@ -1295,7 +1294,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 	retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr),
 						 &auth_hdr);
 
-	hmac = skb_put_zero(retval->skb, hmac_desc->hmac_len);
+	skb_put_zero(retval->skb, hmac_desc->hmac_len);
 
 	/* Adjust the chunk header to include the empty MAC */
 	retval->chunk_hdr->length =
-- 
cgit v1.2.3


From 273c28bc57ca9672f7b70bed764ecdfb964930c8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 12 Jan 2018 18:28:31 +0300
Subject: net: Convert atomic_t net::count to refcount_t

Since net could be obtained from RCU lists,
and there is a race with net destruction,
the patch converts net::count to refcount_t.

This provides sanity checks for the case of
incrementing the counter of an already dead net,
where maybe_get_net() has to be used instead
of get_net().

Drivers: allyesconfig and allmodconfig are OK.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h   | 8 ++++----
 net/core/net-sysfs.c          | 6 +++---
 net/core/net_namespace.c      | 8 ++++----
 net/ipv4/inet_timewait_sock.c | 4 ++--
 net/ipv4/tcp_metrics.c        | 2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 10f99dafd5ac..f8a84a2c2341 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -51,7 +51,7 @@ struct net {
 	refcount_t		passive;	/* To decided when the network
 						 * namespace should be freed.
 						 */
-	atomic_t		count;		/* To decided when the network
+	refcount_t		count;		/* To decided when the network
 						 *  namespace should be shut down.
 						 */
 	spinlock_t		rules_mod_lock;
@@ -195,7 +195,7 @@ void __put_net(struct net *net);
 
 static inline struct net *get_net(struct net *net)
 {
-	atomic_inc(&net->count);
+	refcount_inc(&net->count);
 	return net;
 }
 
@@ -206,14 +206,14 @@ static inline struct net *maybe_get_net(struct net *net)
 	 * exists.  If the reference count is zero this
 	 * function fails and returns NULL.
 	 */
-	if (!atomic_inc_not_zero(&net->count))
+	if (!refcount_inc_not_zero(&net->count))
 		net = NULL;
 	return net;
 }
 
 static inline void put_net(struct net *net)
 {
-	if (atomic_dec_and_test(&net->count))
+	if (refcount_dec_and_test(&net->count))
 		__put_net(net);
 }
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 799b75268291..7bf8b85ade16 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 	while (--i >= new_num) {
 		struct kobject *kobj = &dev->_rx[i].kobj;
 
-		if (!atomic_read(&dev_net(dev)->count))
+		if (!refcount_read(&dev_net(dev)->count))
 			kobj->uevent_suppress = 1;
 		if (dev->sysfs_rx_queue_group)
 			sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 	while (--i >= new_num) {
 		struct netdev_queue *queue = dev->_tx + i;
 
-		if (!atomic_read(&dev_net(dev)->count))
+		if (!refcount_read(&dev_net(dev)->count))
 			queue->kobj.uevent_suppress = 1;
 #ifdef CONFIG_BQL
 		sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
 {
 	struct device *dev = &ndev->dev;
 
-	if (!atomic_read(&dev_net(ndev)->count))
+	if (!refcount_read(&dev_net(ndev)->count))
 		dev_set_uevent_suppress(dev, 1);
 
 	kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 60a71be75aea..2213d45fcafd 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net = {
-	.count		= ATOMIC_INIT(1),
+	.count		= REFCOUNT_INIT(1),
 	.dev_base_head	= LIST_HEAD_INIT(init_net.dev_base_head),
 };
 EXPORT_SYMBOL(init_net);
@@ -224,10 +224,10 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 	bool alloc;
 	int id;
 
-	if (atomic_read(&net->count) == 0)
+	if (refcount_read(&net->count) == 0)
 		return NETNSA_NSID_NOT_ASSIGNED;
 	spin_lock_bh(&net->nsid_lock);
-	alloc = atomic_read(&peer->count) == 0 ? false : true;
+	alloc = refcount_read(&peer->count) == 0 ? false : true;
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
@@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	int error = 0;
 	LIST_HEAD(net_exit_list);
 
-	atomic_set(&net->count, 1);
+	refcount_set(&net->count, 1);
 	refcount_set(&net->passive, 1);
 	net->dev_base_seq = 1;
 	net->user_ns = user_ns;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 277ff69a312d..c3ea4906d237 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -270,14 +270,14 @@ restart:
 				continue;
 			tw = inet_twsk(sk);
 			if ((tw->tw_family != family) ||
-				atomic_read(&twsk_net(tw)->count))
+				refcount_read(&twsk_net(tw)->count))
 				continue;
 
 			if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
 				continue;
 
 			if (unlikely((tw->tw_family != family) ||
-				     atomic_read(&twsk_net(tw)->count))) {
+				     refcount_read(&twsk_net(tw)->count))) {
 				inet_twsk_put(tw);
 				goto restart;
 			}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 759e6bc8327b..03b51cdcc731 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -892,7 +892,7 @@ static void tcp_metrics_flush_all(struct net *net)
 		pp = &hb->chain;
 		for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
 			match = net ? net_eq(tm_net(tm), net) :
-				!atomic_read(&tm_net(tm)->count);
+				!refcount_read(&tm_net(tm)->count);
 			if (match) {
 				*pp = tm->tcpm_next;
 				kfree_rcu(tm, rcu_head);
-- 
cgit v1.2.3


From e9a034456a8cd766795610aa5065263147e35228 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 12 Jan 2018 20:56:50 +0100
Subject: tipc: fix bug during lookup of multicast destination nodes

In commit 232d07b74a33 ("tipc: improve groupcast scope handling") we
inadvertently broke non-group multicast transmission when changing the
parameter 'domain' to 'scope' in the function
tipc_nametbl_lookup_dst_nodes(). We failed to make the corresponding
change in the calling function, with the result that the lookup always
fails.

A closer analysis reveals that this parameter is not needed at all.
Non-group multicast is hard coded to use CLUSTER_SCOPE, and in the
current implementation this will be delivered to all matching
destinations except those which are published with NODE_SCOPE on other
nodes. Since such publications never will be visible on the sending node
anyway, it makes no sense to discriminate by scope at all.

We now remove this parameter altogether.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 6 ++----
 net/tipc/name_table.h | 3 +--
 net/tipc/socket.c     | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 64cdd3c302b0..ed0457cc99d6 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -680,8 +680,7 @@ exit:
  * - Determines if any node local ports overlap
  */
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
-				   u32 upper, u32 scope,
-				   struct tipc_nlist *nodes)
+				   u32 upper, struct tipc_nlist *nodes)
 {
 	struct sub_seq *sseq, *stop;
 	struct publication *publ;
@@ -699,8 +698,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 	for (; sseq != stop && sseq->lower <= upper; sseq++) {
 		info = sseq->info;
 		list_for_each_entry(publ, &info->zone_list, zone_list) {
-			if (publ->scope == scope)
-				tipc_nlist_add(nodes, publ->node);
+			tipc_nlist_add(nodes, publ->node);
 		}
 	}
 	spin_unlock_bh(&seq->lock);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index b595d8aa00f0..f56e7cb3d436 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -105,8 +105,7 @@ int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 			      u32 type, u32 domain);
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
-				   u32 upper, u32 domain,
-				   struct tipc_nlist *nodes);
+				   u32 upper, struct tipc_nlist *nodes);
 bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 			 struct list_head *dsts, int *dstcnt, u32 exclude,
 			 bool all);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f38264db45ae..d799e50ff722 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -772,7 +772,6 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	struct net *net = sock_net(sk);
 	int mtu = tipc_bcast_get_mtu(net);
 	struct tipc_mc_method *method = &tsk->mc_method;
-	u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
 	struct sk_buff_head pkts;
 	struct tipc_nlist dsts;
 	int rc;
@@ -788,7 +787,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	/* Lookup destination nodes */
 	tipc_nlist_init(&dsts, tipc_own_addr(net));
 	tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
-				      seq->upper, domain, &dsts);
+				      seq->upper, &dsts);
 	if (!dsts.local && !dsts.remote)
 		return -EHOSTUNREACH;
 
-- 
cgit v1.2.3


From 6802f3adcb3f201b81a3f396d82aa0ddb5923cd5 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 12 Jan 2018 22:07:36 +0200
Subject: ipv6: Fix build with gcc-4.4.5

Emil reported the following compiler errors:

net/ipv6/route.c: In function `rt6_sync_up`:
net/ipv6/route.c:3586: error: unknown field `nh_flags` specified in initializer
net/ipv6/route.c:3586: warning: missing braces around initializer
net/ipv6/route.c:3586: warning: (near initialization for `arg.<anonymous>`)
net/ipv6/route.c: In function `rt6_sync_down_dev`:
net/ipv6/route.c:3695: error: unknown field `event` specified in initializer
net/ipv6/route.c:3695: warning: missing braces around initializer
net/ipv6/route.c:3695: warning: (near initialization for `arg.<anonymous>`)

Problem is with the named initializers for the anonymous union members.
Fix this by adding curly braces around the initialization.

Fixes: 4c981e28d373 ("ipv6: Prepare to handle multiple netdev events")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Emil S Tantilov <emils.tantilov@gmail.com>
Tested-by: Emil S Tantilov <emils.tantilov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1076ae0ea9d5..c37bd9569172 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3583,7 +3583,9 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
 {
 	struct arg_netdev_event arg = {
 		.dev = dev,
-		.nh_flags = nh_flags,
+		{
+			.nh_flags = nh_flags,
+		},
 	};
 
 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
@@ -3692,7 +3694,9 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 {
 	struct arg_netdev_event arg = {
 		.dev = dev,
-		.event = event,
+		{
+			.event = event,
+		},
 	};
 
 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
-- 
cgit v1.2.3


From 9be9d04b28e75b52f83e3e10ee529a1ec992a2c0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Jan 2018 16:50:26 +0100
Subject: netfilter: nf_tables: flow_offload depends on flow_table

Without CONFIG_NF_FLOW_TABLE, the new nft_flow_offload module produces
a link error:

net/netfilter/nft_flow_offload.o: In function `nft_flow_offload_iterate_cleanup':
nft_flow_offload.c:(.text+0xb0): undefined reference to `nf_flow_table_iterate'
net/netfilter/nft_flow_offload.o: In function `flow_offload_iterate_cleanup':
nft_flow_offload.c:(.text+0x160): undefined reference to `flow_offload_dead'
net/netfilter/nft_flow_offload.o: In function `nft_flow_offload_eval':
nft_flow_offload.c:(.text+0xc4c): undefined reference to `flow_offload_alloc'
nft_flow_offload.c:(.text+0xc64): undefined reference to `flow_offload_add'
nft_flow_offload.c:(.text+0xc94): undefined reference to `flow_offload_free'

This adds a Kconfig dependency for it.

Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ea447826e127..9019fa98003d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -506,7 +506,7 @@ config NFT_CT
 	  connection tracking information such as the flow state.
 
 config NFT_FLOW_OFFLOAD
-	depends on NF_CONNTRACK
+	depends on NF_CONNTRACK && NF_FLOW_TABLE
 	tristate "Netfilter nf_tables hardware flow offload module"
 	help
 	  This option adds the "flow_offload" expression that you can use to
-- 
cgit v1.2.3


From e3eeacbac4ad34fac93f82a7cf15402bba83d22e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 13 Jan 2018 14:06:08 +0100
Subject: netfilter: x_tables: don't return garbage pointer on modprobe failure

request_module may return a positive error result from modprobe.
If we cast this to ERR_PTR, the result is a garbage pointer that is
not recognized as an error by IS_ERR checks.

Fix it by ignoring modprobe return values entirely, just retry the
table lookup instead.

Reported-by: syzbot+980925dbfbc7f93bc2ef@syzkaller.appspotmail.com
Fixes: 03d13b6868a2 ("netfilter: xtables: add and use xt_request_find_table_lock")
Fixes: 20651cefd25f ("netfilter: x_tables: unbreak module auto loading")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 5b8f3b7358e6..3c2548787d78 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1085,7 +1085,7 @@ struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
 #ifdef CONFIG_MODULES
 	if (IS_ERR(t)) {
 		int err = request_module("%stable_%s", xt_prefix[af], name);
-		if (err)
+		if (err < 0)
 			return ERR_PTR(err);
 		t = xt_find_table_lock(net, af, name);
 	}
-- 
cgit v1.2.3


From 83f1999caeb14e15df205e80d210699951733287 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Fri, 12 Jan 2018 17:36:27 -0700
Subject: netfilter: ipv6: nf_defrag: Pass on packets to stack per RFC2460

ipv6_defrag pulls network headers before fragment header. In case of
an error, the netfilter layer is currently dropping these packets.
This results in failure of some IPv6 standards tests which passed on
older kernels due to the netfilter framework using cloning.

The test case run here is a check for ICMPv6 error message replies
when some invalid IPv6 fragments are sent. This specific test case is
listed in https://www.ipv6ready.org/docs/Core_Conformance_Latest.pdf
in the Extension Header Processing Order section.

A packet with unrecognized option Type 11 is sent and the test expects
an ICMP error in line with RFC2460 section 4.2 -

11 - discard the packet and, only if the packet's Destination
     Address was not a multicast address, send an ICMP Parameter
     Problem, Code 2, message to the packet's Source Address,
     pointing to the unrecognized Option Type.

Since netfilter layer now drops all invalid IPv6 frag packets, we no
longer see the ICMP error message and fail the test case.

To fix this, save the transport header. If defrag is unable to process
the packet due to RFC2460, restore the transport header and allow packet
to be processed by stack. There is no change for other packet
processing paths.

Tested by confirming that stack sends an ICMP error when it receives
these packets. Also tested that fragmented ICMP pings succeed.

v1->v2: Instead of cloning always, save the transport_header and
restore it in case of this specific error. Update the title and
commit message accordingly.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 977d8900cfd1..ce53dcfda88a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -231,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
 		pr_debug("offset is too large.\n");
-		return -1;
+		return -EINVAL;
 	}
 
 	ecn = ip6_frag_ecn(ipv6_hdr(skb));
@@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			return -1;
+			return -EPROTO;
 		}
 		if (end > fq->q.len) {
 			/* Some bits beyond end -> corruption. */
@@ -358,7 +358,7 @@ found:
 discard_fq:
 	inet_frag_kill(&fq->q, &nf_frags);
 err:
-	return -1;
+	return -EINVAL;
 }
 
 /*
@@ -567,6 +567,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
 
 int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 {
+	u16 savethdr = skb->transport_header;
 	struct net_device *dev = skb->dev;
 	int fhoff, nhoff, ret;
 	struct frag_hdr *fhdr;
@@ -600,8 +601,12 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 	spin_lock_bh(&fq->q.lock);
 
-	if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) {
-		ret = -EINVAL;
+	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+	if (ret < 0) {
+		if (ret == -EPROTO) {
+			skb->transport_header = savethdr;
+			ret = 0;
+		}
 		goto out_unlock;
 	}
 
-- 
cgit v1.2.3


From b069b37adb1472b58f44606c8bdfe9d8da71a28c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 15 Jan 2018 16:49:05 +0100
Subject: netfilter: nf_defrag: mark xt_table structures 'const' again

As a side-effect of adding the module option, we now get a section
mismatch warning:

WARNING: net/ipv4/netfilter/iptable_raw.o(.data+0x1c): Section mismatch in reference from the variable packet_raw to the function .init.text:iptable_raw_table_init()
The variable packet_raw references
the function __init iptable_raw_table_init()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

Apparently it's ok to link to a __net_init function from .rodata but not
from .data. We can address this by rearranging the logic so that the
structure is read-only again. Instead of writing to the .priority field
later, we have an extra copy of the structure with that flag. An added
advantage is that we don't have writable function pointers with this
approach.

Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_raw.c  | 24 +++++++++++++++++++-----
 net/ipv6/netfilter/ip6table_raw.c | 24 +++++++++++++++++++-----
 2 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 29b64d3024e0..960625aabf04 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -17,7 +17,7 @@ static bool raw_before_defrag __read_mostly;
 MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
 module_param(raw_before_defrag, bool, 0000);
 
-static struct xt_table packet_raw = {
+static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -26,6 +26,15 @@ static struct xt_table packet_raw = {
 	.table_init = iptable_raw_table_init,
 };
 
+static const struct xt_table packet_raw_before_defrag = {
+	.name = "raw",
+	.valid_hooks =  RAW_VALID_HOOKS,
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV4,
+	.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
+	.table_init = iptable_raw_table_init,
+};
+
 /* The work comes in here from netfilter.c. */
 static unsigned int
 iptable_raw_hook(void *priv, struct sk_buff *skb,
@@ -39,15 +48,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init iptable_raw_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	const struct xt_table *table = &packet_raw;
 	int ret;
 
+	if (raw_before_defrag)
+		table = &packet_raw_before_defrag;
+
 	if (net->ipv4.iptable_raw)
 		return 0;
 
-	repl = ipt_alloc_initial_table(&packet_raw);
+	repl = ipt_alloc_initial_table(table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+	ret = ipt_register_table(net, table, repl, rawtable_ops,
 				 &net->ipv4.iptable_raw);
 	kfree(repl);
 	return ret;
@@ -68,14 +81,15 @@ static struct pernet_operations iptable_raw_net_ops = {
 static int __init iptable_raw_init(void)
 {
 	int ret;
+	const struct xt_table *table = &packet_raw;
 
 	if (raw_before_defrag) {
-		packet_raw.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG;
+		table = &packet_raw_before_defrag;
 
 		pr_info("Enabling raw table before defrag\n");
 	}
 
-	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+	rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook);
 	if (IS_ERR(rawtable_ops))
 		return PTR_ERR(rawtable_ops);
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3df7383f96d0..710fa0806c37 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -16,7 +16,7 @@ static bool raw_before_defrag __read_mostly;
 MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
 module_param(raw_before_defrag, bool, 0000);
 
-static struct xt_table packet_raw = {
+static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -25,6 +25,15 @@ static struct xt_table packet_raw = {
 	.table_init = ip6table_raw_table_init,
 };
 
+static const struct xt_table packet_raw_before_defrag = {
+	.name = "raw",
+	.valid_hooks = RAW_VALID_HOOKS,
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV6,
+	.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG,
+	.table_init = ip6table_raw_table_init,
+};
+
 /* The work comes in here from netfilter.c. */
 static unsigned int
 ip6table_raw_hook(void *priv, struct sk_buff *skb,
@@ -38,15 +47,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init ip6table_raw_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	const struct xt_table *table = &packet_raw;
 	int ret;
 
+	if (raw_before_defrag)
+		table = &packet_raw_before_defrag;
+
 	if (net->ipv6.ip6table_raw)
 		return 0;
 
-	repl = ip6t_alloc_initial_table(&packet_raw);
+	repl = ip6t_alloc_initial_table(table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops,
+	ret = ip6t_register_table(net, table, repl, rawtable_ops,
 				  &net->ipv6.ip6table_raw);
 	kfree(repl);
 	return ret;
@@ -67,15 +80,16 @@ static struct pernet_operations ip6table_raw_net_ops = {
 static int __init ip6table_raw_init(void)
 {
 	int ret;
+	const struct xt_table *table = &packet_raw;
 
 	if (raw_before_defrag) {
-		packet_raw.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG;
+		table = &packet_raw_before_defrag;
 
 		pr_info("Enabling raw table before defrag\n");
 	}
 
 	/* Register hooks */
-	rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
+	rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook);
 	if (IS_ERR(rawtable_ops))
 		return PTR_ERR(rawtable_ops);
 
-- 
cgit v1.2.3


From 41e4b391157ff20aa911d1ef6cf3d6db079e9e57 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 15 Jan 2018 16:49:06 +0100
Subject: netfilter: nf_defrag: move NF_CONNTRACK bits into #ifdef

We cannot access the skb->_nfct field when CONFIG_NF_CONNTRACK is
disabled:

net/ipv4/netfilter/nf_defrag_ipv4.c: In function 'ipv4_conntrack_defrag':
net/ipv4/netfilter/nf_defrag_ipv4.c:83:9: error: 'struct sk_buff' has no member named '_nfct'
net/ipv6/netfilter/nf_defrag_ipv6_hooks.c: In function 'ipv6_defrag':
net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68:9: error: 'struct sk_buff' has no member named '_nfct'

Both functions already have an #ifdef for this, so let's move the
check in there.

Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_defrag_ipv4.c       | 4 +++-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cbd987f6b1f8..a0d3ad60a411 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -78,9 +78,11 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
+	if (skb->_nfct == IP_CT_UNTRACKED)
+		return NF_ACCEPT;
 #endif
 	/* Gather fragments. */
-	if (skb->_nfct != IP_CT_UNTRACKED && ip_is_fragment(ip_hdr(skb))) {
+	if (ip_is_fragment(ip_hdr(skb))) {
 		enum ip_defrag_users user =
 			nf_ct_defrag_user(state->hook, skb);
 
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 87b503a8f5ef..c87b48359e8f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -63,10 +63,10 @@ static unsigned int ipv6_defrag(void *priv,
 	/* Previously seen (loopback)?	*/
 	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
-#endif
 
 	if (skb->_nfct == IP_CT_UNTRACKED)
 		return NF_ACCEPT;
+#endif
 
 	err = nf_ct_frag6_gather(state->net, skb,
 				 nf_ct6_defrag_user(state->hook, skb));
-- 
cgit v1.2.3


From 2406e7e546b223e8cf42c44ac7352d4d1fd1dbcd Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:02 +0100
Subject: devlink: Add per devlink instance lock

This is a preparation before introducing resources and hot reload support.
Currently there are two global locks, where one protects all devlink access,
and the second one protects devlink port access. This patch adds per devlink
instance lock which protects the internal members which are the sb/dpipe/
resource/ports. By introducing this lock the global devlink port lock can
be discarded.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |   1 +
 net/core/devlink.c    | 136 ++++++++++++++++++++++++++++----------------------
 2 files changed, 78 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9654e133599..4d2c6fc94837 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -30,6 +30,7 @@ struct devlink {
 	const struct devlink_ops *ops;
 	struct device *dev;
 	possible_net_t _net;
+	struct mutex lock;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7d430c1d9c3e..2f71734c4ff6 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-/* devlink_port_mutex
- *
- * Shared lock to guard lists of ports in all devlink devices.
- */
-static DEFINE_MUTEX(devlink_port_mutex);
-
 static struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
@@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
 #define DEVLINK_NL_FLAG_NEED_DEVLINK	BIT(0)
 #define DEVLINK_NL_FLAG_NEED_PORT	BIT(1)
 #define DEVLINK_NL_FLAG_NEED_SB		BIT(2)
-#define DEVLINK_NL_FLAG_LOCK_PORTS	BIT(3)
-	/* port is not needed but we need to ensure they don't
-	 * change in the middle of command
-	 */
+
+/* The per devlink instance lock is taken by default in the pre-doit
+ * operation, yet several commands do not require this. The global
+ * devlink lock is taken and protects from disruption by user-calls.
+ */
+#define DEVLINK_NL_FLAG_NO_LOCK		BIT(3)
 
 static int devlink_nl_pre_doit(const struct genl_ops *ops,
 			       struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink;
+	int err;
 
 	mutex_lock(&devlink_mutex);
 	devlink = devlink_get_from_info(info);
@@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 		mutex_unlock(&devlink_mutex);
 		return PTR_ERR(devlink);
 	}
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_lock(&devlink->lock);
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
 		info->user_ptr[0] = devlink;
 	} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
 		struct devlink_port *devlink_port;
 
-		mutex_lock(&devlink_port_mutex);
 		devlink_port = devlink_port_get_from_info(devlink, info);
 		if (IS_ERR(devlink_port)) {
-			mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_port);
+			err = PTR_ERR(devlink_port);
+			goto unlock;
 		}
 		info->user_ptr[0] = devlink_port;
 	}
-	if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
-		mutex_lock(&devlink_port_mutex);
-	}
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
 		struct devlink_sb *devlink_sb;
 
 		devlink_sb = devlink_sb_get_from_info(devlink, info);
 		if (IS_ERR(devlink_sb)) {
-			if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
-				mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_sb);
+			err = PTR_ERR(devlink_sb);
+			goto unlock;
 		}
 		info->user_ptr[1] = devlink_sb;
 	}
 	return 0;
+
+unlock:
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+	return err;
 }
 
 static void devlink_nl_post_doit(const struct genl_ops *ops,
 				 struct sk_buff *skb, struct genl_info *info)
 {
-	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
-	    ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
-		mutex_unlock(&devlink_port_mutex);
+	struct devlink *devlink;
+
+	devlink = devlink_get_from_info(info);
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
 
@@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			if (idx < start) {
 				idx++;
@@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq,
 						   NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			if (idx < start) {
 				idx++;
@@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 						 NETLINK_CB(cb->skb).portid,
 						 cb->nlh->nlmsg_seq,
 						 NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
 						   devlink_sb,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_port_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_port_pool_get_dumpit(msg, start, &idx,
 							devlink, devlink_sb,
 							NETLINK_CB(cb->skb).portid,
 							cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
 			continue;
+
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
 							   devlink,
 							   devlink_sb,
 							   NETLINK_CB(cb->skb).portid,
 							   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -2322,14 +2337,16 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.doit = devlink_nl_cmd_port_split_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
 		.doit = devlink_nl_cmd_port_unsplit_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_GET,
@@ -2397,8 +2414,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
@@ -2406,8 +2422,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_ESWITCH_GET,
@@ -2488,6 +2503,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	mutex_init(&devlink->lock);
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -2550,16 +2566,16 @@ int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
 			  unsigned int port_index)
 {
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_port_index_exists(devlink, port_index)) {
-		mutex_unlock(&devlink_port_mutex);
+		mutex_unlock(&devlink->lock);
 		return -EEXIST;
 	}
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	devlink_port->registered = true;
 	list_add_tail(&devlink_port->list, &devlink->port_list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -2572,10 +2588,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register);
  */
 void devlink_port_unregister(struct devlink_port *devlink_port)
 {
+	struct devlink *devlink = devlink_port->devlink;
+
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_port_unregister);
 
@@ -2651,7 +2669,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	struct devlink_sb *devlink_sb;
 	int err = 0;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_sb_index_exists(devlink, sb_index)) {
 		err = -EEXIST;
 		goto unlock;
@@ -2670,7 +2688,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	devlink_sb->egress_tc_count = egress_tc_count;
 	list_add_tail(&devlink_sb->list, &devlink->sb_list);
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_sb_register);
@@ -2679,11 +2697,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 {
 	struct devlink_sb *devlink_sb;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
 	WARN_ON(!devlink_sb);
 	list_del(&devlink_sb->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree(devlink_sb);
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
@@ -2699,9 +2717,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 int devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = dpipe_headers;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
@@ -2715,9 +2733,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
  */
 void devlink_dpipe_headers_unregister(struct devlink *devlink)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = NULL;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
 
@@ -2783,9 +2801,9 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 	table->priv = priv;
 	table->counter_control_extern = counter_control_extern;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -2801,17 +2819,17 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 {
 	struct devlink_dpipe_table *table;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
 					 table_name);
 	if (!table)
 		goto unlock;
 	list_del_rcu(&table->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree_rcu(table, rcu);
 	return;
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
-- 
cgit v1.2.3


From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:03 +0100
Subject: devlink: Add support for resource abstraction

Add support for hardware resource abstraction over devlink. Each resource
is identified via an id; furthermore, it contains information regarding its
size and its related sub resources. Each resource can also provide its
current occupancy.

In some cases the sizes of some resources can be changed, yet for those
changes to take place a hot driver reload may be needed. The reload
capability will be introduced in the next patch.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  96 +++++++++++
 include/uapi/linux/devlink.h |  18 +++
 net/core/devlink.c           | 374 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 488 insertions(+)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4d2c6fc94837..ceb1895d119b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -26,6 +26,7 @@ struct devlink {
 	struct list_head port_list;
 	struct list_head sb_list;
 	struct list_head dpipe_table_list;
+	struct list_head resource_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -224,6 +225,61 @@ struct devlink_dpipe_headers {
 	unsigned int headers_count;
 };
 
+/**
+ * struct devlink_resource_ops - resource ops
+ * @occ_get: get the occupied size
+ * @size_validate: validate the size of the resource before update, reload
+ *                 is needed for changes to take place
+ */
+struct devlink_resource_ops {
+	u64 (*occ_get)(struct devlink *devlink);
+	int (*size_validate)(struct devlink *devlink, u64 size,
+			     struct netlink_ext_ack *extack);
+};
+
+/**
+ * struct devlink_resource_size_params - resource's size parameters
+ * @size_min: minimum size which can be set
+ * @size_max: maximum size which can be set
+ * @size_granularity: size granularity
+ * @unit: resource's basic unit
+ */
+struct devlink_resource_size_params {
+	u64 size_min;
+	u64 size_max;
+	u64 size_granularity;
+	enum devlink_resource_unit unit;
+};
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ *              including its children
+ * @parent: parent resource
+ * @size_params: size parameters
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @resource_ops: resource ops
+ */
+struct devlink_resource {
+	const char *name;
+	u64 id;
+	u64 size;
+	u64 size_new;
+	bool size_valid;
+	struct devlink_resource *parent;
+	struct devlink_resource_size_params *size_params;
+	struct list_head list;
+	struct list_head resource_list;
+	const struct devlink_resource_ops *resource_ops;
+};
+
+#define DEVLINK_RESOURCE_ID_PARENT_TOP 0
+
 struct devlink_ops {
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
@@ -333,6 +389,20 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_size_params *size_params,
+			      const struct devlink_resource_ops *resource_ops);
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource);
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size);
+
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -469,6 +539,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
 	return 0;
 }
 
+static inline int
+devlink_resource_register(struct devlink *devlink,
+			  const char *resource_name,
+			  bool top_hierarchy,
+			  u64 resource_size,
+			  u64 resource_id,
+			  u64 parent_resource_id,
+			  struct devlink_resource_size_params *size_params,
+			  const struct devlink_resource_ops *resource_ops)
+{
+	return 0;
+}
+
+static inline void
+devlink_resources_unregister(struct devlink *devlink,
+			     struct devlink_resource *resource)
+{
+}
+
+static inline int
+devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
+			  u64 *p_resource_size)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6665df69e26a..f89950443e17 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -70,6 +70,8 @@ enum devlink_command {
 	DEVLINK_CMD_DPIPE_ENTRIES_GET,
 	DEVLINK_CMD_DPIPE_HEADERS_GET,
 	DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+	DEVLINK_CMD_RESOURCE_SET,
+	DEVLINK_CMD_RESOURCE_DUMP,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -202,6 +204,18 @@ enum devlink_attr {
 	DEVLINK_ATTR_PAD,
 
 	DEVLINK_ATTR_ESWITCH_ENCAP_MODE,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_LIST,		/* nested */
+	DEVLINK_ATTR_RESOURCE,			/* nested */
+	DEVLINK_ATTR_RESOURCE_NAME,		/* string */
+	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_NEW,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_VALID,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_SIZE_MIN,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_MAX,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_GRAN,        /* u64 */
+	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
+	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
@@ -245,4 +259,8 @@ enum devlink_dpipe_header_id {
 	DEVLINK_DPIPE_HEADER_IPV6,
 };
 
+enum devlink_resource_unit {
+	DEVLINK_RESOURCE_UNIT_ENTRY,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2f71734c4ff6..89b3704fa450 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2288,6 +2288,233 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
+struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		struct devlink_resource *child_resource;
+
+		if (resource->id == resource_id)
+			return resource;
+
+		child_resource = devlink_resource_find(devlink, resource,
+						       resource_id);
+		if (child_resource)
+			return child_resource;
+	}
+	return NULL;
+}
+
+void devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	bool size_valid = true;
+	u64 parts_size = 0;
+
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list)
+		parts_size += child_resource->size_new;
+
+	if (parts_size > resource->size)
+		size_valid = false;
+out:
+	resource->size_valid = size_valid;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	u64 resource_id;
+	u64 size;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+	    !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+		return -EINVAL;
+	resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+
+	if (!resource->resource_ops->size_validate)
+		return -EINVAL;
+
+	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+	err = resource->resource_ops->size_validate(devlink, size,
+						    info->extack);
+	if (err)
+		return err;
+
+	resource->size_new = size;
+	devlink_resource_validate_children(resource);
+	if (resource->parent)
+		devlink_resource_validate_children(resource->parent);
+	return 0;
+}
+
+static void
+devlink_resource_size_params_put(struct devlink_resource *resource,
+				 struct sk_buff *skb)
+{
+	struct devlink_resource_size_params *size_params;
+
+	size_params = resource->size_params;
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+			  size_params->size_granularity, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+			  size_params->size_max, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+			  size_params->size_min, DEVLINK_ATTR_PAD);
+	nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit);
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+				struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	struct nlattr *child_resource_attr;
+	struct nlattr *resource_attr;
+
+	resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+	if (!resource_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+			      DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (resource->size != resource->size_new)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+				  resource->size_new, DEVLINK_ATTR_PAD);
+	if (resource->resource_ops && resource->resource_ops->occ_get)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				  resource->resource_ops->occ_get(devlink),
+				  DEVLINK_ATTR_PAD);
+	devlink_resource_size_params_put(resource, skb);
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+		       resource->size_valid))
+		goto nla_put_failure;
+
+	child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!child_resource_attr)
+		goto nla_put_failure;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list) {
+		if (devlink_resource_put(devlink, skb, child_resource))
+			goto resource_put_failure;
+	}
+
+	nla_nest_end(skb, child_resource_attr);
+out:
+	nla_nest_end(skb, resource_attr);
+	return 0;
+
+resource_put_failure:
+	nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+	nla_nest_cancel(skb, resource_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+				 enum devlink_command cmd, int flags)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	struct nlattr *resources_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i;
+	int err;
+
+	resource = list_first_entry(&devlink->resource_list,
+				    struct devlink_resource, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+
+	resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!resources_attr)
+		goto nla_put_failure;
+
+	incomplete = false;
+	i = 0;
+	list_for_each_entry_from(resource, &devlink->resource_list, list) {
+		err = devlink_resource_put(devlink, skb, resource);
+		if (err) {
+			if (!i)
+				goto err_resource_put;
+			incomplete = true;
+			break;
+		}
+		i++;
+	}
+	nla_nest_end(skb, resources_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			goto err_skb_send_alloc;
+		goto send_done;
+	}
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_resource_put:
+err_skb_send_alloc:
+	genlmsg_cancel(skb, hdr);
+	nlmsg_free(skb);
+	return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (list_empty(&devlink->resource_list))
+		return -EOPNOTSUPP;
+
+	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2306,6 +2533,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -2466,6 +2695,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.doit = devlink_nl_cmd_resource_set,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2503,6 +2746,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	INIT_LIST_HEAD(&devlink->resource_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -2833,6 +3077,136 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@top_hierarchy: register at the top level instead of under a parent
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id, rejected with -EINVAL if already in use
+ *	@parent_resource_id: parent's id, looked up unless @top_hierarchy
+ *	@size_params: size parameters
+ *	@resource_ops: resource ops
+ *
+ *	Returns 0, -EINVAL (duplicate id or unknown parent) or -ENOMEM.
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_size_params *size_params,
+			      const struct devlink_resource_ops *resource_ops)
+{
+	struct devlink_resource *resource;
+	struct list_head *resource_list;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (resource) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (top_hierarchy) {
+		resource_list = &devlink->resource_list;
+	} else {
+		struct devlink_resource *parent_resource;
+
+		parent_resource = devlink_resource_find(devlink, NULL,
+							parent_resource_id);
+		if (!parent_resource) {
+			kfree(resource); /* not linked anywhere yet */
+			err = -EINVAL;
+			goto out;
+		}
+		resource_list = &parent_resource->resource_list;
+		resource->parent = parent_resource;
+	}
+
+	resource->name = resource_name;
+	resource->size = resource_size;
+	resource->size_new = resource_size;
+	resource->id = resource_id;
+	resource->resource_ops = resource_ops;
+	resource->size_valid = true;
+	resource->size_params = size_params;
+	INIT_LIST_HEAD(&resource->resource_list);
+	list_add_tail(&resource->list, resource_list);
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *	@resource: resource
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource)
+{
+	struct devlink_resource *tmp, *child_resource;
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	if (!resource)
+		mutex_lock(&devlink->lock);
+
+	list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
+		devlink_resources_unregister(devlink, child_resource);
+		list_del(&child_resource->list);
+		kfree(child_resource);
+	}
+
+	if (!resource)
+		mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ *	devlink_resource_size_get - get and update size
+ *
+ *	@devlink: devlink
+ *	@resource_id: the requested resource id
+ *	@p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource) {
+		err = -EINVAL;
+		goto out;
+	}
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:04 +0100
Subject: devlink: Add support for reload

Add support for performing driver hot reload.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 47 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ceb1895d119b..c698883fb0bb 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -281,6 +281,7 @@ struct devlink_resource {
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
 struct devlink_ops {
+	int (*reload)(struct devlink *devlink);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index f89950443e17..555ddcaf0be2 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -73,6 +73,11 @@ enum devlink_command {
 	DEVLINK_CMD_RESOURCE_SET,
 	DEVLINK_CMD_RESOURCE_DUMP,
 
+	/* Hot driver reload; makes pending configuration changes take effect.
+	 * The devlink instance is not released during the process.
+	 */
+	DEVLINK_CMD_RELOAD,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89b3704fa450..4c3d85560436 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2515,6 +2515,45 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
 	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
 }
 
+static int
+devlink_resources_validate(struct devlink *devlink,
+			   struct devlink_resource *resource,
+			   struct genl_info *info)
+{
+	struct list_head *resource_list;
+	int err = 0;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		if (!resource->size_valid)
+			return -EINVAL;
+		err = devlink_resources_validate(devlink, resource, info);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	if (!devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
+		return err;
+	}
+	return devlink->ops->reload(devlink);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2709,6 +2748,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.doit = devlink_nl_cmd_reload,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:05 +0100
Subject: devlink: Add relation between dpipe and resource

The hardware processes which are modeled via dpipe commonly use some
internal hardware resources. Exposing this relation can improve the
understanding of hardware limitations. The number of resource units
consumed per table entry is also provided for each table.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 17 +++++++++++++++++
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index c698883fb0bb..6545b03e97f7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -183,6 +183,9 @@ struct devlink_dpipe_table_ops;
  * @counters_enabled: indicates if counters are active
  * @counter_control_extern: indicates if counter control is in dpipe or
  *			    external tool
+ * @resource_valid: Indicate that the resource id is valid
+ * @resource_id: relative resource this table is related to
+ * @resource_units: number of resource's unit consumed per table's entry
  * @table_ops: table operations
  * @rcu: rcu
  */
@@ -192,6 +195,9 @@ struct devlink_dpipe_table {
 	const char *name;
 	bool counters_enabled;
 	bool counter_control_extern;
+	bool resource_valid;
+	u64 resource_id;
+	u64 resource_units;
 	struct devlink_dpipe_table_ops *table_ops;
 	struct rcu_head rcu;
 };
@@ -403,6 +409,9 @@ void devlink_resources_unregister(struct devlink *devlink,
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id,
+				     u64 resource_units);
 
 #else
 
@@ -566,6 +575,14 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_dpipe_table_resource_set(struct devlink *devlink,
+				 const char *table_name, u64 resource_id,
+				 u64 resource_units)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 555ddcaf0be2..1df65a4c2044 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -221,6 +221,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_RESOURCE_SIZE_GRAN,        /* u64 */
 	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
 	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4c3d85560436..dd7d6dd07bfb 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1694,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
 		       table->counters_enabled))
 		goto nla_put_failure;
 
+	if (table->resource_valid) {
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				  table->resource_id, DEVLINK_ATTR_PAD);
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+				  table->resource_units, DEVLINK_ATTR_PAD);
+	}
 	if (devlink_dpipe_matches_put(table, skb))
 		goto nla_put_failure;
 
@@ -3254,6 +3260,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ *	devlink_dpipe_table_resource_set - set the resource id
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@resource_id: resource id
+ *	@resource_units: number of resource's units consumed per table's entry
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id,
+				     u64 resource_units)
+{
+	struct devlink_dpipe_table *table;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	if (!table) {
+		err = -EINVAL;
+		goto out;
+	}
+	table->resource_id = resource_id;
+	table->resource_units = resource_units;
+	table->resource_valid = true;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From e88f2be83282d5ffc8f5ffe4c22606bf62eb1ac7 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 15 Jan 2018 17:56:28 +0100
Subject: tipc: fix race condition at topology server receive

We have identified a race condition during reception of socket
events and messages in the topology server.

- The function tipc_close_conn() is releasing the corresponding
  struct tipc_subscriber instance without considering that there
  may still be items in the receive work queue. When those are
  scheduled, in the function tipc_receive_from_work(), they are
  using the subscriber pointer stored in struct tipc_conn, without
  first checking if this is valid or not. This will sometimes
  lead to crashes, as the next call of tipc_conn_recvmsg() will
  access the now deleted item.
  We fix this by making the usage of this pointer conditional on
  whether the connection is active or not. I.e., we check the condition
  test_bit(CF_CONNECTED) before making the call tipc_conn_recvmsg().

- Since the two functions may be running on different cores, the
  condition test described above is not enough. tipc_close_conn()
  may come in between and delete the subscriber item after the condition
  test is done, but before tipc_conn_recvmsg() is finished. This
  happens less frequently than the problem described above, but leads
  to the same symptoms.

  We fix this by using the existing sk_callback_lock for mutual
  exclusion in the two functions. In addition, we have to move
  a call to tipc_conn_terminate() outside the mentioned lock to
  avoid deadlock.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 70 +++++++++++++++++++++++++++++--------------------------
 net/tipc/server.h |  6 ++---
 net/tipc/subscr.c | 21 +++++++++--------
 3 files changed, 51 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 8ee5e86b7870..c0d331f13eee 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -132,10 +132,11 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
 
 	spin_lock_bh(&s->idr_lock);
 	con = idr_find(&s->conn_idr, conid);
-	if (con && test_bit(CF_CONNECTED, &con->flags))
-		conn_get(con);
-	else
-		con = NULL;
+	if (con) {
+		if (!test_bit(CF_CONNECTED, &con->flags) ||
+		    !kref_get_unless_zero(&con->kref))
+			con = NULL;
+	}
 	spin_unlock_bh(&s->idr_lock);
 	return con;
 }
@@ -183,35 +184,28 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
-static void tipc_unregister_callbacks(struct tipc_conn *con)
-{
-	struct sock *sk = con->sock->sk;
-
-	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_user_data = NULL;
-	write_unlock_bh(&sk->sk_callback_lock);
-}
-
 static void tipc_close_conn(struct tipc_conn *con)
 {
 	struct tipc_server *s = con->server;
+	struct sock *sk = con->sock->sk;
+	bool disconnect = false;
 
-	if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
-		if (con->sock)
-			tipc_unregister_callbacks(con);
-
+	write_lock_bh(&sk->sk_callback_lock);
+	disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags);
+	if (disconnect) {
+		sk->sk_user_data = NULL;
 		if (con->conid)
 			s->tipc_conn_release(con->conid, con->usr_data);
-
-		/* We shouldn't flush pending works as we may be in the
-		 * thread. In fact the races with pending rx/tx work structs
-		 * are harmless for us here as we have already deleted this
-		 * connection from server connection list.
-		 */
-		if (con->sock)
-			kernel_sock_shutdown(con->sock, SHUT_RDWR);
-		conn_put(con);
 	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	/* Handle concurrent calls from sending and receiving threads */
+	if (!disconnect)
+		return;
+
+	/* Don't flush pending works, -just let them expire */
+	kernel_sock_shutdown(con->sock, SHUT_RDWR);
+	conn_put(con);
 }
 
 static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
@@ -248,9 +242,10 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
 
 static int tipc_receive_from_sock(struct tipc_conn *con)
 {
-	struct msghdr msg = {};
 	struct tipc_server *s = con->server;
+	struct sock *sk = con->sock->sk;
 	struct sockaddr_tipc addr;
+	struct msghdr msg = {};
 	struct kvec iov;
 	void *buf;
 	int ret;
@@ -271,12 +266,15 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 		goto out_close;
 	}
 
-	s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, &addr,
-			     con->usr_data, buf, ret);
-
+	read_lock_bh(&sk->sk_callback_lock);
+	if (test_bit(CF_CONNECTED, &con->flags))
+		ret = s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid,
+					   &addr, con->usr_data, buf, ret);
+	read_unlock_bh(&sk->sk_callback_lock);
 	kmem_cache_free(s->rcvbuf_cache, buf);
-
-	return 0;
+	if (ret < 0)
+		tipc_conn_terminate(s, con->conid);
+	return ret;
 
 out_close:
 	if (ret != -EWOULDBLOCK)
@@ -525,11 +523,17 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
 {
 	struct tipc_conn *con;
+	struct tipc_server *srv;
 
 	con = tipc_conn_lookup(tipc_topsrv(net), conid);
 	if (!con)
 		return;
-	tipc_close_conn(con);
+
+	test_and_clear_bit(CF_CONNECTED, &con->flags);
+	srv = con->server;
+	if (con->conid)
+		srv->tipc_conn_release(con->conid, con->usr_data);
+	conn_put(con);
 	conn_put(con);
 }
 
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 17f49ee44cfd..64df7513cd70 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -74,9 +74,9 @@ struct tipc_server {
 	int max_rcvbuf_size;
 	void *(*tipc_conn_new)(int conid);
 	void (*tipc_conn_release)(int conid, void *usr_data);
-	void (*tipc_conn_recvmsg)(struct net *net, int conid,
-				  struct sockaddr_tipc *addr, void *usr_data,
-				  void *buf, size_t len);
+	int (*tipc_conn_recvmsg)(struct net *net, int conid,
+				 struct sockaddr_tipc *addr, void *usr_data,
+				 void *buf, size_t len);
 	struct sockaddr_tipc *saddr;
 	char name[TIPC_SERVER_NAME_LEN];
 	int imp;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 44df528ed6ab..68e26470c516 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -289,17 +289,16 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 	return sub;
 }
 
-static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
-				   struct tipc_subscriber *subscriber, int swap,
-				   bool status)
+static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
+				  struct tipc_subscriber *subscriber, int swap,
+				  bool status)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub = NULL;
 	u32 timeout;
 
 	sub = tipc_subscrp_create(net, s, swap);
 	if (!sub)
-		return tipc_conn_terminate(tn->topsrv, subscriber->conid);
+		return -1;
 
 	spin_lock_bh(&subscriber->lock);
 	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
@@ -313,6 +312,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 
 	if (timeout != TIPC_WAIT_FOREVER)
 		mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
+	return 0;
 }
 
 /* Handle one termination request for the subscriber */
@@ -322,9 +322,9 @@ static void tipc_subscrb_release_cb(int conid, void *usr_data)
 }
 
 /* Handle one request to create a new subscription for the subscriber */
-static void tipc_subscrb_rcv_cb(struct net *net, int conid,
-				struct sockaddr_tipc *addr, void *usr_data,
-				void *buf, size_t len)
+static int tipc_subscrb_rcv_cb(struct net *net, int conid,
+			       struct sockaddr_tipc *addr, void *usr_data,
+			       void *buf, size_t len)
 {
 	struct tipc_subscriber *subscriber = usr_data;
 	struct tipc_subscr *s = (struct tipc_subscr *)buf;
@@ -338,10 +338,11 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 	/* Detect & process a subscription cancellation request */
 	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
 		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
-		return tipc_subscrp_cancel(s, subscriber);
+		tipc_subscrp_cancel(s, subscriber);
+		return 0;
 	}
 	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
-	tipc_subscrp_subscribe(net, s, subscriber, swap, status);
+	return tipc_subscrp_subscribe(net, s, subscriber, swap, status);
 }
 
 /* Handle one request to establish a new subscriber */
-- 
cgit v1.2.3


From 96890d62523c2cddc2c053ad29de35c4d935cf11 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jan 2018 00:42:40 +0300
Subject: net: delete /proc THIS_MODULE references

/proc has been ignoring struct file_operations::owner field for 10 years.
Specifically, it started with commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba
("Fix rmmod/read/write races in /proc entries"). Notice the chunk where
inode->i_fop is initialized with proxy struct file_operations for
regular files:

	-               if (de->proc_fops)
	-                       inode->i_fop = de->proc_fops;
	+               if (de->proc_fops) {
	+                       if (S_ISREG(inode->i_mode))
	+                               inode->i_fop = &proc_reg_file_ops;
	+                       else
	+                               inode->i_fop = de->proc_fops;
	+               }

VFS stopped pinning module at this point.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlanproc.c               | 2 --
 net/appletalk/aarp.c               | 1 -
 net/appletalk/atalk_proc.c         | 3 ---
 net/atm/br2684.c                   | 1 -
 net/atm/lec.c                      | 1 -
 net/atm/mpoa_proc.c                | 1 -
 net/atm/proc.c                     | 1 -
 net/ax25/af_ax25.c                 | 1 -
 net/ax25/ax25_route.c              | 1 -
 net/ax25/ax25_uid.c                | 1 -
 net/bluetooth/cmtp/capi.c          | 1 -
 net/can/bcm.c                      | 1 -
 net/can/proc.c                     | 6 ------
 net/core/neighbour.c               | 1 -
 net/core/net-procfs.c              | 4 ----
 net/core/pktgen.c                  | 3 ---
 net/core/sock.c                    | 1 -
 net/decnet/af_decnet.c             | 1 -
 net/decnet/dn_dev.c                | 1 -
 net/decnet/dn_neigh.c              | 1 -
 net/decnet/dn_route.c              | 1 -
 net/ipv4/arp.c                     | 1 -
 net/ipv4/fib_trie.c                | 3 ---
 net/ipv4/igmp.c                    | 2 --
 net/ipv4/ipconfig.c                | 1 -
 net/ipv4/ipmr.c                    | 2 --
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 -
 net/ipv4/proc.c                    | 3 ---
 net/ipv4/raw.c                     | 1 -
 net/ipv4/route.c                   | 3 ---
 net/ipv4/tcp_ipv4.c                | 1 -
 net/ipv4/udp.c                     | 1 -
 net/ipv4/udplite.c                 | 1 -
 net/ipv6/addrconf.c                | 1 -
 net/ipv6/anycast.c                 | 1 -
 net/ipv6/ip6_flowlabel.c           | 1 -
 net/ipv6/ip6mr.c                   | 2 --
 net/ipv6/mcast.c                   | 2 --
 net/ipv6/proc.c                    | 3 ---
 net/ipv6/raw.c                     | 1 -
 net/ipv6/route.c                   | 2 --
 net/ipv6/tcp_ipv6.c                | 1 -
 net/ipv6/udp.c                     | 1 -
 net/ipv6/udplite.c                 | 1 -
 net/ipx/ipx_proc.c                 | 3 ---
 net/kcm/kcmproc.c                  | 2 --
 net/l2tp/l2tp_ppp.c                | 1 -
 net/llc/llc_proc.c                 | 2 --
 net/netlink/af_netlink.c           | 1 -
 net/netrom/af_netrom.c             | 1 -
 net/netrom/nr_route.c              | 2 --
 net/packet/af_packet.c             | 1 -
 net/phonet/socket.c                | 2 --
 net/rose/af_rose.c                 | 1 -
 net/rose/rose_route.c              | 3 ---
 net/rxrpc/proc.c                   | 2 --
 net/sched/sch_api.c                | 1 -
 net/sctp/proc.c                    | 1 -
 net/unix/af_unix.c                 | 1 -
 net/wireless/wext-proc.c           | 1 -
 net/xfrm/xfrm_proc.c               | 1 -
 61 files changed, 96 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index 5f1446c9f098..a662ccc166df 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -80,7 +80,6 @@ static int vlan_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations vlan_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = vlan_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -97,7 +96,6 @@ static int vlandev_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations vlandev_fops = {
-	.owner = THIS_MODULE,
 	.open    = vlandev_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 309d7dbb36e8..d4c1021e74e1 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -1047,7 +1047,6 @@ static int aarp_seq_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations atalk_seq_arp_fops = {
-	.owner		= THIS_MODULE,
 	.open           = aarp_seq_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index af46bc49e1e9..a3bf9d519193 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -226,7 +226,6 @@ static int atalk_seq_socket_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations atalk_seq_interface_fops = {
-	.owner		= THIS_MODULE,
 	.open		= atalk_seq_interface_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -234,7 +233,6 @@ static const struct file_operations atalk_seq_interface_fops = {
 };
 
 static const struct file_operations atalk_seq_route_fops = {
-	.owner		= THIS_MODULE,
 	.open		= atalk_seq_route_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -242,7 +240,6 @@ static const struct file_operations atalk_seq_route_fops = {
 };
 
 static const struct file_operations atalk_seq_socket_fops = {
-	.owner		= THIS_MODULE,
 	.open		= atalk_seq_socket_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 4e111196f902..fd94bea36ee8 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -824,7 +824,6 @@ static int br2684_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations br2684_proc_ops = {
-	.owner = THIS_MODULE,
 	.open = br2684_proc_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 6676e3433261..09a1f056712a 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -992,7 +992,6 @@ static int lec_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations lec_seq_fops = {
-	.owner = THIS_MODULE,
 	.open = lec_seq_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 2212da9c2da2..b93cc0f18292 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -57,7 +57,6 @@ static int parse_qos(const char *buff);
  *   Define allowed FILE OPERATIONS
  */
 static const struct file_operations mpc_file_operations = {
-	.owner =	THIS_MODULE,
 	.open =		proc_mpc_open,
 	.read =		seq_read,
 	.llseek =	seq_lseek,
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 642f9272ab95..edc48edc95c1 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -37,7 +37,6 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
 				 size_t count, loff_t *pos);
 
 static const struct file_operations proc_atm_dev_ops = {
-	.owner =	THIS_MODULE,
 	.read =		proc_dev_atm_read,
 	.llseek =	noop_llseek,
 };
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 06eac1f50c5e..47fdd399626b 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1931,7 +1931,6 @@ static int ax25_info_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ax25_info_fops = {
-	.owner = THIS_MODULE,
 	.open = ax25_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 0446b892618a..525558972fd9 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -336,7 +336,6 @@ static int ax25_rt_info_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations ax25_route_fops = {
-	.owner = THIS_MODULE,
 	.open = ax25_rt_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 83b035f56202..4ebe91ba317a 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -194,7 +194,6 @@ static int ax25_uid_info_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations ax25_uid_fops = {
-	.owner = THIS_MODULE,
 	.open = ax25_uid_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index bb308224099c..426a92f02db4 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -527,7 +527,6 @@ static int cmtp_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations cmtp_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= cmtp_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 13690334efa3..ac5e5e34fee3 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -246,7 +246,6 @@ static int bcm_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations bcm_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= bcm_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/can/proc.c b/net/can/proc.c
index 45e38a3085bc..fdf704e9bb8c 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -276,7 +276,6 @@ static int can_stats_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_stats_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_stats_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -310,7 +309,6 @@ static int can_reset_stats_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_reset_stats_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_reset_stats_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -329,7 +327,6 @@ static int can_version_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_version_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_version_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -382,7 +379,6 @@ static int can_rcvlist_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_rcvlist_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_rcvlist_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -450,7 +446,6 @@ static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_rcvlist_sff_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_rcvlist_sff_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -494,7 +489,6 @@ static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations can_rcvlist_eff_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= can_rcvlist_eff_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d1f5fe986edd..f96f9f58b894 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)
 };
 
 static const struct file_operations neigh_stat_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open 	 = neigh_stat_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 615ccab55f38..e010bb800d7b 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dev_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = dev_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations softnet_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = softnet_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ptype_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ptype_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dev_mc_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = dev_mc_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b9ce241cd28c..4fcfcb14e7c6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -523,7 +523,6 @@ static int pgctrl_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pgctrl_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1804,7 +1803,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_if_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pktgen_if_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1942,7 +1940,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_thread_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pktgen_thread_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/core/sock.c b/net/core/sock.c
index 72d14b221784..abf4cbff99b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3362,7 +3362,6 @@ static int proto_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations proto_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= proto_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 518cea17b811..d93e5b887f03 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -2320,7 +2320,6 @@ static int dn_socket_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dn_socket_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= dn_socket_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index d1885cf59319..c9f5e1ebb9c8 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1389,7 +1389,6 @@ static int dn_dev_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dn_dev_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = dn_dev_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 528119a5618e..6e37d9e6345e 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -597,7 +597,6 @@ static int dn_neigh_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dn_neigh_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= dn_neigh_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 73160d4aebbe..ef20b8e31669 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1860,7 +1860,6 @@ static int dn_rt_cache_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dn_rt_cache_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = dn_rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a8d7c5a9fb05..56aef027df31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1420,7 +1420,6 @@ static int arp_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations arp_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open           = arp_seq_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5ddc4aefff12..5530cd6fdbc7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2334,7 +2334,6 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations fib_triestat_fops = {
-	.owner	= THIS_MODULE,
 	.open	= fib_triestat_seq_open,
 	.read	= seq_read,
 	.llseek	= seq_lseek,
@@ -2521,7 +2520,6 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations fib_trie_fops = {
-	.owner  = THIS_MODULE,
 	.open   = fib_trie_seq_open,
 	.read   = seq_read,
 	.llseek = seq_lseek,
@@ -2715,7 +2713,6 @@ static int fib_route_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations fib_route_fops = {
-	.owner  = THIS_MODULE,
 	.open   = fib_route_seq_open,
 	.read   = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 726f6b608274..02f00be12bb0 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2832,7 +2832,6 @@ static int igmp_mc_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations igmp_mc_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	igmp_mc_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
@@ -2979,7 +2978,6 @@ static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations igmp_mcf_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	igmp_mcf_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index abdebca848c9..e9e488e72900 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1322,7 +1322,6 @@ static int pnp_seq_open(struct inode *indoe, struct file *file)
 }
 
 static const struct file_operations pnp_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= pnp_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fd5f19c988e4..a819fab45d17 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3045,7 +3045,6 @@ static int ipmr_vif_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ipmr_vif_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ipmr_vif_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -3198,7 +3197,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ipmr_mfc_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ipmr_mfc_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 69060e3abe85..c29a6ca6c6d6 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -776,7 +776,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
 }
 
 static const struct file_operations clusterip_proc_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = clusterip_proc_open,
 	.read	 = seq_read,
 	.write	 = clusterip_proc_write,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 9f37c4727861..dc5edc8f7564 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -83,7 +83,6 @@ static int sockstat_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations sockstat_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = sockstat_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -467,7 +466,6 @@ static int snmp_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations snmp_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = snmp_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -515,7 +513,6 @@ static int netstat_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations netstat_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = netstat_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5e570aa9e43b..136544b36a46 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1119,7 +1119,6 @@ static int raw_v4_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations raw_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = raw_v4_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f0ed031f3594..b7d90b48d821 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,6 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations rt_cache_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -331,7 +330,6 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations rt_cpu_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = rt_cpu_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -369,7 +367,6 @@ static int rt_acct_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations rt_acct_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= rt_acct_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5d203248123e..95738aa0d8a6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2358,7 +2358,6 @@ out:
 }
 
 static const struct file_operations tcp_afinfo_seq_fops = {
-	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db72619e07e4..853321555a4e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2714,7 +2714,6 @@ int udp4_seq_show(struct seq_file *seq, void *v)
 }
 
 static const struct file_operations udp_afinfo_seq_fops = {
-	.owner    = THIS_MODULE,
 	.open     = udp_seq_open,
 	.read     = seq_read,
 	.llseek   = seq_lseek,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 59f10fe9782e..f96614e9b9a5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -75,7 +75,6 @@ static struct inet_protosw udplite4_protosw = {
 #ifdef CONFIG_PROC_FS
 
 static const struct file_operations udplite_afinfo_seq_fops = {
-	.owner    = THIS_MODULE,
 	.open     = udp_seq_open,
 	.read     = seq_read,
 	.llseek   = seq_lseek,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2435f7ab070b..ab99cb641b7c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4214,7 +4214,6 @@ static int if6_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations if6_fops = {
-	.owner		= THIS_MODULE,
 	.open		= if6_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 0bbab8a4b5d8..8e085cc05aeb 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -533,7 +533,6 @@ static int ac6_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ac6_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	ac6_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 7f59c8fabeeb..3dab664ff503 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -836,7 +836,6 @@ static int ip6fl_seq_release(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip6fl_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	ip6fl_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 890f9bda06a4..754ef84cf354 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -477,7 +477,6 @@ static int ip6mr_vif_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip6mr_vif_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ip6mr_vif_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -609,7 +608,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip6mr_mfc_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ipmr_mfc_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 844642682b83..40b223a930a3 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2758,7 +2758,6 @@ static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations igmp6_mc_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	igmp6_mc_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
@@ -2913,7 +2912,6 @@ static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations igmp6_mcf_seq_fops = {
-	.owner		=	THIS_MODULE,
 	.open		=	igmp6_mcf_seq_open,
 	.read		=	seq_read,
 	.llseek		=	seq_lseek,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index e88bcb8ff0fd..b67814242f78 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -58,7 +58,6 @@ static int sockstat6_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations sockstat6_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = sockstat6_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -248,7 +247,6 @@ static int snmp6_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations snmp6_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = snmp6_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -274,7 +272,6 @@ static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations snmp6_dev_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = snmp6_dev_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 761a473a07c5..ddda7eb3c623 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1308,7 +1308,6 @@ static int raw6_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations raw6_seq_fops = {
-	.owner =	THIS_MODULE,
 	.open =		raw6_seq_open,
 	.read =		seq_read,
 	.llseek =	seq_lseek,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c37bd9569172..f85da2f1e729 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4648,7 +4648,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 #ifdef CONFIG_PROC_FS
 
 static const struct file_operations ipv6_route_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= ipv6_route_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -4676,7 +4675,6 @@ static int rt6_stats_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations rt6_stats_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = rt6_stats_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c0f7e69f2e6c..a1ab29e2ab3b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1883,7 +1883,6 @@ out:
 }
 
 static const struct file_operations tcp6_afinfo_seq_fops = {
-	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eecf9f0faf29..52e3ea0e6f50 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1479,7 +1479,6 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 }
 
 static const struct file_operations udp6_afinfo_seq_fops = {
-	.owner    = THIS_MODULE,
 	.open     = udp_seq_open,
 	.read     = seq_read,
 	.llseek   = seq_lseek,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 2784cc363f2b..14ae32bb1f3d 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -94,7 +94,6 @@ void udplitev6_exit(void)
 #ifdef CONFIG_PROC_FS
 
 static const struct file_operations udplite6_afinfo_seq_fops = {
-	.owner    = THIS_MODULE,
 	.open     = udp_seq_open,
 	.read     = seq_read,
 	.llseek   = seq_lseek,
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index 38a3d51d9ead..b9232e4e2ed4 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -260,7 +260,6 @@ static int ipx_seq_socket_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ipx_seq_interface_fops = {
-	.owner		= THIS_MODULE,
 	.open           = ipx_seq_interface_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
@@ -268,7 +267,6 @@ static const struct file_operations ipx_seq_interface_fops = {
 };
 
 static const struct file_operations ipx_seq_route_fops = {
-	.owner		= THIS_MODULE,
 	.open           = ipx_seq_route_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
@@ -276,7 +274,6 @@ static const struct file_operations ipx_seq_route_fops = {
 };
 
 static const struct file_operations ipx_seq_socket_fops = {
-	.owner		= THIS_MODULE,
 	.open           = ipx_seq_socket_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index bd5723315069..9d5649e4e8b7 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -247,7 +247,6 @@ static int kcm_seq_show(struct seq_file *seq, void *v)
 }
 
 static const struct file_operations kcm_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= kcm_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -397,7 +396,6 @@ static int kcm_stats_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations kcm_stats_seq_fops = {
-	.owner   = THIS_MODULE,
 	.open    = kcm_stats_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b412fc3351dc..59f246d7b290 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1734,7 +1734,6 @@ static int pppol2tp_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pppol2tp_proc_fops = {
-	.owner		= THIS_MODULE,
 	.open		= pppol2tp_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 29c509c54bb2..66821e8a2b7a 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -225,7 +225,6 @@ static int llc_seq_core_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations llc_seq_socket_fops = {
-	.owner		= THIS_MODULE,
 	.open		= llc_seq_socket_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -233,7 +232,6 @@ static const struct file_operations llc_seq_socket_fops = {
 };
 
 static const struct file_operations llc_seq_core_fops = {
-	.owner		= THIS_MODULE,
 	.open		= llc_seq_core_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 972bfe113043..1e52f6012d5d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2637,7 +2637,6 @@ static int netlink_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations netlink_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= netlink_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 7ed9d4422a73..9ba30c63be3d 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1344,7 +1344,6 @@ static int nr_info_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations nr_info_fops = {
-	.owner = THIS_MODULE,
 	.open = nr_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index 75e6ba970fde..b5a7dcb30991 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -901,7 +901,6 @@ static int nr_node_info_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations nr_nodes_fops = {
-	.owner = THIS_MODULE,
 	.open = nr_node_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -968,7 +967,6 @@ static int nr_neigh_info_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations nr_neigh_fops = {
-	.owner = THIS_MODULE,
 	.open = nr_neigh_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ee7aa0ba3a67..05d31864a34e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4531,7 +4531,6 @@ static int packet_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations packet_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= packet_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 1b050dd17393..fa2f13a8938f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -635,7 +635,6 @@ static int pn_sock_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations pn_sock_seq_fops = {
-	.owner = THIS_MODULE,
 	.open = pn_sock_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -818,7 +817,6 @@ static int pn_res_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations pn_res_seq_fops = {
-	.owner = THIS_MODULE,
 	.open = pn_res_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 6a5c4992cf61..083bd251406f 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1461,7 +1461,6 @@ static int rose_info_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations rose_info_fops = {
-	.owner = THIS_MODULE,
 	.open = rose_info_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 8ca3124df83f..178619ddab68 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -1156,7 +1156,6 @@ static int rose_nodes_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations rose_nodes_fops = {
-	.owner = THIS_MODULE,
 	.open = rose_nodes_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -1240,7 +1239,6 @@ static int rose_neigh_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations rose_neigh_fops = {
-	.owner = THIS_MODULE,
 	.open = rose_neigh_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -1326,7 +1324,6 @@ static int rose_route_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations rose_routes_fops = {
-	.owner = THIS_MODULE,
 	.open = rose_route_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 7421656963a9..f79f260c6ddc 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -125,7 +125,6 @@ static int rxrpc_call_seq_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations rxrpc_call_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= rxrpc_call_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -217,7 +216,6 @@ static int rxrpc_connection_seq_open(struct inode *inode, struct file *file)
 }
 
 const struct file_operations rxrpc_connection_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= rxrpc_connection_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 8a04c36e579f..0038a1c44ee9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2046,7 +2046,6 @@ static int psched_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations psched_fops = {
-	.owner = THIS_MODULE,
 	.open = psched_open,
 	.read  = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 4545bc2aff84..537545ebcb0e 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -95,7 +95,6 @@ static int sctp_snmp_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations sctp_snmp_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = sctp_snmp_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a9ee634f3c42..90a3784e3084 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2869,7 +2869,6 @@ static int unix_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations unix_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= unix_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c
index e98a01c1034f..5511f989ef47 100644
--- a/net/wireless/wext-proc.c
+++ b/net/wireless/wext-proc.c
@@ -133,7 +133,6 @@ static int seq_open_wireless(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations wireless_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = seq_open_wireless,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index ba2b539879bc..6d5f85f4e672 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -71,7 +71,6 @@ static int xfrm_statistics_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations xfrm_statistics_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = xfrm_statistics_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-- 
cgit v1.2.3


From 416ef9b15c688b91edbf654ebe7bc349c9151147 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun, 14 Jan 2018 20:01:26 -0800
Subject: net: sched: red: don't reset the backlog on every stat dump

Commit 0dfb33a0d7e2 ("sch_red: report backlog information") copied
child's backlog into RED's backlog.  Back then RED did not maintain
its own backlog counts.  This has changed after commit 2ccccf5fb43f
("net_sched: update hierarchical backlog too") and commit d7f4f332f082
("sch_red: update backlog as well").  Copying is no longer necessary.

Tested:

$ tc -s qdisc show dev veth0
qdisc red 1: root refcnt 2 limit 400000b min 30000b max 30000b ecn
 Sent 20942 bytes 221 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 1260b 14p requeues 14
  marked 0 early 0 pdrop 0 other 0
qdisc tbf 2: parent 1: rate 1Kbit burst 15000b lat 3585.0s
 Sent 20942 bytes 221 pkt (dropped 0, overlimits 138 requeues 0)
 backlog 1260b 14p requeues 14

Recently RED offload was added.  We need to make sure drivers don't
depend on resetting the stats.  This means backlog should be treated
like any other statistic:

  total_stat = new_hw_stat - prev_hw_stat;

Adjust mlxsw.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Nogah Frankel <nogahf@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_qdisc.c   | 26 +++++++++++++++++++---
 include/net/pkt_cls.h                              |  1 +
 net/sched/sch_red.c                                |  2 +-
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
index fee9da81bce6..0b7670459051 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
@@ -247,6 +247,8 @@ mlxsw_sp_setup_tc_qdisc_red_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	stats_base->overlimits = red_base->prob_drop + red_base->prob_mark;
 	stats_base->drops = red_base->prob_drop + red_base->pdrop;
+
+	stats_base->backlog = 0;
 }
 
 static int
@@ -306,6 +308,19 @@ mlxsw_sp_qdisc_red_replace(struct mlxsw_sp_port *mlxsw_sp_port,
 						 max, prob, p->is_ecn);
 }
 
+static void
+mlxsw_sp_qdisc_red_unoffload(struct mlxsw_sp_port *mlxsw_sp_port,
+			     struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			     void *params)
+{
+	struct tc_red_qopt_offload_params *p = params;
+	u64 backlog;
+
+	backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+				       mlxsw_sp_qdisc->stats_base.backlog);
+	p->qstats->backlog -= backlog;
+}
+
 static int
 mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port,
 			      struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
@@ -338,7 +353,7 @@ mlxsw_sp_qdisc_get_red_stats(struct mlxsw_sp_port *mlxsw_sp_port,
 			     struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
 			     struct tc_qopt_offload_stats *stats_ptr)
 {
-	u64 tx_bytes, tx_packets, overlimits, drops;
+	u64 tx_bytes, tx_packets, overlimits, drops, backlog;
 	u8 tclass_num = mlxsw_sp_qdisc->tclass_num;
 	struct mlxsw_sp_qdisc_stats *stats_base;
 	struct mlxsw_sp_port_xstats *xstats;
@@ -354,14 +369,18 @@ mlxsw_sp_qdisc_get_red_stats(struct mlxsw_sp_port *mlxsw_sp_port,
 		     stats_base->overlimits;
 	drops = xstats->wred_drop[tclass_num] + xstats->tail_drop[tclass_num] -
 		stats_base->drops;
+	backlog = xstats->backlog[tclass_num];
 
 	_bstats_update(stats_ptr->bstats, tx_bytes, tx_packets);
 	stats_ptr->qstats->overlimits += overlimits;
 	stats_ptr->qstats->drops += drops;
 	stats_ptr->qstats->backlog +=
-			mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
-					     xstats->backlog[tclass_num]);
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     backlog) -
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     stats_base->backlog);
 
+	stats_base->backlog = backlog;
 	stats_base->drops +=  drops;
 	stats_base->overlimits += overlimits;
 	stats_base->tx_bytes += tx_bytes;
@@ -375,6 +394,7 @@ static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_red = {
 	.type = MLXSW_SP_QDISC_RED,
 	.check_params = mlxsw_sp_qdisc_red_check_params,
 	.replace = mlxsw_sp_qdisc_red_replace,
+	.unoffload = mlxsw_sp_qdisc_red_unoffload,
 	.destroy = mlxsw_sp_qdisc_red_destroy,
 	.get_stats = mlxsw_sp_qdisc_get_red_stats,
 	.get_xstats = mlxsw_sp_qdisc_get_red_xstats,
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 9c341f003091..cc23c041a6d7 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -748,6 +748,7 @@ struct tc_red_qopt_offload_params {
 	u32 max;
 	u32 probability;
 	bool is_ecn;
+	struct gnet_stats_queue *qstats;
 };
 
 struct tc_red_qopt_offload {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 0af1c1254e0b..16644b3d2362 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -167,6 +167,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.set.max = q->parms.qth_max >> q->parms.Wlog;
 		opt.set.probability = q->parms.max_P;
 		opt.set.is_ecn = red_use_ecn(q);
+		opt.set.qstats = &sch->qstats;
 	} else {
 		opt.command = TC_RED_DESTROY;
 	}
@@ -322,7 +323,6 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	};
 	int err;
 
-	sch->qstats.backlog = q->qdisc->qstats.backlog;
 	err = red_dump_offload_stats(sch, &opt);
 	if (err)
 		goto nla_put_failure;
-- 
cgit v1.2.3


From a9b19443edbaac97c5c094f3cc903c1f1548b3f5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:45 +0100
Subject: net: sched: introduce support for multiple filter chain pointers
 registration

So far, it was only possible to register a single filter chain
pointer to block->chain[0]. However, once blocks become shareable,
we need to allow registration of multiple filter chain pointers.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  3 +-
 net/sched/cls_api.c       | 77 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 70 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bd9125b0481f..17d8cfd0efda 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -275,8 +275,7 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
 
 struct tcf_chain {
 	struct tcf_proto __rcu *filter_chain;
-	tcf_chain_head_change_t *chain_head_change;
-	void *chain_head_change_priv;
+	struct list_head filter_chain_list;
 	struct list_head list;
 	struct tcf_block *block;
 	u32 index; /* chain index */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 6708b6953bfa..e6b16b300844 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -179,6 +179,12 @@ static void tcf_proto_destroy(struct tcf_proto *tp)
 	kfree_rcu(tp, rcu);
 }
 
+struct tcf_filter_chain_list_item {
+	struct list_head list;
+	tcf_chain_head_change_t *chain_head_change;
+	void *chain_head_change_priv;
+};
+
 static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 					  u32 chain_index)
 {
@@ -187,6 +193,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 	chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 	if (!chain)
 		return NULL;
+	INIT_LIST_HEAD(&chain->filter_chain_list);
 	list_add_tail(&chain->list, &block->chain_list);
 	chain->block = block;
 	chain->index = chain_index;
@@ -194,12 +201,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 	return chain;
 }
 
+static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
+				       struct tcf_proto *tp_head)
+{
+	if (item->chain_head_change)
+		item->chain_head_change(tp_head, item->chain_head_change_priv);
+}
 static void tcf_chain_head_change(struct tcf_chain *chain,
 				  struct tcf_proto *tp_head)
 {
-	if (chain->chain_head_change)
-		chain->chain_head_change(tp_head,
-					 chain->chain_head_change_priv);
+	struct tcf_filter_chain_list_item *item;
+
+	list_for_each_entry(item, &chain->filter_chain_list, list)
+		tcf_chain_head_change_item(item, tp_head);
 }
 
 static void tcf_chain_flush(struct tcf_chain *chain)
@@ -280,6 +294,50 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND);
 }
 
+static int
+tcf_chain_head_change_cb_add(struct tcf_chain *chain,
+			     struct tcf_block_ext_info *ei,
+			     struct netlink_ext_ack *extack)
+{
+	struct tcf_filter_chain_list_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed");
+		return -ENOMEM;
+	}
+	item->chain_head_change = ei->chain_head_change;
+	item->chain_head_change_priv = ei->chain_head_change_priv;
+	if (chain->filter_chain)
+		tcf_chain_head_change_item(item, chain->filter_chain);
+	list_add(&item->list, &chain->filter_chain_list);
+	return 0;
+}
+
+static void
+tcf_chain_head_change_cb_del(struct tcf_chain *chain,
+			     struct tcf_block_ext_info *ei)
+{
+	struct tcf_filter_chain_list_item *item;
+
+	list_for_each_entry(item, &chain->filter_chain_list, list) {
+		if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
+		    (item->chain_head_change == ei->chain_head_change &&
+		     item->chain_head_change_priv == ei->chain_head_change_priv)) {
+			tcf_chain_head_change_item(item, NULL);
+			list_del(&item->list);
+			kfree(item);
+			return;
+		}
+	}
+	WARN_ON(1);
+}
+
+static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
+{
+	return list_first_entry(&block->chain_list, struct tcf_chain, list);
+}
+
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		      struct tcf_block_ext_info *ei,
 		      struct netlink_ext_ack *extack)
@@ -302,9 +360,10 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		err = -ENOMEM;
 		goto err_chain_create;
 	}
-	WARN_ON(!ei->chain_head_change);
-	chain->chain_head_change = ei->chain_head_change;
-	chain->chain_head_change_priv = ei->chain_head_change_priv;
+	err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
+					   ei, extack);
+	if (err)
+		goto err_chain_head_change_cb_add;
 	block->net = qdisc_net(q);
 	block->q = q;
 	tcf_block_offload_bind(block, q, ei);
@@ -313,6 +372,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 
 err_chain_create:
 	kfree(block);
+err_chain_head_change_cb_add:
+	kfree(chain);
 	return err;
 }
 EXPORT_SYMBOL(tcf_block_get_ext);
@@ -351,6 +412,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 	 */
 	if (!block)
 		return;
+	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
 	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_hold(chain);
 
@@ -364,8 +426,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		tcf_chain_put(chain);
 
 	/* Finally, put chain 0 and allow block to be freed. */
-	chain = list_first_entry(&block->chain_list, struct tcf_chain, list);
-	tcf_chain_put(chain);
+	tcf_chain_put(tcf_block_chain_zero(block));
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
-- 
cgit v1.2.3


From 4861738775d70e0165d04fe014f32b41bcb5414a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:46 +0100
Subject: net: sched: introduce shared filter blocks infrastructure

Allow qdiscs to share filter blocks among themselves. Each qdisc type has
to use the extended block get/put variants, which enable sharing.
Shared blocks are tracked within each net namespace and identified
by a u32 index. This index is passed by the user during qdisc creation.
If the user passes an index that is not used by any other qdisc, a new
block is created. If the user passes an index that is already in use,
the existing block is re-used.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |   7 ++
 include/net/sch_generic.h |   2 +
 net/sched/cls_api.c       | 167 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 152 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index cc23c041a6d7..c40d60e6a883 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -29,6 +29,7 @@ struct tcf_block_ext_info {
 	enum tcf_block_binder_type binder_type;
 	tcf_chain_head_change_t *chain_head_change;
 	void *chain_head_change_priv;
+	u32 block_index;
 };
 
 struct tcf_block_cb;
@@ -48,8 +49,14 @@ void tcf_block_put(struct tcf_block *block);
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei);
 
+static inline bool tcf_block_shared(struct tcf_block *block)
+{
+	return block->index;
+}
+
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
+	WARN_ON(tcf_block_shared(block));
 	return block->q;
 }
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 17d8cfd0efda..cc0c1e4711dc 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -284,6 +284,8 @@ struct tcf_chain {
 
 struct tcf_block {
 	struct list_head chain_list;
+	u32 index; /* block index for shared blocks */
+	unsigned int refcnt;
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e6b16b300844..ee319b1598b5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/idr.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -333,22 +334,44 @@ tcf_chain_head_change_cb_del(struct tcf_chain *chain,
 	WARN_ON(1);
 }
 
-static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
+struct tcf_net {
+	struct idr idr;
+};
+
+static unsigned int tcf_net_id;
+
+static int tcf_block_insert(struct tcf_block *block, struct net *net,
+			    u32 block_index, struct netlink_ext_ack *extack)
 {
-	return list_first_entry(&block->chain_list, struct tcf_chain, list);
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+	int err;
+
+	err = idr_alloc_ext(&tn->idr, block, NULL, block_index,
+			    block_index + 1, GFP_KERNEL);
+	if (err)
+		return err;
+	block->index = block_index;
+	return 0;
 }
 
-int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
-		      struct tcf_block_ext_info *ei,
-		      struct netlink_ext_ack *extack)
+static void tcf_block_remove(struct tcf_block *block, struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_remove_ext(&tn->idr, block->index);
+}
+
+static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
+					  struct netlink_ext_ack *extack)
 {
-	struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL);
+	struct tcf_block *block;
 	struct tcf_chain *chain;
 	int err;
 
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
 	if (!block) {
 		NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
@@ -360,20 +383,76 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		err = -ENOMEM;
 		goto err_chain_create;
 	}
+	block->net = qdisc_net(q);
+	block->refcnt = 1;
+	block->net = net;
+	block->q = q;
+	return block;
+
+err_chain_create:
+	kfree(block);
+	return ERR_PTR(err);
+}
+
+static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	return idr_find_ext(&tn->idr, block_index);
+}
+
+static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
+{
+	return list_first_entry(&block->chain_list, struct tcf_chain, list);
+}
+
+int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
+		      struct tcf_block_ext_info *ei,
+		      struct netlink_ext_ack *extack)
+{
+	struct net *net = qdisc_net(q);
+	struct tcf_block *block = NULL;
+	bool created = false;
+	int err;
+
+	if (ei->block_index) {
+		/* block_index not 0 means the shared block is requested */
+		block = tcf_block_lookup(net, ei->block_index);
+		if (block)
+			block->refcnt++;
+	}
+
+	if (!block) {
+		block = tcf_block_create(net, q, extack);
+		if (IS_ERR(block))
+			return PTR_ERR(block);
+		created = true;
+		if (ei->block_index) {
+			err = tcf_block_insert(block, net,
+					       ei->block_index, extack);
+			if (err)
+				goto err_block_insert;
+		}
+	}
+
 	err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
 					   ei, extack);
 	if (err)
 		goto err_chain_head_change_cb_add;
-	block->net = qdisc_net(q);
-	block->q = q;
 	tcf_block_offload_bind(block, q, ei);
 	*p_block = block;
 	return 0;
 
-err_chain_create:
-	kfree(block);
 err_chain_head_change_cb_add:
-	kfree(chain);
+	if (created) {
+		if (tcf_block_shared(block))
+			tcf_block_remove(block, net);
+err_block_insert:
+		kfree(tcf_block_chain_zero(block));
+		kfree(block);
+	} else {
+		block->refcnt--;
+	}
 	return err;
 }
 EXPORT_SYMBOL(tcf_block_get_ext);
@@ -407,26 +486,34 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 {
 	struct tcf_chain *chain, *tmp;
 
-	/* Hold a refcnt for all chains, so that they don't disappear
-	 * while we are iterating.
-	 */
 	if (!block)
 		return;
 	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
-	list_for_each_entry(chain, &block->chain_list, list)
-		tcf_chain_hold(chain);
 
-	list_for_each_entry(chain, &block->chain_list, list)
-		tcf_chain_flush(chain);
+	if (--block->refcnt == 0) {
+		if (tcf_block_shared(block))
+			tcf_block_remove(block, block->net);
+
+		/* Hold a refcnt for all chains, so that they don't disappear
+		 * while we are iterating.
+		 */
+		list_for_each_entry(chain, &block->chain_list, list)
+			tcf_chain_hold(chain);
+
+		list_for_each_entry(chain, &block->chain_list, list)
+			tcf_chain_flush(chain);
+	}
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	/* At this point, all the chains should have refcnt >= 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
+	if (block->refcnt == 0) {
+		/* At this point, all the chains should have refcnt >= 1. */
+		list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+			tcf_chain_put(chain);
 
-	/* Finally, put chain 0 and allow block to be freed. */
-	tcf_chain_put(tcf_block_chain_zero(block));
+		/* Finally, put chain 0 and allow block to be freed. */
+		tcf_chain_put(tcf_block_chain_zero(block));
+	}
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
@@ -1313,18 +1400,50 @@ int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
+static __net_init int tcf_net_init(struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_init(&tn->idr);
+	return 0;
+}
+
+static void __net_exit tcf_net_exit(struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_destroy(&tn->idr);
+}
+
+static struct pernet_operations tcf_net_ops = {
+	.init = tcf_net_init,
+	.exit = tcf_net_exit,
+	.id   = &tcf_net_id,
+	.size = sizeof(struct tcf_net),
+};
+
 static int __init tc_filter_init(void)
 {
+	int err;
+
 	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
 	if (!tc_filter_wq)
 		return -ENOMEM;
 
+	err = register_pernet_subsys(&tcf_net_ops);
+	if (err)
+		goto err_register_pernet_subsys;
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
 		      tc_dump_tfilter, 0);
 
 	return 0;
+
+err_register_pernet_subsys:
+	destroy_workqueue(tc_filter_wq);
+	return err;
 }
 
 subsys_initcall(tc_filter_init);
-- 
cgit v1.2.3


From 9d3aaff3d8523264ff7082a90759cb8a340200be Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:47 +0100
Subject: net: sched: avoid usage of tp->q in tcf_classify

Use block index in the messages instead.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ee319b1598b5..700595abc641 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -672,8 +672,9 @@ reclassify:
 #ifdef CONFIG_NET_CLS_ACT
 reset:
 	if (unlikely(limit++ >= max_reclassify_loop)) {
-		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
-				       tp->q->ops->id, tp->prio & 0xffff,
+		net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n",
+				       tp->chain->block->index,
+				       tp->prio & 0xffff,
 				       ntohs(tp->protocol));
 		return TC_ACT_SHOT;
 	}
-- 
cgit v1.2.3


From f36fe1c498c8959812415c57b683abaa4527dec5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:48 +0100
Subject: net: sched: introduce block mechanism to handle netif_keep_dst calls

A couple of classifiers call netif_keep_dst directly on q->dev. That is
not possible for shared blocks, where multiple qdiscs own the block.
So introduce infrastructure to keep track of the block owners in a
list, and use this list to implement a block variant of
netif_keep_dst.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |  1 +
 include/net/sch_generic.h |  2 ++
 net/sched/cls_api.c       | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 net/sched/cls_bpf.c       |  4 +--
 net/sched/cls_flow.c      |  2 +-
 net/sched/cls_route.c     |  2 +-
 6 files changed, 76 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c40d60e6a883..789d818c4a61 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -39,6 +39,7 @@ bool tcf_queue_work(struct work_struct *work);
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 				bool create);
 void tcf_chain_put(struct tcf_chain *chain);
+void tcf_block_netif_keep_dst(struct tcf_block *block);
 int tcf_block_get(struct tcf_block **p_block,
 		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
 		  struct netlink_ext_ack *extack);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cc0c1e4711dc..f655e66ce742 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -289,6 +289,8 @@ struct tcf_block {
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
+	struct list_head owner_list;
+	bool keep_dst;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 700595abc641..1ca84230f4de 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -375,6 +375,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
 	}
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
+	INIT_LIST_HEAD(&block->owner_list);
 
 	/* Create chain 0 by default, it has to be always present. */
 	chain = tcf_chain_create(block, 0);
@@ -406,6 +407,65 @@ static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
 	return list_first_entry(&block->chain_list, struct tcf_chain, list);
 }
 
+struct tcf_block_owner_item {
+	struct list_head list;
+	struct Qdisc *q;
+	enum tcf_block_binder_type binder_type;
+};
+
+static void
+tcf_block_owner_netif_keep_dst(struct tcf_block *block,
+			       struct Qdisc *q,
+			       enum tcf_block_binder_type binder_type)
+{
+	if (block->keep_dst &&
+	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+		netif_keep_dst(qdisc_dev(q));
+}
+
+void tcf_block_netif_keep_dst(struct tcf_block *block)
+{
+	struct tcf_block_owner_item *item;
+
+	block->keep_dst = true;
+	list_for_each_entry(item, &block->owner_list, list)
+		tcf_block_owner_netif_keep_dst(block, item->q,
+					       item->binder_type);
+}
+EXPORT_SYMBOL(tcf_block_netif_keep_dst);
+
+static int tcf_block_owner_add(struct tcf_block *block,
+			       struct Qdisc *q,
+			       enum tcf_block_binder_type binder_type)
+{
+	struct tcf_block_owner_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+	item->q = q;
+	item->binder_type = binder_type;
+	list_add(&item->list, &block->owner_list);
+	return 0;
+}
+
+static void tcf_block_owner_del(struct tcf_block *block,
+				struct Qdisc *q,
+				enum tcf_block_binder_type binder_type)
+{
+	struct tcf_block_owner_item *item;
+
+	list_for_each_entry(item, &block->owner_list, list) {
+		if (item->q == q && item->binder_type == binder_type) {
+			list_del(&item->list);
+			kfree(item);
+			return;
+		}
+	}
+	WARN_ON(1);
+}
+
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		      struct tcf_block_ext_info *ei,
 		      struct netlink_ext_ack *extack)
@@ -435,6 +495,12 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		}
 	}
 
+	err = tcf_block_owner_add(block, q, ei->binder_type);
+	if (err)
+		goto err_block_owner_add;
+
+	tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
+
 	err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
 					   ei, extack);
 	if (err)
@@ -444,6 +510,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 	return 0;
 
 err_chain_head_change_cb_add:
+	tcf_block_owner_del(block, q, ei->binder_type);
+err_block_owner_add:
 	if (created) {
 		if (tcf_block_shared(block))
 			tcf_block_remove(block, net);
@@ -489,6 +557,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 	if (!block)
 		return;
 	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
+	tcf_block_owner_del(block, q, ei->binder_type);
 
 	if (--block->refcnt == 0) {
 		if (tcf_block_shared(block))
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 8d78e7f4ecc3..d79cc5086509 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -392,8 +392,8 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 	prog->bpf_name = name;
 	prog->filter = fp;
 
-	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
-		netif_keep_dst(qdisc_dev(tp->q));
+	if (fp->dst_needed)
+		tcf_block_netif_keep_dst(tp->chain->block);
 
 	return 0;
 }
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 25c2a888e1f0..28cd6fb52c16 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -526,7 +526,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 
 	timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE);
 
-	netif_keep_dst(qdisc_dev(tp->q));
+	tcf_block_netif_keep_dst(tp->chain->block);
 
 	if (tb[TCA_FLOW_KEYS]) {
 		fnew->keymask = keymask;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index ac9a5b8825b9..a1f2b1b7c014 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -527,7 +527,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 		if (f->handle < f1->handle)
 			break;
 
-	netif_keep_dst(qdisc_dev(tp->q));
+	tcf_block_netif_keep_dst(tp->chain->block);
 	rcu_assign_pointer(f->next, f1);
 	rcu_assign_pointer(*fp, f);
 
-- 
cgit v1.2.3


From edf6711c9840fd92e0047f98c411c94114168f19 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:49 +0100
Subject: net: sched: remove classid and q fields from tcf_proto

Both are no longer used, so remove them.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 2 --
 net/sched/cls_api.c       | 7 ++-----
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f655e66ce742..54b9a1ca26bd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -255,8 +255,6 @@ struct tcf_proto {
 
 	/* All the rest */
 	u32			prio;
-	u32			classid;
-	struct Qdisc		*q;
 	void			*data;
 	const struct tcf_proto_ops	*ops;
 	struct tcf_chain	*chain;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1ca84230f4de..e36d90c20f04 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -122,8 +122,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 }
 
 static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
-					  u32 prio, u32 parent, struct Qdisc *q,
-					  struct tcf_chain *chain)
+					  u32 prio, struct tcf_chain *chain)
 {
 	struct tcf_proto *tp;
 	int err;
@@ -157,8 +156,6 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
 	tp->classify = tp->ops->classify;
 	tp->protocol = protocol;
 	tp->prio = prio;
-	tp->classid = parent;
-	tp->q = q;
 	tp->chain = chain;
 
 	err = tp->ops->init(tp);
@@ -1069,7 +1066,7 @@ replay:
 			prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
 
 		tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
-				      protocol, prio, parent, q, chain);
+				      protocol, prio, chain);
 		if (IS_ERR(tp)) {
 			err = PTR_ERR(tp);
 			goto errout;
-- 
cgit v1.2.3


From caa7260156eb3a1496348a2c69fa68e85183d5d7 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:50 +0100
Subject: net: sched: keep track of offloaded filters and check tc offload
 feature

During block bind, we need to check the tc offload feature. If it is
disabled but the block still contains offloaded filters, forbid the
bind. Also forbid registering a callback for a block that already
contains offloaded filters, as playback is not supported yet.
To keep track of offloaded filters, a new counter is introduced,
along with a couple of helpers called from cls_* code.
These helpers set and clear the TCA_CLS_FLAGS_IN_HW flag.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 18 +++++++++++
 net/sched/cls_api.c       | 80 ++++++++++++++++++++++++++++++++++++++---------
 net/sched/cls_bpf.c       |  5 ++-
 net/sched/cls_flower.c    |  3 +-
 net/sched/cls_matchall.c  |  3 +-
 net/sched/cls_u32.c       | 13 ++++----
 6 files changed, 99 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 54b9a1ca26bd..bf5cc0a1d0f6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -289,8 +289,26 @@ struct tcf_block {
 	struct list_head cb_list;
 	struct list_head owner_list;
 	bool keep_dst;
+	unsigned int offloadcnt; /* Number of offloaded filters */
+	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
 };
 
+static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
+{
+	if (*flags & TCA_CLS_FLAGS_IN_HW)
+		return;
+	*flags |= TCA_CLS_FLAGS_IN_HW;
+	block->offloadcnt++;
+}
+
+static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+		return;
+	*flags &= ~TCA_CLS_FLAGS_IN_HW;
+	block->offloadcnt--;
+}
+
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 {
 	struct qdisc_skb_cb *qcb;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e36d90c20f04..03e2fa092d9e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -265,31 +265,66 @@ void tcf_chain_put(struct tcf_chain *chain)
 }
 EXPORT_SYMBOL(tcf_chain_put);
 
-static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q,
-				  struct tcf_block_ext_info *ei,
-				  enum tc_block_command command)
+static bool tcf_block_offload_in_use(struct tcf_block *block)
+{
+	return block->offloadcnt;
+}
+
+static int tcf_block_offload_cmd(struct tcf_block *block,
+				 struct net_device *dev,
+				 struct tcf_block_ext_info *ei,
+				 enum tc_block_command command)
 {
-	struct net_device *dev = q->dev_queue->dev;
 	struct tc_block_offload bo = {};
 
-	if (!dev->netdev_ops->ndo_setup_tc)
-		return;
 	bo.command = command;
 	bo.binder_type = ei->binder_type;
 	bo.block = block;
-	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
 }
 
-static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
-				   struct tcf_block_ext_info *ei)
+static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
+				  struct tcf_block_ext_info *ei)
 {
-	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND);
+	struct net_device *dev = q->dev_queue->dev;
+	int err;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		goto no_offload_dev_inc;
+
+	/* If tc offload feature is disabled and the block we try to bind
+	 * to already has some offloaded filters, forbid to bind.
+	 */
+	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
+		return -EOPNOTSUPP;
+
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
+	if (err == -EOPNOTSUPP)
+		goto no_offload_dev_inc;
+	return err;
+
+no_offload_dev_inc:
+	if (tcf_block_offload_in_use(block))
+		return -EOPNOTSUPP;
+	block->nooffloaddevcnt++;
+	return 0;
 }
 
 static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 				     struct tcf_block_ext_info *ei)
 {
-	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND);
+	struct net_device *dev = q->dev_queue->dev;
+	int err;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		goto no_offload_dev_dec;
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
+	if (err == -EOPNOTSUPP)
+		goto no_offload_dev_dec;
+	return;
+
+no_offload_dev_dec:
+	WARN_ON(block->nooffloaddevcnt-- == 0);
 }
 
 static int
@@ -502,10 +537,16 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 					   ei, extack);
 	if (err)
 		goto err_chain_head_change_cb_add;
-	tcf_block_offload_bind(block, q, ei);
+
+	err = tcf_block_offload_bind(block, q, ei);
+	if (err)
+		goto err_block_offload_bind;
+
 	*p_block = block;
 	return 0;
 
+err_block_offload_bind:
+	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
 err_chain_head_change_cb_add:
 	tcf_block_owner_del(block, q, ei->binder_type);
 err_block_owner_add:
@@ -637,9 +678,16 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 {
 	struct tcf_block_cb *block_cb;
 
+	/* At this point, playback of previous block cb calls is not supported,
+	 * so forbid to register to block which already has some offloaded
+	 * filters present.
+	 */
+	if (tcf_block_offload_in_use(block))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
 	if (!block_cb)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	block_cb->cb = cb;
 	block_cb->cb_ident = cb_ident;
 	block_cb->cb_priv = cb_priv;
@@ -655,7 +703,7 @@ int tcf_block_cb_register(struct tcf_block *block,
 	struct tcf_block_cb *block_cb;
 
 	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
-	return block_cb ? 0 : -ENOMEM;
+	return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
 }
 EXPORT_SYMBOL(tcf_block_cb_register);
 
@@ -685,6 +733,10 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
 	int ok_count = 0;
 	int err;
 
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop)
+		return -EOPNOTSUPP;
+
 	list_for_each_entry(block_cb, &block->cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index d79cc5086509..cf72aefcf98d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -167,13 +167,16 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	cls_bpf.exts_integrated = obj->exts_integrated;
 	cls_bpf.gen_flags = obj->gen_flags;
 
+	if (oldprog)
+		tcf_block_offload_dec(block, &oldprog->gen_flags);
+
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
 	if (prog) {
 		if (err < 0) {
 			cls_bpf_offload_cmd(tp, oldprog, prog);
 			return err;
 		} else if (err > 0) {
-			prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
+			tcf_block_offload_inc(block, &prog->gen_flags);
 		}
 	}
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6132a7317efa..f61df19b1026 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -229,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 
 	tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
 			 &cls_flower, false);
+	tcf_block_offload_dec(block, &f->flags);
 }
 
 static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -256,7 +257,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 		fl_hw_destroy_filter(tp, f);
 		return err;
 	} else if (err > 0) {
-		f->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &f->flags);
 	}
 
 	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 66d4e0099158..d0e57c86636f 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -81,6 +81,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	cls_mall.cookie = cookie;
 
 	tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+	tcf_block_offload_dec(block, &head->flags);
 }
 
 static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -103,7 +104,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 		mall_destroy_hw_filter(tp, head, cookie);
 		return err;
 	} else if (err > 0) {
-		head->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &head->flags);
 	}
 
 	if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 507859cdd1cb..020d328d0afd 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -529,16 +529,17 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	return 0;
 }
 
-static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
+static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
 	tc_cls_common_offload_init(&cls_u32.common, tp);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
-	cls_u32.knode.handle = handle;
+	cls_u32.knode.handle = n->handle;
 
 	tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+	tcf_block_offload_dec(block, &n->flags);
 }
 
 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -567,10 +568,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
-		u32_remove_hw_knode(tp, n->handle);
+		u32_remove_hw_knode(tp, n);
 		return err;
 	} else if (err > 0) {
-		n->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &n->flags);
 	}
 
 	if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -589,7 +590,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
-			u32_remove_hw_knode(tp, n->handle);
+			u32_remove_hw_knode(tp, n);
 			idr_remove_ext(&ht->handle_idr, n->handle);
 			if (tcf_exts_get_net(&n->exts))
 				call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
@@ -682,7 +683,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
 		goto out;
 
 	if (TC_U32_KEY(ht->handle)) {
-		u32_remove_hw_knode(tp, ht->handle);
+		u32_remove_hw_knode(tp, (struct tc_u_knode *)ht);
 		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
 		goto out;
 	}
-- 
cgit v1.2.3


From 7960d1daf278cbe23bb48974fe6ae6a1e44c3c15 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:51 +0100
Subject: net: sched: use block index as a handle instead of qdisc when block
 is shared

As a tcm_ifindex value of TCM_IFINDEX_MAGIC_BLOCK is not a valid ifindex,
use it to indicate that we work with a block instead of a qdisc.
So if tcm_ifindex is set to TCM_IFINDEX_MAGIC_BLOCK, tcm_parent is used
to carry the block_index.

If the block is set to be shared between at least 2 qdiscs, it is
forbidden to use the qdisc handle to add/delete filters. In that case,
userspace has to pass block_index.

Also, when dumping filters, if the block is shared between at least
2 qdiscs, each filter is dumped with tcm_ifindex set to
TCM_IFINDEX_MAGIC_BLOCK and tcm_parent set to block_index. That gives
the user a clear indication that the filter belongs to a shared block
and not only to the one qdisc under which it is dumped.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h |  10 ++
 net/sched/cls_api.c            | 202 ++++++++++++++++++++++++-----------------
 2 files changed, 128 insertions(+), 84 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 843e29aa3cac..da878f2e7c39 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -541,9 +541,19 @@ struct tcmsg {
 	int		tcm_ifindex;
 	__u32		tcm_handle;
 	__u32		tcm_parent;
+/* tcm_block_index is used instead of tcm_parent
+ * in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK
+ */
+#define tcm_block_index tcm_parent
 	__u32		tcm_info;
 };
 
+/* For manipulation of filters in shared block, tcm_ifindex is set to
+ * TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index
+ * which is the block index.
+ */
+#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
+
 enum {
 	TCA_UNSPEC,
 	TCA_KIND,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 03e2fa092d9e..e500d11da9cd 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -865,8 +865,9 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
 }
 
 static int tcf_fill_node(struct net *net, struct sk_buff *skb,
-			 struct tcf_proto *tp, struct Qdisc *q, u32 parent,
-			 void *fh, u32 portid, u32 seq, u16 flags, int event)
+			 struct tcf_proto *tp, struct tcf_block *block,
+			 struct Qdisc *q, u32 parent, void *fh,
+			 u32 portid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -879,8 +880,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
-	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
-	tcm->tcm_parent = parent;
+	if (q) {
+		tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+		tcm->tcm_parent = parent;
+	} else {
+		tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+		tcm->tcm_block_index = block->index;
+	}
 	tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 	if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 		goto nla_put_failure;
@@ -903,8 +909,8 @@ nla_put_failure:
 
 static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 			  struct nlmsghdr *n, struct tcf_proto *tp,
-			  struct Qdisc *q, u32 parent,
-			  void *fh, int event, bool unicast)
+			  struct tcf_block *block, struct Qdisc *q,
+			  u32 parent, void *fh, int event, bool unicast)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -913,8 +919,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq,
-			  n->nlmsg_flags, event) <= 0) {
+	if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+			  n->nlmsg_seq, n->nlmsg_flags, event) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -928,8 +934,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 
 static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 			      struct nlmsghdr *n, struct tcf_proto *tp,
-			      struct Qdisc *q, u32 parent,
-			      void *fh, bool unicast, bool *last)
+			      struct tcf_block *block, struct Qdisc *q,
+			      u32 parent, void *fh, bool unicast, bool *last)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -939,8 +945,8 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq,
-			  n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+	if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+			  n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -959,15 +965,16 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 }
 
 static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
-				 struct Qdisc *q, u32 parent,
-				 struct nlmsghdr *n,
+				 struct tcf_block *block, struct Qdisc *q,
+				 u32 parent, struct nlmsghdr *n,
 				 struct tcf_chain *chain, int event)
 {
 	struct tcf_proto *tp;
 
 	for (tp = rtnl_dereference(chain->filter_chain);
 	     tp; tp = rtnl_dereference(tp->next))
-		tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false);
+		tfilter_notify(net, oskb, n, tp, block,
+			       q, parent, 0, event, false);
 }
 
 /* Add/change/delete/get a filter node */
@@ -983,13 +990,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 	bool prio_allocate;
 	u32 parent;
 	u32 chain_index;
-	struct net_device *dev;
-	struct Qdisc  *q;
+	struct Qdisc *q = NULL;
 	struct tcf_chain_info chain_info;
 	struct tcf_chain *chain = NULL;
 	struct tcf_block *block;
 	struct tcf_proto *tp;
-	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	void *fh;
 	int err;
@@ -1036,41 +1041,58 @@ replay:
 
 	/* Find head of filter chain. */
 
-	/* Find link */
-	dev = __dev_get_by_index(net, t->tcm_ifindex);
-	if (dev == NULL)
-		return -ENODEV;
-
-	/* Find qdisc */
-	if (!parent) {
-		q = dev->qdisc;
-		parent = q->handle;
+	if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, t->tcm_block_index);
+		if (!block) {
+			NL_SET_ERR_MSG(extack, "Block of given index was not found");
+			err = -EINVAL;
+			goto errout;
+		}
 	} else {
-		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
-		if (q == NULL)
-			return -EINVAL;
-	}
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
 
-	/* Is it classful? */
-	cops = q->ops->cl_ops;
-	if (!cops)
-		return -EINVAL;
+		/* Find link */
+		dev = __dev_get_by_index(net, t->tcm_ifindex);
+		if (!dev)
+			return -ENODEV;
 
-	if (!cops->tcf_block)
-		return -EOPNOTSUPP;
+		/* Find qdisc */
+		if (!parent) {
+			q = dev->qdisc;
+			parent = q->handle;
+		} else {
+			q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
+			if (!q)
+				return -EINVAL;
+		}
 
-	/* Do we search for filter, attached to class? */
-	if (TC_H_MIN(parent)) {
-		cl = cops->find(q, parent);
-		if (cl == 0)
-			return -ENOENT;
-	}
+		/* Is it classful? */
+		cops = q->ops->cl_ops;
+		if (!cops)
+			return -EINVAL;
 
-	/* And the last stroke */
-	block = cops->tcf_block(q, cl, extack);
-	if (!block) {
-		err = -EINVAL;
-		goto errout;
+		if (!cops->tcf_block)
+			return -EOPNOTSUPP;
+
+		/* Do we search for filter, attached to class? */
+		if (TC_H_MIN(parent)) {
+			cl = cops->find(q, parent);
+			if (cl == 0)
+				return -ENOENT;
+		}
+
+		/* And the last stroke */
+		block = cops->tcf_block(q, cl, extack);
+		if (!block) {
+			err = -EINVAL;
+			goto errout;
+		}
+		if (tcf_block_shared(block)) {
+			NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
 	}
 
 	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
@@ -1086,7 +1108,7 @@ replay:
 	}
 
 	if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
-		tfilter_notify_chain(net, skb, q, parent, n,
+		tfilter_notify_chain(net, skb, block, q, parent, n,
 				     chain, RTM_DELTFILTER);
 		tcf_chain_flush(chain);
 		err = 0;
@@ -1134,7 +1156,7 @@ replay:
 	if (!fh) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
 			tcf_chain_tp_remove(chain, &chain_info, tp);
-			tfilter_notify(net, skb, n, tp, q, parent, fh,
+			tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 				       RTM_DELTFILTER, false);
 			tcf_proto_destroy(tp);
 			err = 0;
@@ -1159,8 +1181,8 @@ replay:
 			}
 			break;
 		case RTM_DELTFILTER:
-			err = tfilter_del_notify(net, skb, n, tp, q, parent,
-						 fh, false, &last);
+			err = tfilter_del_notify(net, skb, n, tp, block,
+						 q, parent, fh, false, &last);
 			if (err)
 				goto errout;
 			if (last) {
@@ -1169,8 +1191,8 @@ replay:
 			}
 			goto errout;
 		case RTM_GETTFILTER:
-			err = tfilter_notify(net, skb, n, tp, q, parent, fh,
-					     RTM_NEWTFILTER, true);
+			err = tfilter_notify(net, skb, n, tp, block, q, parent,
+					     fh, RTM_NEWTFILTER, true);
 			goto errout;
 		default:
 			err = -EINVAL;
@@ -1183,7 +1205,7 @@ replay:
 	if (err == 0) {
 		if (tp_created)
 			tcf_chain_tp_insert(chain, &chain_info, tp);
-		tfilter_notify(net, skb, n, tp, q, parent, fh,
+		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 			       RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
@@ -1203,6 +1225,7 @@ struct tcf_dump_args {
 	struct tcf_walker w;
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
+	struct tcf_block *block;
 	struct Qdisc *q;
 	u32 parent;
 };
@@ -1212,7 +1235,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
 	struct tcf_dump_args *a = (void *)arg;
 	struct net *net = sock_net(a->skb->sk);
 
-	return tcf_fill_node(net, a->skb, tp, a->q, a->parent,
+	return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
 			     n, NETLINK_CB(a->cb->skb).portid,
 			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
 			     RTM_NEWTFILTER);
@@ -1223,6 +1246,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 			   long index_start, long *p_index)
 {
 	struct net *net = sock_net(skb->sk);
+	struct tcf_block *block = chain->block;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	struct tcf_dump_args arg;
 	struct tcf_proto *tp;
@@ -1241,7 +1265,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 			memset(&cb->args[1], 0,
 			       sizeof(cb->args) - sizeof(cb->args[0]));
 		if (cb->args[1] == 0) {
-			if (tcf_fill_node(net, skb, tp, q, parent, 0,
+			if (tcf_fill_node(net, skb, tp, block, q, parent, 0,
 					  NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					  RTM_NEWTFILTER) <= 0)
@@ -1254,6 +1278,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 		arg.w.fn = tcf_node_dump;
 		arg.skb = skb;
 		arg.cb = cb;
+		arg.block = block;
 		arg.q = q;
 		arg.parent = parent;
 		arg.w.stop = 0;
@@ -1272,13 +1297,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_MAX + 1];
-	struct net_device *dev;
-	struct Qdisc *q;
+	struct Qdisc *q = NULL;
 	struct tcf_block *block;
 	struct tcf_chain *chain;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
-	unsigned long cl = 0;
-	const struct Qdisc_class_ops *cops;
 	long index_start;
 	long index;
 	u32 parent;
@@ -1291,32 +1313,44 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if (err)
 		return err;
 
-	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
-	if (!dev)
-		return skb->len;
-
-	parent = tcm->tcm_parent;
-	if (!parent) {
-		q = dev->qdisc;
-		parent = q->handle;
+	if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, tcm->tcm_block_index);
+		if (!block)
+			goto out;
 	} else {
-		q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
-	}
-	if (!q)
-		goto out;
-	cops = q->ops->cl_ops;
-	if (!cops)
-		goto out;
-	if (!cops->tcf_block)
-		goto out;
-	if (TC_H_MIN(tcm->tcm_parent)) {
-		cl = cops->find(q, tcm->tcm_parent);
-		if (cl == 0)
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
+		unsigned long cl = 0;
+
+		dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+		if (!dev)
+			return skb->len;
+
+		parent = tcm->tcm_parent;
+		if (!parent) {
+			q = dev->qdisc;
+			parent = q->handle;
+		} else {
+			q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+		}
+		if (!q)
 			goto out;
+		cops = q->ops->cl_ops;
+		if (!cops)
+			goto out;
+		if (!cops->tcf_block)
+			goto out;
+		if (TC_H_MIN(tcm->tcm_parent)) {
+			cl = cops->find(q, tcm->tcm_parent);
+			if (cl == 0)
+				goto out;
+		}
+		block = cops->tcf_block(q, cl, NULL);
+		if (!block)
+			goto out;
+		if (tcf_block_shared(block))
+			q = NULL;
 	}
-	block = cops->tcf_block(q, cl, NULL);
-	if (!block)
-		goto out;
 
 	index_start = cb->args[0];
 	index = 0;
-- 
cgit v1.2.3


From d47a6b0e7c492a4ba4524d557db388e34fd0a47a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:52 +0100
Subject: net: sched: introduce ingress/egress block index attributes for qdisc

Introduce two new attributes to be used for qdisc creation and dumping:
one for the ingress block and one for the egress block. Also introduce a
set of ops that a qdisc supporting block sharing would implement.

Passing block indexes in a qdisc change is not supported yet; this is
checked and forbidden.

In future, these attributes are to be reused for specifying block
indexes for classes as well. As of this moment however, it is not
supported so a check is in place to forbid it.

Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      |  7 +++++
 include/uapi/linux/rtnetlink.h |  2 ++
 net/sched/sch_api.c            | 60 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bf5cc0a1d0f6..cfc19d0ba2ad 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -204,6 +204,13 @@ struct Qdisc_ops {
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
 	int			(*dump_stats)(struct Qdisc *, struct gnet_dump *);
 
+	void			(*ingress_block_set)(struct Qdisc *sch,
+						     u32 block_index);
+	void			(*egress_block_set)(struct Qdisc *sch,
+						    u32 block_index);
+	u32			(*ingress_block_get)(struct Qdisc *sch);
+	u32			(*egress_block_get)(struct Qdisc *sch);
+
 	struct module		*owner;
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index da878f2e7c39..9b15005955fa 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -568,6 +568,8 @@ enum {
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
 	TCA_HW_OFFLOAD,
+	TCA_INGRESS_BLOCK,
+	TCA_EGRESS_BLOCK,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 7dffa9dce28b..d512f49ee83c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -791,6 +791,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	unsigned char *b = skb_tail_pointer(skb);
 	struct gnet_dump d;
 	struct qdisc_size_table *stab;
+	u32 block_index;
 	__u32 qlen;
 
 	cond_resched();
@@ -807,6 +808,18 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (q->ops->ingress_block_get) {
+		block_index = q->ops->ingress_block_get(q);
+		if (block_index &&
+		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
+			goto nla_put_failure;
+	}
+	if (q->ops->egress_block_get) {
+		block_index = q->ops->egress_block_get(q);
+		if (block_index &&
+		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
+			goto nla_put_failure;
+	}
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
@@ -994,6 +1007,40 @@ skip:
 	return err;
 }
 
+static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
+				   struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+
+	if (tca[TCA_INGRESS_BLOCK]) {
+		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
+
+		if (!block_index) {
+			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
+			return -EINVAL;
+		}
+		if (!sch->ops->ingress_block_set) {
+			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
+			return -EOPNOTSUPP;
+		}
+		sch->ops->ingress_block_set(sch, block_index);
+	}
+	if (tca[TCA_EGRESS_BLOCK]) {
+		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
+
+		if (!block_index) {
+			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
+			return -EINVAL;
+		}
+		if (!sch->ops->egress_block_set) {
+			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
+			return -EOPNOTSUPP;
+		}
+		sch->ops->egress_block_set(sch, block_index);
+	}
+	return 0;
+}
+
 /* lockdep annotation is needed for ingress; egress gets it only for name */
 static struct lock_class_key qdisc_tx_lock;
 static struct lock_class_key qdisc_rx_lock;
@@ -1088,6 +1135,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
 	}
 
+	err = qdisc_block_indexes_set(sch, tca, extack);
+	if (err)
+		goto err_out3;
+
 	if (ops->init) {
 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
 		if (err != 0)
@@ -1169,6 +1220,10 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
 			return -EINVAL;
 		}
+		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+			return -EOPNOTSUPP;
+		}
 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
 		if (err)
 			return err;
@@ -1894,6 +1949,11 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
 		}
 	}
 
+	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
+		return -EOPNOTSUPP;
+	}
+
 	new_cl = cl;
 	err = -EOPNOTSUPP;
 	if (cops->change)
-- 
cgit v1.2.3


From 51ab2994c387c80b45caf8b8067b3f3b97771d25 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:53 +0100
Subject: net: sched: allow ingress and clsact qdiscs to share filter blocks

Benefit from the previously introduced shared filter blocks
infrastructure and allow ingress and clsact qdisc instances to share
filter blocks. The block index is coming from userspace as qdisc option.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_ingress.c | 80 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 3372dd5e984d..ce3f55259d0d 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -61,6 +61,20 @@ static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
 	struct mini_Qdisc_pair *miniqp = priv;
 
 	mini_qdisc_pair_swap(miniqp, tp_head);
+};
+
+static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index)
+{
+	struct ingress_sched_data *q = qdisc_priv(sch);
+
+	q->block_info.block_index = block_index;
+}
+
+static u32 ingress_ingress_block_get(struct Qdisc *sch)
+{
+	struct ingress_sched_data *q = qdisc_priv(sch);
+
+	return q->block_info.block_index;
 }
 
 static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
@@ -113,14 +127,16 @@ static const struct Qdisc_class_ops ingress_class_ops = {
 };
 
 static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
-	.cl_ops		=	&ingress_class_ops,
-	.id		=	"ingress",
-	.priv_size	=	sizeof(struct ingress_sched_data),
-	.static_flags	=	TCQ_F_CPUSTATS,
-	.init		=	ingress_init,
-	.destroy	=	ingress_destroy,
-	.dump		=	ingress_dump,
-	.owner		=	THIS_MODULE,
+	.cl_ops			=	&ingress_class_ops,
+	.id			=	"ingress",
+	.priv_size		=	sizeof(struct ingress_sched_data),
+	.static_flags		=	TCQ_F_CPUSTATS,
+	.init			=	ingress_init,
+	.destroy		=	ingress_destroy,
+	.dump			=	ingress_dump,
+	.ingress_block_set	=	ingress_ingress_block_set,
+	.ingress_block_get	=	ingress_ingress_block_get,
+	.owner			=	THIS_MODULE,
 };
 
 struct clsact_sched_data {
@@ -164,6 +180,34 @@ static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl,
 	}
 }
 
+static void clsact_ingress_block_set(struct Qdisc *sch, u32 block_index)
+{
+	struct clsact_sched_data *q = qdisc_priv(sch);
+
+	q->ingress_block_info.block_index = block_index;
+}
+
+static void clsact_egress_block_set(struct Qdisc *sch, u32 block_index)
+{
+	struct clsact_sched_data *q = qdisc_priv(sch);
+
+	q->egress_block_info.block_index = block_index;
+}
+
+static u32 clsact_ingress_block_get(struct Qdisc *sch)
+{
+	struct clsact_sched_data *q = qdisc_priv(sch);
+
+	return q->ingress_block_info.block_index;
+}
+
+static u32 clsact_egress_block_get(struct Qdisc *sch)
+{
+	struct clsact_sched_data *q = qdisc_priv(sch);
+
+	return q->egress_block_info.block_index;
+}
+
 static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 		       struct netlink_ext_ack *extack)
 {
@@ -215,14 +259,18 @@ static const struct Qdisc_class_ops clsact_class_ops = {
 };
 
 static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
-	.cl_ops		=	&clsact_class_ops,
-	.id		=	"clsact",
-	.priv_size	=	sizeof(struct clsact_sched_data),
-	.static_flags	=	TCQ_F_CPUSTATS,
-	.init		=	clsact_init,
-	.destroy	=	clsact_destroy,
-	.dump		=	ingress_dump,
-	.owner		=	THIS_MODULE,
+	.cl_ops			=	&clsact_class_ops,
+	.id			=	"clsact",
+	.priv_size		=	sizeof(struct clsact_sched_data),
+	.static_flags		=	TCQ_F_CPUSTATS,
+	.init			=	clsact_init,
+	.destroy		=	clsact_destroy,
+	.dump			=	ingress_dump,
+	.ingress_block_set	=	clsact_ingress_block_set,
+	.egress_block_set	=	clsact_egress_block_set,
+	.ingress_block_get	=	clsact_ingress_block_get,
+	.egress_block_get	=	clsact_egress_block_get,
+	.owner			=	THIS_MODULE,
 };
 
 static int __init ingress_module_init(void)
-- 
cgit v1.2.3


From 0c06bea919f3289368a023d1a62a1bc319617fa3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 16 Jan 2018 12:31:41 +0300
Subject: net: Fix possible race in peernet2id_alloc()

peernet2id_alloc() is racy without rtnl_lock(), as refcount_read(&peer->count)
under net->nsid_lock does not guarantee that peer is alive:

rcu_read_lock()
peernet2id_alloc()                            ..
  spin_lock_bh(&net->nsid_lock)               ..
  refcount_read(&peer->count) (!= 0)          ..
  ..                                          put_net()
  ..                                            cleanup_net()
  ..                                              for_each_net(tmp)
  ..                                                spin_lock_bh(&tmp->nsid_lock)
  ..                                                __peernet2id(tmp, net) == -1
  ..                                                    ..
  ..                                                    ..
    __peernet2id_alloc(alloc == true)                   ..
  ..                                                    ..
rcu_read_unlock()                                       ..
..                                                synchronize_rcu()
..                                                kmem_cache_free(net)

After the above situation, net::netns_ids contains an id pointing to freed
memory, and any later dereference via that id will operate on freed memory.

Currently, peernet2id_alloc() is used under rtnl_lock() everywhere except
ovs_vport_cmd_fill_info(), so this race can't occur. But peernet2id_alloc()
is a generic interface, and it is better to fix it before someone really
starts using it in the wrong context.

v2: Don't place refcount_read(&net->count) under net->nsid_lock
    as suggested by Eric W. Biederman <ebiederm@xmission.com>
v3: Rebase on top of net-next

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2213d45fcafd..3c77d84ad60d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -221,17 +221,26 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
  */
 int peernet2id_alloc(struct net *net, struct net *peer)
 {
-	bool alloc;
+	bool alloc = false, alive = false;
 	int id;
 
 	if (refcount_read(&net->count) == 0)
 		return NETNSA_NSID_NOT_ASSIGNED;
 	spin_lock_bh(&net->nsid_lock);
-	alloc = refcount_read(&peer->count) == 0 ? false : true;
+	/*
+	 * When peer is obtained from RCU lists, we may race with
+	 * its cleanup. Check whether it's alive, and this guarantees
+	 * we never hash a peer back to net->netns_ids, after it has
+	 * just been idr_remove()'d from there in cleanup_net().
+	 */
+	if (maybe_get_net(peer))
+		alive = alloc = true;
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
 		rtnl_net_notifyid(net, RTM_NEWNSID, id);
+	if (alive)
+		put_net(peer);
 	return id;
 }
 EXPORT_SYMBOL_GPL(peernet2id_alloc);
-- 
cgit v1.2.3


From 42157277af17d5c05946c700eb03877d60760d3c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 16 Jan 2018 12:31:54 +0300
Subject: net: Remove spinlock from get_net_ns_by_id()

idr_find() is safe under rcu_read_lock() and
maybe_get_net() guarantees that net is alive.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3c77d84ad60d..1ccb953b3b09 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -273,11 +273,9 @@ struct net *get_net_ns_by_id(struct net *net, int id)
 		return NULL;
 
 	rcu_read_lock();
-	spin_lock_bh(&net->nsid_lock);
 	peer = idr_find(&net->netns_ids, id);
 	if (peer)
 		peer = maybe_get_net(peer);
-	spin_unlock_bh(&net->nsid_lock);
 	rcu_read_unlock();
 
 	return peer;
-- 
cgit v1.2.3


From 50bd870a9e5cca9fcf5fb4c130c373643d7d9906 Mon Sep 17 00:00:00 2001
From: Yossef Efraim <yossefe@mellanox.com>
Date: Sun, 14 Jan 2018 11:39:10 +0200
Subject: xfrm: Add ESN support for IPSec HW offload

This patch adds ESN support to IPsec device offload.
It adds a new xfrm device operation to synchronize the device ESN.

Signed-off-by: Yossef Efraim <yossefe@mellanox.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 Documentation/networking/xfrm_device.txt |  3 +++
 include/linux/netdevice.h                |  1 +
 include/net/xfrm.h                       | 12 ++++++++++++
 net/xfrm/xfrm_device.c                   | 11 +++++++++--
 net/xfrm/xfrm_replay.c                   |  2 ++
 5 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt
index 2d9d588cd34b..50c34ca65efe 100644
--- a/Documentation/networking/xfrm_device.txt
+++ b/Documentation/networking/xfrm_device.txt
@@ -41,6 +41,7 @@ struct xfrmdev_ops {
 	void	(*xdo_dev_state_free) (struct xfrm_state *x);
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
+	void    (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
 };
 
 The NIC driver offering ipsec offload will need to implement these
@@ -117,6 +118,8 @@ the stack in xfrm_input().
 
 	hand the packet to napi_gro_receive() as usual
 
+In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn().
+Driver will check packet seq number and update HW ESN state machine if needed.
 
 When the SA is removed by the user, the driver's xdo_dev_state_delete()
 is asked to disable the offload.  Later, xdo_dev_state_free() is called
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ed0799a12bf2..540151875444 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -851,6 +851,7 @@ struct xfrmdev_ops {
 	void	(*xdo_dev_state_free) (struct xfrm_state *x);
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
+	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
 };
 #endif
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 2e6d4fe6b0ba..7d2077665c0b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		       struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
+static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
+{
+	struct xfrm_state_offload *xso = &x->xso;
+
+	if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
+		xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
+}
+
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 {
 	struct xfrm_state *x = dst->xfrm;
@@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x
 	return false;
 }
 
+static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
+{
+}
+
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 {
 	return false;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 75982506617b..93520106731f 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -147,8 +147,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	if (!x->type_offload)
 		return -EINVAL;
 
-	/* We don't yet support UDP encapsulation, TFC padding and ESN. */
-	if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN))
+	/* We don't yet support UDP encapsulation and TFC padding. */
+	if (x->encap || x->tfcpad)
 		return -EINVAL;
 
 	dev = dev_get_by_index(net, xuo->ifindex);
@@ -178,6 +178,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		return 0;
 	}
 
+	if (x->props.flags & XFRM_STATE_ESN &&
+	    !dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
+		xso->dev = NULL;
+		dev_put(dev);
+		return -EINVAL;
+	}
+
 	xso->dev = dev;
 	xso->num_exthdrs = 1;
 	xso->flags = xuo->flags;
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 02501817227b..1d38c6acf8af 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
 			bitnr = replay_esn->replay_window - (diff - pos);
 	}
 
+	xfrm_dev_state_advance_esn(x);
+
 	nr = bitnr >> 5;
 	bitnr = bitnr & 0x1F;
 	replay_esn->bmp[nr] |= (1U << bitnr);
-- 
cgit v1.2.3


From d680b3524cd2b9c4f1dc2ba1823c538988bb85e2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 18 Jan 2018 16:14:49 +0100
Subject: net: sched: silence uninitialized parent variable warning in
 tc_dump_tfilter

When tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK, parent is still passed
down but the value is never used. The compiler does not recognize this and
issues a warning. Silence it by initializing parent to 0.

Fixes: 7960d1daf278 ("net: sched: use block index as a handle instead of qdisc when block is shared")
Reported-by: David Miller <davem@davemloft.net>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e500d11da9cd..86d6e9d2cf00 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1317,6 +1317,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 		block = tcf_block_lookup(net, tcm->tcm_block_index);
 		if (!block)
 			goto out;
+		/* If we work with block index, q is NULL and parent value
+		 * will never be used in the following code. The check
+		 * in tcf_fill_node prevents it. However, compiler does not
+		 * see that far, so set parent to zero to silence the warning
+		 * about parent being uninitialized.
+		 */
+		parent = 0;
 	} else {
 		const struct Qdisc_class_ops *cops;
 		struct net_device *dev;
-- 
cgit v1.2.3


From 89290b831ec1a0b233fdc7aaad84acdf4ebbf6aa Mon Sep 17 00:00:00 2001
From: Christopher Díaz Riveros <chrisadr@gentoo.org>
Date: Wed, 17 Jan 2018 16:10:28 -0500
Subject: flow_netlink: Remove unneeded semicolons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trivial fix removes unneeded semicolons after if blocks.

This issue was detected by using the Coccinelle software.

Signed-off-by: Christopher Díaz Riveros <chrisadr@gentoo.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index f143908b651d..eb55f1b3d047 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2454,7 +2454,7 @@ static int validate_geneve_opts(struct sw_flow_key *key)
 
 		option = (struct geneve_opt *)((u8 *)option + len);
 		opts_len -= len;
-	};
+	}
 
 	key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
 
@@ -2487,7 +2487,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
 			break;
 		}
-	};
+	}
 
 	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
 	if (start < 0)
-- 
cgit v1.2.3


From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 17 Jan 2018 16:52:02 -0800
Subject: bpf: allow socket_filter programs to use bpf_prog_test_run

In order to improve test coverage, allow the socket_filter program type
to be run via the bpf_prog_test_run command.
Since such programs can be loaded by non-root users, tighten the
permissions for bpf_prog_test_run to be root only
to avoid surprises.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 2 ++
 net/core/filter.c    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c28524483bf4..97a825ffc763 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index db2ee8c7e1bd..30fafaaa90fa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4526,6 +4526,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
 };
 
 const struct bpf_prog_ops sk_filter_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
 };
 
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
-- 
cgit v1.2.3


From ef58ca38dbda0642e293bcaa2e05edc79677c617 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 18 Jan 2018 16:30:49 -0800
Subject: net/sched/sch_prio.c: work around gcc-4.4.4 union initializer issues

gcc-4.4.4 has problems with anon union initializers.  Work around this.

net/sched/sch_prio.c: In function 'prio_dump_offload':
net/sched/sch_prio.c:260: error: unknown field 'stats' specified in initializer
net/sched/sch_prio.c:260: warning: initialization makes integer from pointer without a cast
net/sched/sch_prio.c:261: error: unknown field 'stats' specified in initializer
net/sched/sch_prio.c:261: warning: initialization makes integer from pointer without a cast

Fixes: 7fdb61b44c0c95 ("net: sch: prio: Add offload ability to PRIO qdisc")
Cc: Nogah Frankel <nogahf@mellanox.com>
Cc: Yuval Mintz <yuvalm@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_prio.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index a398502899a9..efbf51f35778 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -254,11 +254,15 @@ static int prio_dump_offload(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_prio_qopt_offload hw_stats = {
+		.command = TC_PRIO_STATS,
 		.handle = sch->handle,
 		.parent = sch->parent,
-		.command = TC_PRIO_STATS,
-		.stats.bstats = &sch->bstats,
-		.stats.qstats = &sch->qstats,
+		{
+			.stats = {
+				.bstats = &sch->bstats,
+				.qstats = &sch->qstats,
+			},
+		},
 	};
 	int err;
 
-- 
cgit v1.2.3


From f4dbc4c20f05ccf6986b0de429f7552b21a1b362 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 8 Jan 2018 00:09:37 +0900
Subject: netfilter: nf_nat_snmp_basic: remove useless comment

Remove comments that do not convey any important information.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d5b1e0b3f687..d6f03fe9f9b4 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -81,12 +81,6 @@ struct oct1_map
 };
 
 
-/*****************************************************************************
- *
- * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
 /* Class */
 #define ASN1_UNI	0	/* Universal */
 #define ASN1_APL	1	/* Application */
@@ -491,12 +485,6 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
 	return 1;
 }
 
-/*****************************************************************************
- *
- * SNMP decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
 /* SNMP Versions */
 #define SNMP_V1				0
 #define SNMP_V2C			1
@@ -992,12 +980,6 @@ err_id_free:
 	return 0;
 }
 
-/*****************************************************************************
- *
- * Misc. routines
- *
- *****************************************************************************/
-
 /*
  * Parse and mangle SNMP message according to mapping.
  * (And this is the fucking 'basic' method).
@@ -1157,12 +1139,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 	return 1;
 }
 
-/*****************************************************************************
- *
- * NAT routines.
- *
- *****************************************************************************/
-
 /*
  * SNMP translation routine.
  */
@@ -1259,12 +1235,6 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
 	.tuple.dst.protonum	= IPPROTO_UDP,
 };
 
-/*****************************************************************************
- *
- * Module stuff.
- *
- *****************************************************************************/
-
 static int __init nf_nat_snmp_basic_init(void)
 {
 	BUG_ON(nf_nat_snmp_hook != NULL);
-- 
cgit v1.2.3


From e29e5ddca0dd7a01831f8aae6ea2664076fa8fd4 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 8 Jan 2018 00:09:56 +0900
Subject: netfilter: nf_nat_snmp_basic: remove debug parameter

To see the debug messages of nf_nat_snmp_basic, the debug value has to be
set when the module is inserted. This is inconvenient, and dynamic
debugging alone is enough for debugging.

This patch just removes the debug code. Debugging code will be added
in the next patch.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 62 ----------------------------------
 1 file changed, 62 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d6f03fe9f9b4..e5ec946f0765 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -66,7 +66,6 @@ MODULE_ALIAS("ip_nat_snmp_basic");
 #define SNMP_TRAP_PORT 162
 #define NOCT1(n) (*(u8 *)(n))
 
-static int debug;
 static DEFINE_SPINLOCK(snmp_lock);
 
 /*
@@ -888,23 +887,13 @@ static inline void mangle_address(unsigned char *begin,
 				  __sum16 *check)
 {
 	if (map->from == NOCT1(addr)) {
-		u_int32_t old;
-
-		if (debug)
-			memcpy(&old, addr, sizeof(old));
-
 		*addr = map->to;
 
 		/* Update UDP checksum if being used */
 		if (*check) {
 			fast_csum(check,
 				  &map->from, &map->to, addr - begin);
-
 		}
-
-		if (debug)
-			printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n",
-			       &old, addr);
 	}
 }
 
@@ -995,10 +984,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 	struct asn1_octstr comm;
 	struct snmp_object *obj;
 
-	if (debug > 1)
-		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1,
-			       msg, len, 0);
-
 	asn1_open(&ctx, msg, len);
 
 	/*
@@ -1018,8 +1003,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 		return 0;
 	if (!asn1_uint_decode (&ctx, end, &vers))
 		return 0;
-	if (debug > 1)
-		pr_debug("bsalg: snmp version: %u\n", vers + 1);
 	if (vers > 1)
 		return 1;
 
@@ -1032,14 +1015,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 		return 0;
 	if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
 		return 0;
-	if (debug > 1) {
-		unsigned int i;
-
-		pr_debug("bsalg: community: ");
-		for (i = 0; i < comm.len; i++)
-			pr_cont("%c", comm.data[i]);
-		pr_cont("\n");
-	}
 	kfree(comm.data);
 
 	/*
@@ -1049,23 +1024,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 		return 0;
 	if (cls != ASN1_CTX || con != ASN1_CON)
 		return 0;
-	if (debug > 1) {
-		static const unsigned char *const pdus[] = {
-			[SNMP_PDU_GET] = "get",
-			[SNMP_PDU_NEXT] = "get-next",
-			[SNMP_PDU_RESPONSE] = "response",
-			[SNMP_PDU_SET] = "set",
-			[SNMP_PDU_TRAP1] = "trapv1",
-			[SNMP_PDU_BULK] = "bulk",
-			[SNMP_PDU_INFORM] = "inform",
-			[SNMP_PDU_TRAP2] = "trapv2"
-		};
-
-		if (pdutype > SNMP_PDU_TRAP2)
-			pr_debug("bsalg: bad pdu type %u\n", pdutype);
-		else
-			pr_debug("bsalg: pdu: %s\n", pdus[pdutype]);
-	}
 	if (pdutype != SNMP_PDU_RESPONSE &&
 	    pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
 		return 1;
@@ -1088,11 +1046,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 
 		if (!snmp_request_decode(&ctx, &req))
 			return 0;
-
-		if (debug > 1)
-			pr_debug("bsalg: request: id=0x%lx error_status=%u "
-			"error_index=%u\n", req.id, req.error_status,
-			req.error_index);
 	}
 
 	/*
@@ -1105,8 +1058,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 		return 0;
 
 	while (!asn1_eoc_decode(&ctx, eoc)) {
-		unsigned int i;
-
 		if (!snmp_object_decode(&ctx, &obj)) {
 			if (obj) {
 				kfree(obj->id);
@@ -1115,17 +1066,6 @@ static int snmp_parse_mangle(unsigned char *msg,
 			return 0;
 		}
 
-		if (debug > 1) {
-			pr_debug("bsalg: object: ");
-			for (i = 0; i < obj->id_len; i++) {
-				if (i > 0)
-					pr_cont(".");
-				pr_cont("%lu", obj->id[i]);
-			}
-			pr_cont(": type=%u\n", obj->type);
-
-		}
-
 		if (obj->type == SNMP_IPADDR)
 			mangle_address(ctx.begin, ctx.pointer - 4, map, check);
 
@@ -1252,5 +1192,3 @@ static void __exit nf_nat_snmp_basic_fini(void)
 
 module_init(nf_nat_snmp_basic_init);
 module_exit(nf_nat_snmp_basic_fini);
-
-module_param(debug, int, 0600);
-- 
cgit v1.2.3


From 8b8f0813b763efb55a8462dab096868a6bafd1ba Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 8 Jan 2018 00:10:07 +0900
Subject: netfilter: nf_nat_snmp_basic: replace ctinfo with dir.

snmp_translate() receives ctinfo only to derive the dir value from it.
Because the caller already has the dir value, just pass dir instead of ctinfo.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index e5ec946f0765..c8ac57f56318 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1082,15 +1082,12 @@ static int snmp_parse_mangle(unsigned char *msg,
 /*
  * SNMP translation routine.
  */
-static int snmp_translate(struct nf_conn *ct,
-			  enum ip_conntrack_info ctinfo,
-			  struct sk_buff *skb)
+static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
 {
 	struct iphdr *iph = ip_hdr(skb);
 	struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
 	u_int16_t udplen = ntohs(udph->len);
 	u_int16_t paylen = udplen - sizeof(struct udphdr);
-	int dir = CTINFO2DIR(ctinfo);
 	struct oct1_map map;
 
 	/*
@@ -1155,7 +1152,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 		return NF_DROP;
 
 	spin_lock_bh(&snmp_lock);
-	ret = snmp_translate(ct, ctinfo, skb);
+	ret = snmp_translate(ct, dir, skb);
 	spin_unlock_bh(&snmp_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From bea588b0281fa3d2ddf365039d1dfddddcbe9aa2 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 8 Jan 2018 00:10:21 +0900
Subject: netfilter: nf_nat_snmp_basic: use nf_ct_helper_log

Use nf_ct_helper_log() to write log messages.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c8ac57f56318..7f7d847bb048 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1109,7 +1109,7 @@ static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
 
 	if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
 			       paylen, &map, &udph->check)) {
-		net_warn_ratelimited("bsalg: parser failed\n");
+		nf_ct_helper_log(skb, ct, "parser failed\n");
 		return NF_DROP;
 	}
 	return NF_ACCEPT;
@@ -1143,13 +1143,14 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 	 * can mess around with the payload.
 	 */
 	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
-		net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
-				     &iph->saddr, &iph->daddr);
-		 return NF_DROP;
+		nf_ct_helper_log(skb, ct, "dropping malformed packet\n");
+		return NF_DROP;
 	}
 
-	if (!skb_make_writable(skb, skb->len))
+	if (!skb_make_writable(skb, skb->len)) {
+		nf_ct_helper_log(skb, ct, "cannot mangle packet");
 		return NF_DROP;
+	}
 
 	spin_lock_bh(&snmp_lock);
 	ret = snmp_translate(ct, dir, skb);
-- 
cgit v1.2.3


From cc2d58634e0f489d28b5564c05abc69930b4d920 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 8 Jan 2018 00:10:33 +0900
Subject: netfilter: nf_nat_snmp_basic: use asn1 decoder library

The basic SNMP ALG parses the SNMP ASN.1 payload with its own routines;
however, since 2012 the Linux kernel has provided an ASN.1 decoder library.
If we use the ASN.1 decoder in lib/asn1_decoder.c, we can remove
about 1000 lines of ASN.1 parsing routines.

To use asn1_decoder.c, we have to write a MIB file (nf_nat_snmp_basic.asn1);
scripts/asn1_compiler.c then generates the *-asn1.c and *-asn1.h files
at compile time (nf_nat_snmp_basic-asn1.c, nf_nat_snmp_basic-asn1.h).
nf_nat_snmp_basic.asn1 is based on RFC1155, RFC1157, RFC1902, RFC1905,
RFC2578 and RFC3416. Of course, that MIB file supports only the basic SNMP ALG.

The previous SNMP ALG mangled only the first octet of an IPv4 address,
but after this patch the SNMP ALG mangles the whole IPv4 address.
SNMPv3 is not supported.

I tested with SNMP commands such as snmpd, snmpwalk and snmptrap.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig                  |    1 +
 net/ipv4/netfilter/Makefile                 |    6 +
 net/ipv4/netfilter/nf_nat_snmp_basic.asn1   |  177 ++++
 net/ipv4/netfilter/nf_nat_snmp_basic.c      | 1192 ---------------------------
 net/ipv4/netfilter/nf_nat_snmp_basic_main.c |  235 ++++++
 5 files changed, 419 insertions(+), 1192 deletions(-)
 create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic.asn1
 delete mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic.c
 create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic_main.c

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index f28b08819f89..5f52236780b4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -158,6 +158,7 @@ config NF_NAT_SNMP_BASIC
 	depends on NF_CONNTRACK_SNMP
 	depends on NETFILTER_ADVANCED
 	default NF_NAT && NF_CONNTRACK_SNMP
+	select ASN1
 	---help---
 
 	  This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8bb1f0c7a375..2dad20eefd26 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -27,9 +27,15 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
+
+nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o
+nf_nat_snmp_basic-y : nf_nat_snmp_basic-asn1.h nf_nat_snmp_basic-asn1.c
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h
+
 obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
 
+
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
new file mode 100644
index 000000000000..24b73268f362
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
@@ -0,0 +1,177 @@
+Message ::=
+	SEQUENCE {
+		version
+			INTEGER ({snmp_version}),
+
+		community
+			OCTET STRING,
+
+		pdu
+			PDUs
+	}
+
+
+ObjectName ::=
+	OBJECT IDENTIFIER
+
+ObjectSyntax ::=
+	CHOICE {
+		simple
+			SimpleSyntax,
+
+		application-wide
+			ApplicationSyntax
+	}
+
+SimpleSyntax ::=
+	CHOICE {
+		integer-value
+			INTEGER,
+
+		string-value
+			OCTET STRING,
+
+		objectID-value
+			OBJECT IDENTIFIER
+	}
+
+ApplicationSyntax ::=
+	CHOICE {
+		ipAddress-value
+			IpAddress,
+
+		counter-value
+			Counter32,
+
+		timeticks-value
+			TimeTicks,
+
+		arbitrary-value
+			Opaque,
+
+		big-counter-value
+			Counter64,
+
+		unsigned-integer-value
+			Unsigned32
+	}
+
+IpAddress ::=
+	[APPLICATION 0]
+		IMPLICIT OCTET STRING OPTIONAL ({snmp_helper})
+
+Counter32 ::=
+	[APPLICATION 1]
+		IMPLICIT INTEGER OPTIONAL
+
+Unsigned32 ::=
+	[APPLICATION 2]
+		IMPLICIT INTEGER OPTIONAL
+
+Gauge32 ::= Unsigned32 OPTIONAL
+
+TimeTicks ::=
+	[APPLICATION 3]
+		IMPLICIT INTEGER OPTIONAL
+
+Opaque ::=
+	[APPLICATION 4]
+		IMPLICIT OCTET STRING OPTIONAL
+
+Counter64 ::=
+	[APPLICATION 6]
+		IMPLICIT INTEGER OPTIONAL
+
+PDUs ::=
+	CHOICE {
+		get-request
+			GetRequest-PDU,
+
+		get-next-request
+			GetNextRequest-PDU,
+
+		get-bulk-request
+			GetBulkRequest-PDU,
+
+		response
+			Response-PDU,
+
+		set-request
+			SetRequest-PDU,
+
+		inform-request
+			InformRequest-PDU,
+
+		snmpV2-trap
+			SNMPv2-Trap-PDU,
+
+		report
+			Report-PDU
+	}
+
+GetRequest-PDU ::=
+	[0] IMPLICIT PDU OPTIONAL
+
+GetNextRequest-PDU ::=
+	[1] IMPLICIT PDU OPTIONAL
+
+Response-PDU ::=
+	[2] IMPLICIT PDU OPTIONAL
+
+SetRequest-PDU ::=
+	[3] IMPLICIT PDU OPTIONAL
+
+-- [4] is obsolete
+
+GetBulkRequest-PDU ::=
+	[5] IMPLICIT PDU OPTIONAL
+
+InformRequest-PDU ::=
+	[6] IMPLICIT PDU OPTIONAL
+
+SNMPv2-Trap-PDU ::=
+	[7] IMPLICIT PDU OPTIONAL
+
+Report-PDU ::=
+	[8] IMPLICIT PDU OPTIONAL
+
+PDU ::=
+	SEQUENCE {
+		request-id
+			INTEGER,
+
+		error-status
+			INTEGER,
+
+		error-index
+			INTEGER,
+
+		variable-bindings
+			VarBindList
+	}
+
+
+VarBind ::=
+	SEQUENCE {
+		name
+			ObjectName,
+
+	CHOICE {
+		value
+			ObjectSyntax,
+
+		unSpecified
+			NULL,
+
+		noSuchObject
+			[0] IMPLICIT NULL,
+
+		noSuchInstance
+			[1] IMPLICIT NULL,
+
+		endOfMibView
+			[2] IMPLICIT NULL
+	}
+}
+
+VarBindList ::= SEQUENCE OF VarBind
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
deleted file mode 100644
index 7f7d847bb048..000000000000
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ /dev/null
@@ -1,1192 +0,0 @@
-/*
- * nf_nat_snmp_basic.c
- *
- * Basic SNMP Application Layer Gateway
- *
- * This IP NAT module is intended for use with SNMP network
- * discovery and monitoring applications where target networks use
- * conflicting private address realms.
- *
- * Static NAT is used to remap the networks from the view of the network
- * management system at the IP layer, and this module remaps some application
- * layer addresses to match.
- *
- * The simplest form of ALG is performed, where only tagged IP addresses
- * are modified.  The module does not need to be MIB aware and only scans
- * messages at the ASN.1/BER level.
- *
- * Currently, only SNMPv1 and SNMPv2 are supported.
- *
- * More information on ALG and associated issues can be found in
- * RFC 2962
- *
- * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory
- * McLean & Jochen Friedrich, stripped down for use in the kernel.
- *
- * Copyright (c) 2000 RP Internet (www.rpi.net.au).
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * Author: James Morris <jmorris@intercode.com.au>
- *
- * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/udp.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter/nf_conntrack_snmp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
-MODULE_ALIAS("ip_nat_snmp_basic");
-
-#define SNMP_PORT 161
-#define SNMP_TRAP_PORT 162
-#define NOCT1(n) (*(u8 *)(n))
-
-static DEFINE_SPINLOCK(snmp_lock);
-
-/*
- * Application layer address mapping mimics the NAT mapping, but
- * only for the first octet in this case (a more flexible system
- * can be implemented if needed).
- */
-struct oct1_map
-{
-	u_int8_t from;
-	u_int8_t to;
-};
-
-
-/* Class */
-#define ASN1_UNI	0	/* Universal */
-#define ASN1_APL	1	/* Application */
-#define ASN1_CTX	2	/* Context */
-#define ASN1_PRV	3	/* Private */
-
-/* Tag */
-#define ASN1_EOC	0	/* End Of Contents */
-#define ASN1_BOL	1	/* Boolean */
-#define ASN1_INT	2	/* Integer */
-#define ASN1_BTS	3	/* Bit String */
-#define ASN1_OTS	4	/* Octet String */
-#define ASN1_NUL	5	/* Null */
-#define ASN1_OJI	6	/* Object Identifier  */
-#define ASN1_OJD	7	/* Object Description */
-#define ASN1_EXT	8	/* External */
-#define ASN1_SEQ	16	/* Sequence */
-#define ASN1_SET	17	/* Set */
-#define ASN1_NUMSTR	18	/* Numerical String */
-#define ASN1_PRNSTR	19	/* Printable String */
-#define ASN1_TEXSTR	20	/* Teletext String */
-#define ASN1_VIDSTR	21	/* Video String */
-#define ASN1_IA5STR	22	/* IA5 String */
-#define ASN1_UNITIM	23	/* Universal Time */
-#define ASN1_GENTIM	24	/* General Time */
-#define ASN1_GRASTR	25	/* Graphical String */
-#define ASN1_VISSTR	26	/* Visible String */
-#define ASN1_GENSTR	27	/* General String */
-
-/* Primitive / Constructed methods*/
-#define ASN1_PRI	0	/* Primitive */
-#define ASN1_CON	1	/* Constructed */
-
-/*
- * Error codes.
- */
-#define ASN1_ERR_NOERROR		0
-#define ASN1_ERR_DEC_EMPTY		2
-#define ASN1_ERR_DEC_EOC_MISMATCH	3
-#define ASN1_ERR_DEC_LENGTH_MISMATCH	4
-#define ASN1_ERR_DEC_BADVALUE		5
-
-/*
- * ASN.1 context.
- */
-struct asn1_ctx
-{
-	int error;			/* Error condition */
-	unsigned char *pointer;		/* Octet just to be decoded */
-	unsigned char *begin;		/* First octet */
-	unsigned char *end;		/* Octet after last octet */
-};
-
-/*
- * Octet string (not null terminated)
- */
-struct asn1_octstr
-{
-	unsigned char *data;
-	unsigned int len;
-};
-
-static void asn1_open(struct asn1_ctx *ctx,
-		      unsigned char *buf,
-		      unsigned int len)
-{
-	ctx->begin = buf;
-	ctx->end = buf + len;
-	ctx->pointer = buf;
-	ctx->error = ASN1_ERR_NOERROR;
-}
-
-static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
-{
-	if (ctx->pointer >= ctx->end) {
-		ctx->error = ASN1_ERR_DEC_EMPTY;
-		return 0;
-	}
-	*ch = *(ctx->pointer)++;
-	return 1;
-}
-
-static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
-{
-	unsigned char ch;
-
-	*tag = 0;
-
-	do
-	{
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-		*tag <<= 7;
-		*tag |= ch & 0x7F;
-	} while ((ch & 0x80) == 0x80);
-	return 1;
-}
-
-static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
-				    unsigned int *cls,
-				    unsigned int *con,
-				    unsigned int *tag)
-{
-	unsigned char ch;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*cls = (ch & 0xC0) >> 6;
-	*con = (ch & 0x20) >> 5;
-	*tag = (ch & 0x1F);
-
-	if (*tag == 0x1F) {
-		if (!asn1_tag_decode(ctx, tag))
-			return 0;
-	}
-	return 1;
-}
-
-static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
-					unsigned int *def,
-					unsigned int *len)
-{
-	unsigned char ch, cnt;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	if (ch == 0x80)
-		*def = 0;
-	else {
-		*def = 1;
-
-		if (ch < 0x80)
-			*len = ch;
-		else {
-			cnt = ch & 0x7F;
-			*len = 0;
-
-			while (cnt > 0) {
-				if (!asn1_octet_decode(ctx, &ch))
-					return 0;
-				*len <<= 8;
-				*len |= ch;
-				cnt--;
-			}
-		}
-	}
-
-	/* don't trust len bigger than ctx buffer */
-	if (*len > ctx->end - ctx->pointer)
-		return 0;
-
-	return 1;
-}
-
-static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
-					unsigned char **eoc,
-					unsigned int *cls,
-					unsigned int *con,
-					unsigned int *tag)
-{
-	unsigned int def, len;
-
-	if (!asn1_id_decode(ctx, cls, con, tag))
-		return 0;
-
-	def = len = 0;
-	if (!asn1_length_decode(ctx, &def, &len))
-		return 0;
-
-	/* primitive shall be definite, indefinite shall be constructed */
-	if (*con == ASN1_PRI && !def)
-		return 0;
-
-	if (def)
-		*eoc = ctx->pointer + len;
-	else
-		*eoc = NULL;
-	return 1;
-}
-
-static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
-{
-	unsigned char ch;
-
-	if (eoc == NULL) {
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		if (ch != 0x00) {
-			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		if (ch != 0x00) {
-			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
-			return 0;
-		}
-		return 1;
-	} else {
-		if (ctx->pointer != eoc) {
-			ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
-			return 0;
-		}
-		return 1;
-	}
-}
-
-static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
-{
-	ctx->pointer = eoc;
-	return 1;
-}
-
-static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
-				      unsigned char *eoc,
-				      long *integer)
-{
-	unsigned char ch;
-	unsigned int  len;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*integer = (signed char) ch;
-	len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof (long)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
-}
-
-static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
-				      unsigned char *eoc,
-				      unsigned int *integer)
-{
-	unsigned char ch;
-	unsigned int  len;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*integer = ch;
-	if (ch == 0) len = 0;
-	else len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof (unsigned int)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
-}
-
-static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
-				       unsigned char *eoc,
-				       unsigned long *integer)
-{
-	unsigned char ch;
-	unsigned int  len;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*integer = ch;
-	if (ch == 0) len = 0;
-	else len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof (unsigned long)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
-}
-
-static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
-					unsigned char *eoc,
-					unsigned char **octets,
-					unsigned int *len)
-{
-	unsigned char *ptr;
-
-	*len = 0;
-
-	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
-	if (*octets == NULL)
-		return 0;
-
-	ptr = *octets;
-	while (ctx->pointer < eoc) {
-		if (!asn1_octet_decode(ctx, ptr++)) {
-			kfree(*octets);
-			*octets = NULL;
-			return 0;
-		}
-		(*len)++;
-	}
-	return 1;
-}
-
-static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
-				       unsigned long *subid)
-{
-	unsigned char ch;
-
-	*subid = 0;
-
-	do {
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*subid <<= 7;
-		*subid |= ch & 0x7F;
-	} while ((ch & 0x80) == 0x80);
-	return 1;
-}
-
-static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
-				     unsigned char *eoc,
-				     unsigned long **oid,
-				     unsigned int *len)
-{
-	unsigned long subid;
-	unsigned long *optr;
-	size_t size;
-
-	size = eoc - ctx->pointer + 1;
-
-	/* first subid actually encodes first two subids */
-	if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
-		return 0;
-
-	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
-	if (*oid == NULL)
-		return 0;
-
-	optr = *oid;
-
-	if (!asn1_subid_decode(ctx, &subid)) {
-		kfree(*oid);
-		*oid = NULL;
-		return 0;
-	}
-
-	if (subid < 40) {
-		optr[0] = 0;
-		optr[1] = subid;
-	} else if (subid < 80) {
-		optr[0] = 1;
-		optr[1] = subid - 40;
-	} else {
-		optr[0] = 2;
-		optr[1] = subid - 80;
-	}
-
-	*len = 2;
-	optr += 2;
-
-	while (ctx->pointer < eoc) {
-		if (++(*len) > size) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			kfree(*oid);
-			*oid = NULL;
-			return 0;
-		}
-
-		if (!asn1_subid_decode(ctx, optr++)) {
-			kfree(*oid);
-			*oid = NULL;
-			return 0;
-		}
-	}
-	return 1;
-}
-
-/* SNMP Versions */
-#define SNMP_V1				0
-#define SNMP_V2C			1
-#define SNMP_V2				2
-#define SNMP_V3				3
-
-/* Default Sizes */
-#define SNMP_SIZE_COMM			256
-#define SNMP_SIZE_OBJECTID		128
-#define SNMP_SIZE_BUFCHR		256
-#define SNMP_SIZE_BUFINT		128
-#define SNMP_SIZE_SMALLOBJECTID		16
-
-/* Requests */
-#define SNMP_PDU_GET			0
-#define SNMP_PDU_NEXT			1
-#define SNMP_PDU_RESPONSE		2
-#define SNMP_PDU_SET			3
-#define SNMP_PDU_TRAP1			4
-#define SNMP_PDU_BULK			5
-#define SNMP_PDU_INFORM			6
-#define SNMP_PDU_TRAP2			7
-
-/* Errors */
-#define SNMP_NOERROR			0
-#define SNMP_TOOBIG			1
-#define SNMP_NOSUCHNAME			2
-#define SNMP_BADVALUE			3
-#define SNMP_READONLY			4
-#define SNMP_GENERROR			5
-#define SNMP_NOACCESS			6
-#define SNMP_WRONGTYPE			7
-#define SNMP_WRONGLENGTH		8
-#define SNMP_WRONGENCODING		9
-#define SNMP_WRONGVALUE			10
-#define SNMP_NOCREATION			11
-#define SNMP_INCONSISTENTVALUE		12
-#define SNMP_RESOURCEUNAVAILABLE	13
-#define SNMP_COMMITFAILED		14
-#define SNMP_UNDOFAILED			15
-#define SNMP_AUTHORIZATIONERROR		16
-#define SNMP_NOTWRITABLE		17
-#define SNMP_INCONSISTENTNAME		18
-
-/* General SNMP V1 Traps */
-#define SNMP_TRAP_COLDSTART		0
-#define SNMP_TRAP_WARMSTART		1
-#define SNMP_TRAP_LINKDOWN		2
-#define SNMP_TRAP_LINKUP		3
-#define SNMP_TRAP_AUTFAILURE		4
-#define SNMP_TRAP_EQPNEIGHBORLOSS	5
-#define SNMP_TRAP_ENTSPECIFIC		6
-
-/* SNMPv1 Types */
-#define SNMP_NULL                0
-#define SNMP_INTEGER             1    /* l  */
-#define SNMP_OCTETSTR            2    /* c  */
-#define SNMP_DISPLAYSTR          2    /* c  */
-#define SNMP_OBJECTID            3    /* ul */
-#define SNMP_IPADDR              4    /* uc */
-#define SNMP_COUNTER             5    /* ul */
-#define SNMP_GAUGE               6    /* ul */
-#define SNMP_TIMETICKS           7    /* ul */
-#define SNMP_OPAQUE              8    /* c  */
-
-/* Additional SNMPv2 Types */
-#define SNMP_UINTEGER            5    /* ul */
-#define SNMP_BITSTR              9    /* uc */
-#define SNMP_NSAP               10    /* uc */
-#define SNMP_COUNTER64          11    /* ul */
-#define SNMP_NOSUCHOBJECT       12
-#define SNMP_NOSUCHINSTANCE     13
-#define SNMP_ENDOFMIBVIEW       14
-
-union snmp_syntax
-{
-	unsigned char uc[0];	/* 8 bit unsigned */
-	char c[0];		/* 8 bit signed */
-	unsigned long ul[0];	/* 32 bit unsigned */
-	long l[0];		/* 32 bit signed */
-};
-
-struct snmp_object
-{
-	unsigned long *id;
-	unsigned int id_len;
-	unsigned short type;
-	unsigned int syntax_len;
-	union snmp_syntax syntax;
-};
-
-struct snmp_request
-{
-	unsigned long id;
-	unsigned int error_status;
-	unsigned int error_index;
-};
-
-struct snmp_v1_trap
-{
-	unsigned long *id;
-	unsigned int id_len;
-	unsigned long ip_address;	/* pointer  */
-	unsigned int general;
-	unsigned int specific;
-	unsigned long time;
-};
-
-/* SNMP types */
-#define SNMP_IPA    0
-#define SNMP_CNT    1
-#define SNMP_GGE    2
-#define SNMP_TIT    3
-#define SNMP_OPQ    4
-#define SNMP_C64    6
-
-/* SNMP errors */
-#define SERR_NSO    0
-#define SERR_NSI    1
-#define SERR_EOM    2
-
-static inline void mangle_address(unsigned char *begin,
-				  unsigned char *addr,
-				  const struct oct1_map *map,
-				  __sum16 *check);
-struct snmp_cnv
-{
-	unsigned int class;
-	unsigned int tag;
-	int syntax;
-};
-
-static const struct snmp_cnv snmp_conv[] = {
-	{ASN1_UNI, ASN1_NUL, SNMP_NULL},
-	{ASN1_UNI, ASN1_INT, SNMP_INTEGER},
-	{ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
-	{ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
-	{ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
-	{ASN1_APL, SNMP_IPA, SNMP_IPADDR},
-	{ASN1_APL, SNMP_CNT, SNMP_COUNTER},	/* Counter32 */
-	{ASN1_APL, SNMP_GGE, SNMP_GAUGE},	/* Gauge32 == Unsigned32  */
-	{ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
-	{ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
-
-	/* SNMPv2 data types and errors */
-	{ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
-	{ASN1_APL, SNMP_C64, SNMP_COUNTER64},
-	{ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
-	{ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
-	{ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
-	{0,       0,       -1}
-};
-
-static unsigned char snmp_tag_cls2syntax(unsigned int tag,
-					 unsigned int cls,
-					 unsigned short *syntax)
-{
-	const struct snmp_cnv *cnv;
-
-	cnv = snmp_conv;
-
-	while (cnv->syntax != -1) {
-		if (cnv->tag == tag && cnv->class == cls) {
-			*syntax = cnv->syntax;
-			return 1;
-		}
-		cnv++;
-	}
-	return 0;
-}
-
-static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
-					struct snmp_object **obj)
-{
-	unsigned int cls, con, tag, len, idlen;
-	unsigned short type;
-	unsigned char *eoc, *end, *p;
-	unsigned long *lp, *id;
-	unsigned long ul;
-	long l;
-
-	*obj = NULL;
-	id = NULL;
-
-	if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
-		return 0;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
-		return 0;
-
-	if (!asn1_oid_decode(ctx, end, &id, &idlen))
-		return 0;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
-		kfree(id);
-		return 0;
-	}
-
-	if (con != ASN1_PRI) {
-		kfree(id);
-		return 0;
-	}
-
-	type = 0;
-	if (!snmp_tag_cls2syntax(tag, cls, &type)) {
-		kfree(id);
-		return 0;
-	}
-
-	l = 0;
-	switch (type) {
-	case SNMP_INTEGER:
-		len = sizeof(long);
-		if (!asn1_long_decode(ctx, end, &l)) {
-			kfree(id);
-			return 0;
-		}
-		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(id);
-			return 0;
-		}
-		(*obj)->syntax.l[0] = l;
-		break;
-	case SNMP_OCTETSTR:
-	case SNMP_OPAQUE:
-		if (!asn1_octets_decode(ctx, end, &p, &len)) {
-			kfree(id);
-			return 0;
-		}
-		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(p);
-			kfree(id);
-			return 0;
-		}
-		memcpy((*obj)->syntax.c, p, len);
-		kfree(p);
-		break;
-	case SNMP_NULL:
-	case SNMP_NOSUCHOBJECT:
-	case SNMP_NOSUCHINSTANCE:
-	case SNMP_ENDOFMIBVIEW:
-		len = 0;
-		*obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(id);
-			return 0;
-		}
-		if (!asn1_null_decode(ctx, end)) {
-			kfree(id);
-			kfree(*obj);
-			*obj = NULL;
-			return 0;
-		}
-		break;
-	case SNMP_OBJECTID:
-		if (!asn1_oid_decode(ctx, end, &lp, &len)) {
-			kfree(id);
-			return 0;
-		}
-		len *= sizeof(unsigned long);
-		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(lp);
-			kfree(id);
-			return 0;
-		}
-		memcpy((*obj)->syntax.ul, lp, len);
-		kfree(lp);
-		break;
-	case SNMP_IPADDR:
-		if (!asn1_octets_decode(ctx, end, &p, &len)) {
-			kfree(id);
-			return 0;
-		}
-		if (len != 4) {
-			kfree(p);
-			kfree(id);
-			return 0;
-		}
-		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(p);
-			kfree(id);
-			return 0;
-		}
-		memcpy((*obj)->syntax.uc, p, len);
-		kfree(p);
-		break;
-	case SNMP_COUNTER:
-	case SNMP_GAUGE:
-	case SNMP_TIMETICKS:
-		len = sizeof(unsigned long);
-		if (!asn1_ulong_decode(ctx, end, &ul)) {
-			kfree(id);
-			return 0;
-		}
-		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
-		if (*obj == NULL) {
-			kfree(id);
-			return 0;
-		}
-		(*obj)->syntax.ul[0] = ul;
-		break;
-	default:
-		kfree(id);
-		return 0;
-	}
-
-	(*obj)->syntax_len = len;
-	(*obj)->type = type;
-	(*obj)->id = id;
-	(*obj)->id_len = idlen;
-
-	if (!asn1_eoc_decode(ctx, eoc)) {
-		kfree(id);
-		kfree(*obj);
-		*obj = NULL;
-		return 0;
-	}
-	return 1;
-}
-
-static unsigned char noinline_for_stack
-snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request)
-{
-	unsigned int cls, con, tag;
-	unsigned char *end;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		return 0;
-
-	if (!asn1_ulong_decode(ctx, end, &request->id))
-		return 0;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		return 0;
-
-	if (!asn1_uint_decode(ctx, end, &request->error_status))
-		return 0;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		return 0;
-
-	if (!asn1_uint_decode(ctx, end, &request->error_index))
-		return 0;
-
-	return 1;
-}
-
-/*
- * Fast checksum update for possibly oddly-aligned UDP byte, from the
- * code example in the draft.
- */
-static void fast_csum(__sum16 *csum,
-		      const unsigned char *optr,
-		      const unsigned char *nptr,
-		      int offset)
-{
-	unsigned char s[4];
-
-	if (offset & 1) {
-		s[0] = ~0;
-		s[1] = ~*optr;
-		s[2] = 0;
-		s[3] = *nptr;
-	} else {
-		s[0] = ~*optr;
-		s[1] = ~0;
-		s[2] = *nptr;
-		s[3] = 0;
-	}
-
-	*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
-}
-
-/*
- * Mangle IP address.
- * 	- begin points to the start of the snmp messgae
- *      - addr points to the start of the address
- */
-static inline void mangle_address(unsigned char *begin,
-				  unsigned char *addr,
-				  const struct oct1_map *map,
-				  __sum16 *check)
-{
-	if (map->from == NOCT1(addr)) {
-		*addr = map->to;
-
-		/* Update UDP checksum if being used */
-		if (*check) {
-			fast_csum(check,
-				  &map->from, &map->to, addr - begin);
-		}
-	}
-}
-
-static unsigned char noinline_for_stack
-snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap,
-		 const struct oct1_map *map,
-		 __sum16 *check)
-{
-	unsigned int cls, con, tag, len;
-	unsigned char *end;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
-		return 0;
-
-	if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
-		return 0;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		goto err_id_free;
-
-	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
-	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
-		goto err_id_free;
-
-	if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
-		goto err_id_free;
-
-	/* IPv4 only */
-	if (len != 4)
-		goto err_addr_free;
-
-	mangle_address(ctx->begin, ctx->pointer - 4, map, check);
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		goto err_addr_free;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		goto err_addr_free;
-
-	if (!asn1_uint_decode(ctx, end, &trap->general))
-		goto err_addr_free;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		goto err_addr_free;
-
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		goto err_addr_free;
-
-	if (!asn1_uint_decode(ctx, end, &trap->specific))
-		goto err_addr_free;
-
-	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
-		goto err_addr_free;
-
-	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
-	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
-		goto err_addr_free;
-
-	if (!asn1_ulong_decode(ctx, end, &trap->time))
-		goto err_addr_free;
-
-	return 1;
-
-err_addr_free:
-	kfree((unsigned long *)trap->ip_address);
-
-err_id_free:
-	kfree(trap->id);
-
-	return 0;
-}
-
-/*
- * Parse and mangle SNMP message according to mapping.
- * (And this is the fucking 'basic' method).
- */
-static int snmp_parse_mangle(unsigned char *msg,
-			     u_int16_t len,
-			     const struct oct1_map *map,
-			     __sum16 *check)
-{
-	unsigned char *eoc, *end;
-	unsigned int cls, con, tag, vers, pdutype;
-	struct asn1_ctx ctx;
-	struct asn1_octstr comm;
-	struct snmp_object *obj;
-
-	asn1_open(&ctx, msg, len);
-
-	/*
-	 * Start of SNMP message.
-	 */
-	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
-		return 0;
-	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
-		return 0;
-
-	/*
-	 * Version 1 or 2 handled.
-	 */
-	if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
-		return 0;
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
-		return 0;
-	if (!asn1_uint_decode (&ctx, end, &vers))
-		return 0;
-	if (vers > 1)
-		return 1;
-
-	/*
-	 * Community.
-	 */
-	if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
-		return 0;
-	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
-		return 0;
-	if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
-		return 0;
-	kfree(comm.data);
-
-	/*
-	 * PDU type
-	 */
-	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
-		return 0;
-	if (cls != ASN1_CTX || con != ASN1_CON)
-		return 0;
-	if (pdutype != SNMP_PDU_RESPONSE &&
-	    pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
-		return 1;
-
-	/*
-	 * Request header or v1 trap
-	 */
-	if (pdutype == SNMP_PDU_TRAP1) {
-		struct snmp_v1_trap trap;
-		unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
-
-		if (ret) {
-			kfree(trap.id);
-			kfree((unsigned long *)trap.ip_address);
-		} else
-			return ret;
-
-	} else {
-		struct snmp_request req;
-
-		if (!snmp_request_decode(&ctx, &req))
-			return 0;
-	}
-
-	/*
-	 * Loop through objects, look for IP addresses to mangle.
-	 */
-	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
-		return 0;
-
-	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
-		return 0;
-
-	while (!asn1_eoc_decode(&ctx, eoc)) {
-		if (!snmp_object_decode(&ctx, &obj)) {
-			if (obj) {
-				kfree(obj->id);
-				kfree(obj);
-			}
-			return 0;
-		}
-
-		if (obj->type == SNMP_IPADDR)
-			mangle_address(ctx.begin, ctx.pointer - 4, map, check);
-
-		kfree(obj->id);
-		kfree(obj);
-	}
-
-	if (!asn1_eoc_decode(&ctx, eoc))
-		return 0;
-
-	return 1;
-}
-
-/*
- * SNMP translation routine.
- */
-static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
-{
-	struct iphdr *iph = ip_hdr(skb);
-	struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
-	u_int16_t udplen = ntohs(udph->len);
-	u_int16_t paylen = udplen - sizeof(struct udphdr);
-	struct oct1_map map;
-
-	/*
-	 * Determine mappping for application layer addresses based
-	 * on NAT manipulations for the packet.
-	 */
-	if (dir == IP_CT_DIR_ORIGINAL) {
-		/* SNAT traps */
-		map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
-		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
-	} else {
-		/* DNAT replies */
-		map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip);
-		map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);
-	}
-
-	if (map.from == map.to)
-		return NF_ACCEPT;
-
-	if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
-			       paylen, &map, &udph->check)) {
-		nf_ct_helper_log(skb, ct, "parser failed\n");
-		return NF_DROP;
-	}
-	return NF_ACCEPT;
-}
-
-/* We don't actually set up expectations, just adjust internal IP
- * addresses if this is being NATted */
-static int help(struct sk_buff *skb, unsigned int protoff,
-		struct nf_conn *ct,
-		enum ip_conntrack_info ctinfo)
-{
-	int dir = CTINFO2DIR(ctinfo);
-	unsigned int ret;
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
-
-	/* SNMP replies and originating SNMP traps get mangled */
-	if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
-		return NF_ACCEPT;
-	if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
-		return NF_ACCEPT;
-
-	/* No NAT? */
-	if (!(ct->status & IPS_NAT_MASK))
-		return NF_ACCEPT;
-
-	/*
-	 * Make sure the packet length is ok.  So far, we were only guaranteed
-	 * to have a valid length IP header plus 8 bytes, which means we have
-	 * enough room for a UDP header.  Just verify the UDP length field so we
-	 * can mess around with the payload.
-	 */
-	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
-		nf_ct_helper_log(skb, ct, "dropping malformed packet\n");
-		return NF_DROP;
-	}
-
-	if (!skb_make_writable(skb, skb->len)) {
-		nf_ct_helper_log(skb, ct, "cannot mangle packet");
-		return NF_DROP;
-	}
-
-	spin_lock_bh(&snmp_lock);
-	ret = snmp_translate(ct, dir, skb);
-	spin_unlock_bh(&snmp_lock);
-	return ret;
-}
-
-static const struct nf_conntrack_expect_policy snmp_exp_policy = {
-	.max_expected	= 0,
-	.timeout	= 180,
-};
-
-static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
-	.me			= THIS_MODULE,
-	.help			= help,
-	.expect_policy		= &snmp_exp_policy,
-	.name			= "snmp_trap",
-	.tuple.src.l3num	= AF_INET,
-	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_TRAP_PORT),
-	.tuple.dst.protonum	= IPPROTO_UDP,
-};
-
-static int __init nf_nat_snmp_basic_init(void)
-{
-	BUG_ON(nf_nat_snmp_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_snmp_hook, help);
-
-	return nf_conntrack_helper_register(&snmp_trap_helper);
-}
-
-static void __exit nf_nat_snmp_basic_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
-	synchronize_rcu();
-	nf_conntrack_helper_unregister(&snmp_trap_helper);
-}
-
-module_init(nf_nat_snmp_basic_init);
-module_exit(nf_nat_snmp_basic_fini);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
new file mode 100644
index 000000000000..b6e277093e7e
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -0,0 +1,235 @@
+/*
+ * nf_nat_snmp_basic_main.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified.  The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+#include "nf_nat_snmp_basic-asn1.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+
+static DEFINE_SPINLOCK(snmp_lock);
+
+struct snmp_ctx {
+	unsigned char *begin;	/* start of the SNMP payload */
+	__sum16 *check;		/* UDP checksum field of the packet */
+	__be32 from;		/* address to be replaced */
+	__be32 to;		/* replacement address */
+};
+
+static void fast_csum(struct snmp_ctx *ctx, unsigned char offset)
+{
+	unsigned char s[12] = {0,};	/* inverted ctx->from, then ctx->to */
+	int size;			/* bytes of s[] fed to csum_partial() */
+
+	if (offset & 1) {		/* address starts at an odd offset */
+		memcpy(&s[1], &ctx->from, 4);
+		memcpy(&s[7], &ctx->to, 4);
+		s[0] = ~0;
+		s[1] = ~s[1];
+		s[2] = ~s[2];
+		s[3] = ~s[3];
+		s[4] = ~s[4];
+		s[5] = ~0;
+		size = 12;
+	} else {			/* even offset: no one-byte shift */
+		memcpy(&s[0], &ctx->from, 4);
+		memcpy(&s[4], &ctx->to, 4);
+		s[0] = ~s[0];
+		s[1] = ~s[1];
+		s[2] = ~s[2];
+		s[3] = ~s[3];
+		size = 8;
+	}
+	*ctx->check = csum_fold(csum_partial(s, size,
+					     ~csum_unfold(*ctx->check)));
+}
+
+int snmp_version(void *context, size_t hdrlen, unsigned char tag,
+		 const void *data, size_t datalen)
+{
+	if (datalen != 1)	/* never read an empty or oversized value */
+		return -EINVAL;
+	return *(const unsigned char *)data > 1 ? -ENOTSUPP : 1; /* v1/v2c */
+}
+
+int snmp_helper(void *context, size_t hdrlen, unsigned char tag,
+		const void *data, size_t datalen)
+{
+	struct snmp_ctx *ctx = (struct snmp_ctx *)context;
+	__be32 *pdata = (__be32 *)data;
+
+	if (datalen != 4)	/* IPv4 only; never touch a short value */
+		return -EINVAL;
+	if (*pdata == ctx->from) {
+		pr_debug("%s: %pI4 to %pI4\n", __func__,
+			 (void *)&ctx->from, (void *)&ctx->to);
+		if (*ctx->check)	/* checksum in use: update it first */
+			fast_csum(ctx, (unsigned char *)data - ctx->begin);
+		*pdata = ctx->to;
+	}
+	return 1;
+}
+
+static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+	u16 datalen = ntohs(udph->len) - sizeof(struct udphdr);
+	unsigned char *data = (unsigned char *)udph + sizeof(struct udphdr);
+	struct snmp_ctx ctx;
+	int ret;
+
+	if (dir == IP_CT_DIR_ORIGINAL) {	/* SNAT traps */
+		ctx.from = ct->tuplehash[dir].tuple.src.u3.ip;
+		ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip;
+	} else {				/* DNAT replies */
+		ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip;
+		ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip;
+	}
+
+	if (ctx.from == ctx.to)			/* nothing to rewrite */
+		return NF_ACCEPT;
+
+	ctx.begin = data;	/* payload start; offsets are relative to it */
+	ctx.check = &udph->check;
+	ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen);
+	if (ret < 0) {
+		nf_ct_helper_log(skb, ct, "parser failed\n");
+		return NF_DROP;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Conntrack helper hook.  We don't actually set up expectations, just adjust
+ * internal IP addresses if this is being NATted.  Returns NF_ACCEPT or NF_DROP.
+ */
+static int help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int ret;
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+	/* SNMP replies and originating SNMP traps get mangled */
+	if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+	if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* Connection is not NATted: there is nothing to rewrite */
+	if (!(ct->status & IPS_NAT_MASK))
+		return NF_ACCEPT;
+
+	/* Make sure the packet length is ok.  So far, we were only guaranteed
+	 * to have a valid length IP header plus 8 bytes, which means we have
+	 * enough room for a UDP header.  Just verify the UDP length field so we
+	 * can mess around with the payload.
+	 */
+	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+		nf_ct_helper_log(skb, ct, "dropping malformed packet\n");
+		return NF_DROP;
+	}
+
+	if (!skb_make_writable(skb, skb->len)) {
+		nf_ct_helper_log(skb, ct, "cannot mangle packet");
+		return NF_DROP;
+	}
+
+	spin_lock_bh(&snmp_lock);	/* one global lock around the rewrite */
+	ret = snmp_translate(ct, dir, skb);
+	spin_unlock_bh(&snmp_lock);
+	return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+	.max_expected	= 0,	/* this helper sets up no expectations */
+	.timeout	= 180,
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+	.me			= THIS_MODULE,
+	.help			= help,
+	.expect_policy		= &snmp_exp_policy,
+	.name			= "snmp_trap",
+	.tuple.src.l3num	= AF_INET,	/* IPv4 only */
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_TRAP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,	/* traps: UDP port 162 */
+};
+
+static int __init nf_nat_snmp_basic_init(void)
+{
+	BUG_ON(nf_nat_snmp_hook != NULL);	/* hook must still be unclaimed */
+	RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+
+	return nf_conntrack_helper_register(&snmp_trap_helper);
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+	RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+	synchronize_rcu();	/* wait out RCU readers before unregistering */
+	nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
-- 
cgit v1.2.3


From 3ecbfd65f50e5ff9c538c1bfa3356ef52cc66586 Mon Sep 17 00:00:00 2001
From: Harsha Sharma <harshasharmaiitr@gmail.com>
Date: Wed, 27 Dec 2017 00:59:00 +0530
Subject: netfilter: nf_tables: allocate handle and delete objects via handle

This patch allows deletion of objects via unique handle which can be
listed via '-a' option.

Signed-off-by: Harsha Sharma <harshasharmaiitr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  11 ++-
 include/uapi/linux/netfilter/nf_tables.h |  10 +++
 net/netfilter/nf_tables_api.c            | 146 ++++++++++++++++++++++++++++---
 3 files changed, 153 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 4aca413367ee..663b015dace5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -374,6 +374,7 @@ void nft_unregister_set(struct nft_set_type *type);
  *	@list: table set list node
  *	@bindings: list of set bindings
  * 	@name: name of the set
+ *	@handle: unique handle of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
  * 	@dtype: data type (verdict or numeric type defined by userspace)
  * 	@objtype: object type (see NFT_OBJECT_* definitions)
@@ -396,6 +397,7 @@ struct nft_set {
 	struct list_head		list;
 	struct list_head		bindings;
 	char				*name;
+	u64				handle;
 	u32				ktype;
 	u32				dtype;
 	u32				objtype;
@@ -946,6 +948,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@objects: stateful objects in the table
  *	@flowtables: flow tables in the table
  *	@hgenerator: handle generator state
+ *	@handle: table handle
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
  *	@genmask: generation mask
@@ -959,6 +962,7 @@ struct nft_table {
 	struct list_head		objects;
 	struct list_head		flowtables;
 	u64				hgenerator;
+	u64				handle;
 	u32				use;
 	u16				family:6,
 					flags:8,
@@ -983,9 +987,9 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  *	@name: name of this stateful object
  *	@genmask: generation mask
  *	@use: number of references to this stateful object
- * 	@data: object data, layout depends on type
+ *	@handle: unique object handle
  *	@ops: object operations
- *	@data: pointer to object data
+ * 	@data: object data, layout depends on type
  */
 struct nft_object {
 	struct list_head		list;
@@ -993,6 +997,7 @@ struct nft_object {
 	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
+	u64				handle;
 	/* runtime data below here */
 	const struct nft_object_ops	*ops ____cacheline_aligned;
 	unsigned char			data[]
@@ -1074,6 +1079,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type);
  *	@ops_len: number of hooks in array
  *	@genmask: generation mask
  *	@use: number of references to this flow table
+ * 	@handle: unique object handle
  *	@data: rhashtable and garbage collector
  * 	@ops: array of hooks
  */
@@ -1086,6 +1092,7 @@ struct nft_flowtable {
 	int				ops_len;
 	u32				genmask:2,
 					use:30;
+	u64				handle;
 	/* runtime data below here */
 	struct nf_hook_ops		*ops ____cacheline_aligned;
 	struct nf_flowtable		data;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 53e8dd2a3a03..66dceee0ae30 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -174,6 +174,8 @@ enum nft_table_attributes {
 	NFTA_TABLE_NAME,
 	NFTA_TABLE_FLAGS,
 	NFTA_TABLE_USE,
+	NFTA_TABLE_HANDLE,
+	NFTA_TABLE_PAD,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
@@ -317,6 +319,7 @@ enum nft_set_desc_attributes {
  * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
  * @NFTA_SET_USERDATA: user data (NLA_BINARY)
  * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
+ * @NFTA_SET_HANDLE: set handle (NLA_U64)
  */
 enum nft_set_attributes {
 	NFTA_SET_UNSPEC,
@@ -335,6 +338,7 @@ enum nft_set_attributes {
 	NFTA_SET_USERDATA,
 	NFTA_SET_PAD,
 	NFTA_SET_OBJ_TYPE,
+	NFTA_SET_HANDLE,
 	__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
@@ -1314,6 +1318,7 @@ enum nft_ct_helper_attributes {
  * @NFTA_OBJ_TYPE: stateful object type (NLA_U32)
  * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED)
  * @NFTA_OBJ_USE: number of references to this expression (NLA_U32)
+ * @NFTA_OBJ_HANDLE: object handle (NLA_U64)
  */
 enum nft_object_attributes {
 	NFTA_OBJ_UNSPEC,
@@ -1322,6 +1327,8 @@ enum nft_object_attributes {
 	NFTA_OBJ_TYPE,
 	NFTA_OBJ_DATA,
 	NFTA_OBJ_USE,
+	NFTA_OBJ_HANDLE,
+	NFTA_OBJ_PAD,
 	__NFTA_OBJ_MAX
 };
 #define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
@@ -1333,6 +1340,7 @@ enum nft_object_attributes {
  * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING)
  * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
  * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
+ * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
  */
 enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_UNSPEC,
@@ -1340,6 +1348,8 @@ enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_NAME,
 	NFTA_FLOWTABLE_HOOK,
 	NFTA_FLOWTABLE_USE,
+	NFTA_FLOWTABLE_HANDLE,
+	NFTA_FLOWTABLE_PAD,
 	__NFTA_FLOWTABLE_MAX
 };
 #define NFTA_FLOWTABLE_MAX	(__NFTA_FLOWTABLE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b541e5094dce..1addc401ff7d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,6 +26,7 @@
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
+static u64 table_handle;
 
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
@@ -332,6 +333,20 @@ static struct nft_table *nft_table_lookup(const struct net *net,
 	return NULL;
 }
 
+static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
+						   const struct nlattr *nla,
+						   u8 genmask)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &net->nft.tables, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
+		    nft_active_genmask(table, genmask))
+			return table;
+	}
+	return NULL;
+}
+
 static struct nft_table *nf_tables_table_lookup(const struct net *net,
 						const struct nlattr *nla,
 						u8 family, u8 genmask)
@@ -348,6 +363,22 @@ static struct nft_table *nf_tables_table_lookup(const struct net *net,
 	return ERR_PTR(-ENOENT);
 }
 
+static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
+							 const struct nlattr *nla,
+							 u8 genmask)
+{
+	struct nft_table *table;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	table = nft_table_lookup_byhandle(net, nla, genmask);
+	if (table != NULL)
+		return table;
+
+	return ERR_PTR(-ENOENT);
+}
+
 static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 {
 	return ++table->hgenerator;
@@ -394,6 +425,7 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_TABLE_MAXNAMELEN - 1 },
 	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_TABLE_HANDLE]	= { .type = NLA_U64 },
 };
 
 static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -415,7 +447,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 
 	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
 	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
-	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
+	    nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
+			 NFTA_TABLE_PAD))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -674,6 +708,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&table->flowtables);
 	table->family = family;
 	table->flags = flags;
+	table->handle = ++table_handle;
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
@@ -791,11 +826,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 
 	nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
-	if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
+	if (family == AF_UNSPEC ||
+	    (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
 		return nft_flush(&ctx, family);
 
-	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
-				       genmask);
+	if (nla[NFTA_TABLE_HANDLE])
+		table = nf_tables_table_lookup_byhandle(net,
+							nla[NFTA_TABLE_HANDLE],
+							genmask);
+	else
+		table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
+					       family, genmask);
+
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1539,6 +1581,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	struct nft_rule *rule;
 	int family = nfmsg->nfgen_family;
 	struct nft_ctx ctx;
+	u64 handle;
 	u32 use;
 	int err;
 
@@ -1547,7 +1590,12 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	if (nla[NFTA_CHAIN_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+	} else {
+		chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	}
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
@@ -2503,6 +2551,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
 	[NFTA_SET_USERDATA]		= { .type = NLA_BINARY,
 					    .len  = NFT_USERDATA_MAXLEN },
 	[NFTA_SET_OBJ_TYPE]		= { .type = NLA_U32 },
+	[NFTA_SET_HANDLE]		= { .type = NLA_U64 },
 };
 
 static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2546,6 +2595,22 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
 	return ERR_PTR(-ENOENT);
 }
 
+static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
+						     const struct nlattr *nla, u8 genmask)
+{
+	struct nft_set *set;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(set, &table->sets, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
+		    nft_active_genmask(set, genmask))
+			return set;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 						 const struct nlattr *nla,
 						 u8 genmask)
@@ -2661,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_SET_NAME, set->name))
 		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
+			 NFTA_SET_PAD))
+		goto nla_put_failure;
 	if (set->flags != 0)
 		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
 			goto nla_put_failure;
@@ -3069,6 +3137,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	set->udata  = udata;
 	set->timeout = timeout;
 	set->gc_int = gc_int;
+	set->handle = nf_tables_alloc_handle(table);
 
 	err = ops->init(set, &desc, nla);
 	if (err < 0)
@@ -3126,7 +3195,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+	if (nla[NFTA_SET_HANDLE])
+		set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
+	else
+		set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
@@ -4256,6 +4328,21 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
 
+struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u32 objtype, u8 genmask)
+{
+	struct nft_object *obj;
+
+	list_for_each_entry(obj, &table->objects, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == obj->handle &&
+		    objtype == obj->ops->type->type &&
+		    nft_active_genmask(obj, genmask))
+			return obj;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
 	[NFTA_OBJ_TABLE]	= { .type = NLA_STRING,
 				    .len = NFT_TABLE_MAXNAMELEN - 1 },
@@ -4263,6 +4350,7 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
 				    .len = NFT_OBJ_MAXNAMELEN - 1 },
 	[NFTA_OBJ_TYPE]		= { .type = NLA_U32 },
 	[NFTA_OBJ_DATA]		= { .type = NLA_NESTED },
+	[NFTA_OBJ_HANDLE]	= { .type = NLA_U64},
 };
 
 static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -4410,6 +4498,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 	obj->table = table;
+	obj->handle = nf_tables_alloc_handle(table);
+
 	obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
 	if (!obj->name) {
 		err = -ENOMEM;
@@ -4456,7 +4546,9 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
 	    nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
 	    nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
 	    nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
-	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
+	    nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
+			 NFTA_OBJ_PAD))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -4654,7 +4746,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	u32 objtype;
 
 	if (!nla[NFTA_OBJ_TYPE] ||
-	    !nla[NFTA_OBJ_NAME])
+	    (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
 		return -EINVAL;
 
 	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
@@ -4663,7 +4755,12 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (nla[NFTA_OBJ_HANDLE])
+		obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
+						    objtype, genmask);
+	else
+		obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
+					   objtype, genmask);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 	if (obj->use > 0)
@@ -4735,6 +4832,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
 	[NFTA_FLOWTABLE_NAME]		= { .type = NLA_STRING,
 					    .len = NFT_NAME_MAXLEN - 1 },
 	[NFTA_FLOWTABLE_HOOK]		= { .type = NLA_NESTED },
+	[NFTA_FLOWTABLE_HANDLE]		= { .type = NLA_U64 },
 };
 
 struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
@@ -4752,6 +4850,20 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
 
+struct nft_flowtable *
+nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
+				    const struct nlattr *nla, u8 genmask)
+{
+       struct nft_flowtable *flowtable;
+
+       list_for_each_entry(flowtable, &table->flowtables, list) {
+               if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle &&
+                   nft_active_genmask(flowtable, genmask))
+                       return flowtable;
+       }
+       return ERR_PTR(-ENOENT);
+}
+
 #define NFT_FLOWTABLE_DEVICE_MAX	8
 
 static int nf_tables_parse_devices(const struct nft_ctx *ctx,
@@ -4960,6 +5072,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	flowtable->table = table;
+	flowtable->handle = nf_tables_alloc_handle(table);
+
 	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
 	if (!flowtable->name) {
 		err = -ENOMEM;
@@ -5034,8 +5148,14 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
-					       genmask);
+	if (nla[NFTA_FLOWTABLE_HANDLE])
+		flowtable = nf_tables_flowtable_lookup_byhandle(table,
+								nla[NFTA_FLOWTABLE_HANDLE],
+								genmask);
+	else
+		flowtable = nf_tables_flowtable_lookup(table,
+						       nla[NFTA_FLOWTABLE_NAME],
+						       genmask);
 	if (IS_ERR(flowtable))
                 return PTR_ERR(flowtable);
 	if (flowtable->use > 0)
@@ -5068,7 +5188,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 
 	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
 	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
-	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)))
+	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
+	    nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
+			 NFTA_FLOWTABLE_PAD))
 		goto nla_put_failure;
 
 	nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
-- 
cgit v1.2.3


From d384e65f1e752f2b52a8ef300aeb86b1d7a342a9 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Thu, 18 Jan 2018 17:25:12 -0600
Subject: netfilter: return booleans instead of integers

Return statements in functions returning bool should use
true/false instead of 1/0.

These issues were detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 2 +-
 net/netfilter/xt_hashlimit.c | 2 +-
 net/netfilter/xt_ipcomp.c    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index a95518261168..6d65389e308f 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -71,7 +71,7 @@ static inline bool already_closed(const struct nf_conn *conn)
 		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
 		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
 	else
-		return 0;
+		return false;
 }
 
 static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5da8746f7b88..ec51d9a9512d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -353,7 +353,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 static bool select_all(const struct xt_hashlimit_htable *ht,
 		       const struct dsthash_ent *he)
 {
-	return 1;
+	return true;
 }
 
 static bool select_gc(const struct xt_hashlimit_htable *ht,
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
index 000e70377f85..7ca64a50db04 100644
--- a/net/netfilter/xt_ipcomp.c
+++ b/net/netfilter/xt_ipcomp.c
@@ -58,7 +58,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		 */
 		pr_debug("Dropping evil IPComp tinygram.\n");
 		par->hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return spi_match(compinfo->spis[0], compinfo->spis[1],
-- 
cgit v1.2.3


From b0c3dc65e1dec4833924bf4c7495a97c327cdaef Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@kernel.org>
Date: Tue, 16 Jan 2018 14:51:01 +0000
Subject: netfilter: nf_tables: Fix trailing semicolon

The extra trailing semicolon is an empty statement that performs no
operation. Remove it, since it does nothing.

Signed-off-by: Luis de Bethencourt <luisbg@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_dynset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index ec0fd78231d8..fc83e29d6634 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -164,7 +164,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	}
 
 	priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
-	err = nft_validate_register_load(priv->sreg_key, set->klen);;
+	err = nft_validate_register_load(priv->sreg_key, set->klen);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3


From 4c87158daeeff40e24f5c86a477761e5422867df Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jan 2018 00:45:32 +0300
Subject: netfilter: delete /proc THIS_MODULE references

/proc has been ignoring the struct file_operations::owner field for 10 years.
Specifically, this started with commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba
("Fix rmmod/read/write races in /proc entries"). Notice the chunk where
inode->i_fop is initialized with a proxy struct file_operations for
regular files:

	-               if (de->proc_fops)
	-                       inode->i_fop = de->proc_fops;
	+               if (de->proc_fops) {
	+                       if (S_ISREG(inode->i_mode))
	+                               inode->i_fop = &proc_reg_file_ops;
	+                       else
	+                               inode->i_fop = de->proc_fops;
	+               }

The VFS stopped pinning the module at this point.

# ipvs
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_app.c          | 1 -
 net/netfilter/ipvs/ip_vs_conn.c         | 2 --
 net/netfilter/ipvs/ip_vs_ctl.c          | 3 ---
 net/netfilter/nf_conntrack_expect.c     | 1 -
 net/netfilter/nf_conntrack_standalone.c | 2 --
 net/netfilter/nf_log.c                  | 1 -
 net/netfilter/nf_synproxy_core.c        | 1 -
 net/netfilter/nfnetlink_log.c           | 1 -
 net/netfilter/nfnetlink_queue.c         | 1 -
 net/netfilter/x_tables.c                | 3 ---
 net/netfilter/xt_hashlimit.c            | 3 ---
 11 files changed, 19 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 299edc6add5a..1c98c907bc63 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -595,7 +595,6 @@ static int ip_vs_app_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_app_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = ip_vs_app_open,
 	.read	 = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index f489b8db2406..370abbf6f421 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1143,7 +1143,6 @@ static int ip_vs_conn_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_conn_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ip_vs_conn_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1221,7 +1220,6 @@ static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_conn_sync_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ip_vs_conn_sync_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fff213eacf2a..5ebde4b15810 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2116,7 +2116,6 @@ static int ip_vs_info_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_info_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ip_vs_info_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -2161,7 +2160,6 @@ static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_stats_fops = {
-	.owner = THIS_MODULE,
 	.open = ip_vs_stats_seq_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -2230,7 +2228,6 @@ static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ip_vs_stats_percpu_fops = {
-	.owner = THIS_MODULE,
 	.open = ip_vs_stats_percpu_seq_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index d6748a8a79c5..8ef21d9f9a00 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -649,7 +649,6 @@ static int exp_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations exp_file_ops = {
-	.owner   = THIS_MODULE,
 	.open    = exp_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 46d32baad095..9123fdec5e14 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -382,7 +382,6 @@ static int ct_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ct_file_ops = {
-	.owner   = THIS_MODULE,
 	.open    = ct_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -475,7 +474,6 @@ static int ct_cpu_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ct_cpu_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open	 = ct_cpu_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 8bb152a7cca4..c2c1b16b7538 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -402,7 +402,6 @@ static int nflog_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations nflog_file_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = nflog_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 49bd8bb16b18..92139a087260 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -317,7 +317,6 @@ static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations synproxy_cpu_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= synproxy_cpu_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index e955bec0acc6..7b46aa4c478d 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1054,7 +1054,6 @@ static int nful_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations nful_file_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = nful_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 2db35f2d553d..8bba23160a68 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1477,7 +1477,6 @@ static int nfqnl_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations nfqnl_file_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = nfqnl_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 3c2548787d78..0b56bf05c169 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1362,7 +1362,6 @@ static int xt_table_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations xt_table_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = xt_table_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -1498,7 +1497,6 @@ static int xt_match_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations xt_match_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = xt_match_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
@@ -1551,7 +1549,6 @@ static int xt_target_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations xt_target_ops = {
-	.owner	 = THIS_MODULE,
 	.open	 = xt_target_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index ec51d9a9512d..ca6847403ca2 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1266,7 +1266,6 @@ static int dl_proc_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dl_file_ops_v2 = {
-	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v2,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1274,7 +1273,6 @@ static const struct file_operations dl_file_ops_v2 = {
 };
 
 static const struct file_operations dl_file_ops_v1 = {
-	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v1,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1282,7 +1280,6 @@ static const struct file_operations dl_file_ops_v1 = {
 };
 
 static const struct file_operations dl_file_ops = {
-	.owner   = THIS_MODULE,
 	.open    = dl_proc_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-- 
cgit v1.2.3


From 0e839dfaf100266b08c0db38f13a77746cd742c2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 19 Jan 2018 12:59:03 +0100
Subject: netfilter: nf_tables: set flowtable priority and hooknum field

Otherwise netlink dump sends uninitialized fields to userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1addc401ff7d..f87314c6dcc4 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4946,6 +4946,8 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		goto err1;
 	}
 
+	flowtable->hooknum	= hooknum;
+	flowtable->priority	= priority;
 	flowtable->ops		= ops;
 	flowtable->ops_len	= n;
 
-- 
cgit v1.2.3


From e55311665286ab2744295575948c2b08dc001bf3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 19 Jan 2018 13:35:40 +0100
Subject: netfilter: remove messages printed at boot/module load time

Several reasons for this:

* Several modules maintain internal version numbers that they print at
  boot/module load time but do not expose to userspace; this is a
  primitive revision-control mechanism left over from the earlier days
  of Netfilter.

* IPset shows the protocol version at boot/module load time; instead,
  display this via the module description, as Jozsef suggested.

* Remove the copyright notices printed at boot/module load time in two
  spots. The Netfilter codebase is a collective development effort; if
  we had to display copyrights for each contributor at boot/module load
  time for each extension we have, we would probably fill up the logs
  with lots of useless information - from a technical standpoint.

So let's be consistent and remove them all.

Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c      | 2 --
 net/ipv4/netfilter/arp_tables.c      | 1 -
 net/ipv4/netfilter/ip_tables.c       | 1 -
 net/ipv6/netfilter/ip6_tables.c      | 1 -
 net/netfilter/ipset/ip_set_core.c    | 3 ++-
 net/netfilter/nf_conntrack_core.c    | 6 ------
 net/netfilter/nf_conntrack_netlink.c | 5 -----
 net/netfilter/nf_tables_api.c        | 1 -
 net/netfilter/nfnetlink.c            | 4 ----
 net/netfilter/nfnetlink_acct.c       | 2 --
 net/netfilter/nfnetlink_cttimeout.c  | 2 --
 net/netfilter/nft_compat.c           | 2 --
 12 files changed, 2 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 37817d25b63d..02c4b409d317 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -2445,7 +2445,6 @@ static int __init ebtables_init(void)
 		return ret;
 	}
 
-	printk(KERN_INFO "Ebtables v2.0 registered\n");
 	return 0;
 }
 
@@ -2453,7 +2452,6 @@ static void __exit ebtables_fini(void)
 {
 	nf_unregister_sockopt(&ebt_sockopts);
 	xt_unregister_target(&ebt_standard_target);
-	printk(KERN_INFO "Ebtables v2.0 unregistered\n");
 }
 
 EXPORT_SYMBOL(ebt_register_table);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index bf8a5340f15e..5f7c0d643fb3 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1656,7 +1656,6 @@ static int __init arp_tables_init(void)
 	if (ret < 0)
 		goto err4;
 
-	pr_info("arp_tables: (C) 2002 David S. Miller\n");
 	return 0;
 
 err4:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0b975aa2d363..1f534aec22f0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1939,7 +1939,6 @@ static int __init ip_tables_init(void)
 	if (ret < 0)
 		goto err5;
 
-	pr_info("(C) 2000-2006 Netfilter Core Team\n");
 	return 0;
 
 err5:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6ebbef2dfb60..37fa76ee5130 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1952,7 +1952,6 @@ static int __init ip6_tables_init(void)
 	if (ret < 0)
 		goto err5;
 
-	pr_info("(C) 2000-2006 Netfilter Core Team\n");
 	return 0;
 
 err5:
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 728bf31bb386..975a85a48d39 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -2122,7 +2122,6 @@ ip_set_init(void)
 		return ret;
 	}
 
-	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
 	return 0;
 }
 
@@ -2138,3 +2137,5 @@ ip_set_fini(void)
 
 module_init(ip_set_init);
 module_exit(ip_set_fini);
+
+MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL));
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 6a64d528d076..3d72a0842c01 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -58,8 +58,6 @@
 
 #include "nf_internals.h"
 
-#define NF_CONNTRACK_VERSION	"0.5.0"
-
 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 				      enum nf_nat_manip_type manip,
 				      const struct nlattr *attr) __read_mostly;
@@ -2068,10 +2066,6 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_cachep)
 		goto err_cachep;
 
-	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
-	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
-	       nf_conntrack_max);
-
 	ret = nf_conntrack_expect_init();
 	if (ret < 0)
 		goto err_expect;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7c7921a53b13..dd177ebee9aa 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -57,8 +57,6 @@
 
 MODULE_LICENSE("GPL");
 
-static char __initdata version[] = "0.93";
-
 static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_l4proto *l4proto)
@@ -3425,7 +3423,6 @@ static int __init ctnetlink_init(void)
 {
 	int ret;
 
-	pr_info("ctnetlink v%s: registering with nfnetlink.\n", version);
 	ret = nfnetlink_subsys_register(&ctnl_subsys);
 	if (ret < 0) {
 		pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
@@ -3459,8 +3456,6 @@ err_out:
 
 static void __exit ctnetlink_exit(void)
 {
-	pr_info("ctnetlink: unregistering from nfnetlink.\n");
-
 	unregister_pernet_subsys(&ctnetlink_net_ops);
 	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
 	nfnetlink_subsys_unregister(&ctnl_subsys);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f87314c6dcc4..0791813a1e7d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6603,7 +6603,6 @@ static int __init nf_tables_module_init(void)
 
 	register_netdevice_notifier(&nf_tables_flowtable_notifier);
 
-	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
 	return register_pernet_subsys(&nf_tables_net_ops);
 err3:
 	nf_tables_core_module_exit();
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 733d3e4a30d8..03ead8a9e90c 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -37,8 +37,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 	rcu_dereference_protected(table[(id)].subsys, \
 				  lockdep_nfnl_is_held((id)))
 
-static char __initdata nfversion[] = "0.30";
-
 static struct {
 	struct mutex				mutex;
 	const struct nfnetlink_subsystem __rcu	*subsys;
@@ -580,13 +578,11 @@ static int __init nfnetlink_init(void)
 	for (i=0; i<NFNL_SUBSYS_COUNT; i++)
 		mutex_init(&table[i].mutex);
 
-	pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
 	return register_pernet_subsys(&nfnetlink_net_ops);
 }
 
 static void __exit nfnetlink_exit(void)
 {
-	pr_info("Removing netfilter NETLINK layer.\n");
 	unregister_pernet_subsys(&nfnetlink_net_ops);
 }
 module_init(nfnetlink_init);
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index c45e6d4358ab..88d427f9f9e6 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -527,7 +527,6 @@ static int __init nfnl_acct_init(void)
 		goto err_out;
 	}
 
-	pr_info("nfnl_acct: registering with nfnetlink.\n");
 	ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
 	if (ret < 0) {
 		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
@@ -543,7 +542,6 @@ err_out:
 
 static void __exit nfnl_acct_exit(void)
 {
-	pr_info("nfnl_acct: unregistering from nfnetlink.\n");
 	nfnetlink_subsys_unregister(&nfnl_acct_subsys);
 	unregister_pernet_subsys(&nfnl_acct_ops);
 }
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 32b1c0b44e79..95b04702a655 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -615,8 +615,6 @@ err_out:
 
 static void __exit cttimeout_exit(void)
 {
-	pr_info("cttimeout: unregistering from nfnetlink.\n");
-
 	nfnetlink_subsys_unregister(&cttimeout_subsys);
 
 	unregister_pernet_subsys(&cttimeout_ops);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7fa17e241c14..8e23726b9081 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -812,8 +812,6 @@ static int __init nft_compat_module_init(void)
 		goto err_target;
 	}
 
-	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
-
 	return ret;
 
 err_target:
-- 
cgit v1.2.3


From ce6289661b14a8b391d90db918c91b6d6da6540a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 16 Jan 2018 17:34:00 +0100
Subject: caif: reduce stack size with KASAN

When CONFIG_KASAN is set, we can use relatively large amounts of kernel
stack space:

net/caif/cfctrl.c:555:1: warning: the frame size of 1600 bytes is larger than 1280 bytes [-Wframe-larger-than=]

This adds convenience wrappers around cfpkt_extr_head(), which is responsible
for most of the stack growth. With those wrapper functions, gcc apparently
starts reusing the stack slots for each instance, thus avoiding the
problem.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/caif/cfpkt.h | 27 ++++++++++++++++++++++++++
 net/caif/cfctrl.c        | 50 +++++++++++++++++++++---------------------------
 2 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h
index fe328c52c46b..801489bb14c3 100644
--- a/include/net/caif/cfpkt.h
+++ b/include/net/caif/cfpkt.h
@@ -32,6 +32,33 @@ void cfpkt_destroy(struct cfpkt *pkt);
  */
 int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len);
 
+static inline u8 cfpkt_extr_head_u8(struct cfpkt *pkt)
+{
+	u8 tmp;
+
+	cfpkt_extr_head(pkt, &tmp, 1);
+
+	return tmp;
+}
+
+static inline u16 cfpkt_extr_head_u16(struct cfpkt *pkt)
+{
+	__le16 tmp;
+
+	cfpkt_extr_head(pkt, &tmp, 2);
+
+	return le16_to_cpu(tmp);
+}
+
+static inline u32 cfpkt_extr_head_u32(struct cfpkt *pkt)
+{
+	__le32 tmp;
+
+	cfpkt_extr_head(pkt, &tmp, 4);
+
+	return le32_to_cpu(tmp);
+}
+
 /*
  * Peek header from packet.
  * Reads data from packet without changing packet.
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 655ed7032150..a1e85f032108 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -352,15 +352,14 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 	u8 cmdrsp;
 	u8 cmd;
 	int ret = -1;
-	u16 tmp16;
 	u8 len;
 	u8 param[255];
-	u8 linkid;
+	u8 linkid = 0;
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfctrl_request_info rsp, *req;
 
 
-	cfpkt_extr_head(pkt, &cmdrsp, 1);
+	cmdrsp = cfpkt_extr_head_u8(pkt);
 	cmd = cmdrsp & CFCTRL_CMD_MASK;
 	if (cmd != CFCTRL_CMD_LINK_ERR
 	    && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp)
@@ -378,13 +377,12 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 			u8 physlinkid;
 			u8 prio;
 			u8 tmp;
-			u32 tmp32;
 			u8 *cp;
 			int i;
 			struct cfctrl_link_param linkparam;
 			memset(&linkparam, 0, sizeof(linkparam));
 
-			cfpkt_extr_head(pkt, &tmp, 1);
+			tmp = cfpkt_extr_head_u8(pkt);
 
 			serv = tmp & CFCTRL_SRV_MASK;
 			linkparam.linktype = serv;
@@ -392,13 +390,13 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 			servtype = tmp >> 4;
 			linkparam.chtype = servtype;
 
-			cfpkt_extr_head(pkt, &tmp, 1);
+			tmp = cfpkt_extr_head_u8(pkt);
 			physlinkid = tmp & 0x07;
 			prio = tmp >> 3;
 
 			linkparam.priority = prio;
 			linkparam.phyid = physlinkid;
-			cfpkt_extr_head(pkt, &endpoint, 1);
+			endpoint = cfpkt_extr_head_u8(pkt);
 			linkparam.endpoint = endpoint & 0x03;
 
 			switch (serv) {
@@ -407,45 +405,43 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 				if (CFCTRL_ERR_BIT & cmdrsp)
 					break;
 				/* Link ID */
-				cfpkt_extr_head(pkt, &linkid, 1);
+				linkid = cfpkt_extr_head_u8(pkt);
 				break;
 			case CFCTRL_SRV_VIDEO:
-				cfpkt_extr_head(pkt, &tmp, 1);
+				tmp = cfpkt_extr_head_u8(pkt);
 				linkparam.u.video.connid = tmp;
 				if (CFCTRL_ERR_BIT & cmdrsp)
 					break;
 				/* Link ID */
-				cfpkt_extr_head(pkt, &linkid, 1);
+				linkid = cfpkt_extr_head_u8(pkt);
 				break;
 
 			case CFCTRL_SRV_DATAGRAM:
-				cfpkt_extr_head(pkt, &tmp32, 4);
 				linkparam.u.datagram.connid =
-				    le32_to_cpu(tmp32);
+				    cfpkt_extr_head_u32(pkt);
 				if (CFCTRL_ERR_BIT & cmdrsp)
 					break;
 				/* Link ID */
-				cfpkt_extr_head(pkt, &linkid, 1);
+				linkid = cfpkt_extr_head_u8(pkt);
 				break;
 			case CFCTRL_SRV_RFM:
 				/* Construct a frame, convert
 				 * DatagramConnectionID
 				 * to network format long and copy it out...
 				 */
-				cfpkt_extr_head(pkt, &tmp32, 4);
 				linkparam.u.rfm.connid =
-				  le32_to_cpu(tmp32);
+				    cfpkt_extr_head_u32(pkt);
 				cp = (u8 *) linkparam.u.rfm.volume;
-				for (cfpkt_extr_head(pkt, &tmp, 1);
+				for (tmp = cfpkt_extr_head_u8(pkt);
 				     cfpkt_more(pkt) && tmp != '\0';
-				     cfpkt_extr_head(pkt, &tmp, 1))
+				     tmp = cfpkt_extr_head_u8(pkt))
 					*cp++ = tmp;
 				*cp = '\0';
 
 				if (CFCTRL_ERR_BIT & cmdrsp)
 					break;
 				/* Link ID */
-				cfpkt_extr_head(pkt, &linkid, 1);
+				linkid = cfpkt_extr_head_u8(pkt);
 
 				break;
 			case CFCTRL_SRV_UTIL:
@@ -454,13 +450,11 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 				 * to network format long and copy it out...
 				 */
 				/* Fifosize KB */
-				cfpkt_extr_head(pkt, &tmp16, 2);
 				linkparam.u.utility.fifosize_kb =
-				    le16_to_cpu(tmp16);
+				    cfpkt_extr_head_u16(pkt);
 				/* Fifosize bufs */
-				cfpkt_extr_head(pkt, &tmp16, 2);
 				linkparam.u.utility.fifosize_bufs =
-				    le16_to_cpu(tmp16);
+				    cfpkt_extr_head_u16(pkt);
 				/* name */
 				cp = (u8 *) linkparam.u.utility.name;
 				caif_assert(sizeof(linkparam.u.utility.name)
@@ -468,24 +462,24 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 				for (i = 0;
 				     i < UTILITY_NAME_LENGTH
 				     && cfpkt_more(pkt); i++) {
-					cfpkt_extr_head(pkt, &tmp, 1);
+					tmp = cfpkt_extr_head_u8(pkt);
 					*cp++ = tmp;
 				}
 				/* Length */
-				cfpkt_extr_head(pkt, &len, 1);
+				len = cfpkt_extr_head_u8(pkt);
 				linkparam.u.utility.paramlen = len;
 				/* Param Data */
 				cp = linkparam.u.utility.params;
 				while (cfpkt_more(pkt) && len--) {
-					cfpkt_extr_head(pkt, &tmp, 1);
+					tmp = cfpkt_extr_head_u8(pkt);
 					*cp++ = tmp;
 				}
 				if (CFCTRL_ERR_BIT & cmdrsp)
 					break;
 				/* Link ID */
-				cfpkt_extr_head(pkt, &linkid, 1);
+				linkid = cfpkt_extr_head_u8(pkt);
 				/* Length */
-				cfpkt_extr_head(pkt, &len, 1);
+				len = cfpkt_extr_head_u8(pkt);
 				/* Param Data */
 				cfpkt_extr_head(pkt, &param, len);
 				break;
@@ -522,7 +516,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 		}
 		break;
 	case CFCTRL_CMD_LINK_DESTROY:
-		cfpkt_extr_head(pkt, &linkid, 1);
+		linkid = cfpkt_extr_head_u8(pkt);
 		cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
 		break;
 	case CFCTRL_CMD_LINK_ERR:
-- 
cgit v1.2.3


From 5a75114adb718b262ac3e831d0f0b156c0455ab2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 Jan 2018 15:40:00 -0800
Subject: ipv6: mcast: remove dead code

Since commit 41033f029e39 ("snmp: Remove duplicate OUTMCAST stat
increment") one line of code became unneeded.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 40b223a930a3..6a5d0e39bb87 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1655,8 +1655,6 @@ static void mld_sendpack(struct sk_buff *skb)
 	if (err)
 		goto err_out;
 
-	payload_len = skb->len;
-
 	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 		      net, net->ipv6.igmp_sk, skb, NULL, skb->dev,
 		      dst_output);
-- 
cgit v1.2.3


From 43dd7512b51c0f3dd8170916bd3eeb2eba808ed1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 17 Jan 2018 03:27:42 +0000
Subject: devlink: Make some functions static

Fixes the following sparse warnings:

net/core/devlink.c:2297:25: warning:
 symbol 'devlink_resource_find' was not declared. Should it be static?
net/core/devlink.c:2322:6: warning:
 symbol 'devlink_resource_validate_children' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index dd7d6dd07bfb..66d36705fb9d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2294,7 +2294,7 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
-struct devlink_resource *
+static struct devlink_resource *
 devlink_resource_find(struct devlink *devlink,
 		      struct devlink_resource *resource, u64 resource_id)
 {
@@ -2319,7 +2319,8 @@ devlink_resource_find(struct devlink *devlink,
 	return NULL;
 }
 
-void devlink_resource_validate_children(struct devlink_resource *resource)
+static void
+devlink_resource_validate_children(struct devlink_resource *resource)
 {
 	struct devlink_resource *child_resource;
 	bool size_valid = true;
-- 
cgit v1.2.3


From dfffc97d0e196c33452a6bce5a78e33786247d23 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 16 Jan 2018 23:01:54 +0100
Subject: l2tp: double-check l2specific_type provided by userspace

Add sanity check on l2specific_type provided by userspace in
l2tp_nl_cmd_session_create() since just L2TP_L2SPECTYPE_DEFAULT and
L2TP_L2SPECTYPE_NONE are currently supported.
Moreover, explicitly set l2specific_type to L2TP_L2SPECTYPE_DEFAULT
only if userspace does not provide a value for it.

Reviewed-by: Guillaume Nault <g.nault@alphalink.fr>
Tested-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index e1ca29f79821..9ba2b8a68f65 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -550,9 +550,16 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
-		cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT;
-		if (info->attrs[L2TP_ATTR_L2SPEC_TYPE])
+		if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
 			cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
+			if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
+			    cfg.l2specific_type != L2TP_L2SPECTYPE_NONE) {
+				ret = -EINVAL;
+				goto out_tunnel;
+			}
+		} else {
+			cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT;
+		}
 
 		cfg.l2specific_len = 4;
 		if (info->attrs[L2TP_ATTR_L2SPEC_LEN])
-- 
cgit v1.2.3


From 62e7b6a57c7b9bf3c6fd99418eeec05b08a85c38 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 16 Jan 2018 23:01:55 +0100
Subject: l2tp: remove l2specific_len dependency in l2tp_core

Remove l2specific_len dependency while building l2tpv3 header or
parsing the received frame since default L2-Specific Sublayer is
always four bytes long and we don't need to rely on a user supplied
value.
Moreover, in the l2tp netlink code there are no sanity checks to
enforce the relation between l2specific_len and l2specific_type,
so by sending a malformed netlink message it is possible to set
l2specific_type to L2TP_L2SPECTYPE_DEFAULT (or even
L2TP_L2SPECTYPE_NONE) and set l2specific_len to a value greater than
4, leaking memory on the wire and sending corrupted frames.

Reviewed-by: Guillaume Nault <g.nault@alphalink.fr>
Tested-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 34 ++++++++++++++++------------------
 net/l2tp/l2tp_core.h | 11 +++++++++++
 2 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 62285fc6eb59..88efb8b845ca 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -730,11 +730,9 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 				 "%s: recv data ns=%u, session nr=%u\n",
 				 session->name, ns, session->nr);
 		}
+		ptr += 4;
 	}
 
-	/* Advance past L2-specific header, if present */
-	ptr += session->l2specific_len;
-
 	if (L2TP_SKB_CB(skb)->has_seq) {
 		/* Received a packet with sequence numbers. If we're the LNS,
 		 * check if we sre sending sequence numbers and if not,
@@ -1048,21 +1046,20 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
 		memcpy(bufp, &session->cookie[0], session->cookie_len);
 		bufp += session->cookie_len;
 	}
-	if (session->l2specific_len) {
-		if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
-			u32 l2h = 0;
-			if (session->send_seq) {
-				l2h = 0x40000000 | session->ns;
-				session->ns++;
-				session->ns &= 0xffffff;
-				l2tp_dbg(session, L2TP_MSG_SEQ,
-					 "%s: updated ns to %u\n",
-					 session->name, session->ns);
-			}
+	if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
+		u32 l2h = 0;
 
-			*((__be32 *) bufp) = htonl(l2h);
+		if (session->send_seq) {
+			l2h = 0x40000000 | session->ns;
+			session->ns++;
+			session->ns &= 0xffffff;
+			l2tp_dbg(session, L2TP_MSG_SEQ,
+				 "%s: updated ns to %u\n",
+				 session->name, session->ns);
 		}
-		bufp += session->l2specific_len;
+
+		*((__be32 *)bufp) = htonl(l2h);
+		bufp += 4;
 	}
 
 	return bufp - optr;
@@ -1719,7 +1716,7 @@ int l2tp_session_delete(struct l2tp_session *session)
 EXPORT_SYMBOL_GPL(l2tp_session_delete);
 
 /* We come here whenever a session's send_seq, cookie_len or
- * l2specific_len parameters are set.
+ * l2specific_type parameters are set.
  */
 void l2tp_session_set_header_len(struct l2tp_session *session, int version)
 {
@@ -1728,7 +1725,8 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version)
 		if (session->send_seq)
 			session->hdr_len += 4;
 	} else {
-		session->hdr_len = 4 + session->cookie_len + session->l2specific_len;
+		session->hdr_len = 4 + session->cookie_len;
+		session->hdr_len += l2tp_get_l2specific_len(session);
 		if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP)
 			session->hdr_len += 4;
 	}
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c2e9bbd79b35..7bef304de4f0 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -302,6 +302,17 @@ static inline void l2tp_session_dec_refcount(struct l2tp_session *session)
 		l2tp_session_free(session);
 }
 
+static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
+{
+	switch (session->l2specific_type) {
+	case L2TP_L2SPECTYPE_DEFAULT:
+		return 4;
+	case L2TP_L2SPECTYPE_NONE:
+	default:
+		return 0;
+	}
+}
+
 #define l2tp_printk(ptr, type, func, fmt, ...)				\
 do {									\
 	if (((ptr)->debug) & (type))					\
-- 
cgit v1.2.3


From 9afa65857d21c13a9209b82c309c3a1503e9646c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 16 Jan 2018 23:01:56 +0100
Subject: l2tp: remove l2specific_len configurable parameter

Remove l2specific_len configuration parameter since now L2-Specific
Sublayer length is computed according to l2specific_type provided by
userspace.

Reviewed-by: Guillaume Nault <g.nault@alphalink.fr>
Tested-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c    | 1 -
 net/l2tp/l2tp_core.h    | 2 --
 net/l2tp/l2tp_debugfs.c | 2 +-
 net/l2tp/l2tp_netlink.c | 4 ----
 4 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 88efb8b845ca..194a7483bb93 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1777,7 +1777,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
 			session->l2specific_type = cfg->l2specific_type;
-			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
 			memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len);
 			session->peer_cookie_len = cfg->peer_cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 7bef304de4f0..9bbee90e9963 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,7 +59,6 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
 	int			cookie_len;	/* 0, 4 or 8 bytes */
@@ -85,7 +84,6 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
 	u32			nr;		/* session NR state (receive) */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 2c30587d1a14..72e713da4733 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -181,7 +181,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
 	seq_printf(m, "   offset 0 l2specific %hu/%hu\n",
-		   session->l2specific_type, session->l2specific_len);
+		   session->l2specific_type, l2tp_get_l2specific_len(session));
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 9ba2b8a68f65..405a5341ed1e 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -561,10 +561,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 			cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT;
 		}
 
-		cfg.l2specific_len = 4;
-		if (info->attrs[L2TP_ATTR_L2SPEC_LEN])
-			cfg.l2specific_len = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_LEN]);
-
 		if (info->attrs[L2TP_ATTR_COOKIE]) {
 			u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]);
 			if (len > 8) {
-- 
cgit v1.2.3


From 30c3e9d470358a6741e00e1034a1ea85c6a516f0 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Wed, 17 Jan 2018 11:41:20 +0100
Subject: l2tp: remove switch block in l2tp_nl_cmd_session_create()

Remove the switch block in l2tp_nl_cmd_session_create() that
checks pseudowire-specific parameters since just L2TP_PWTYPE_ETH and
L2TP_PWTYPE_PPP are currently supported and no actual checks are
performed. Moreover the L2TP_PWTYPE_IP/default case presents a harmless
issue in error handling (break instead of goto out_tunnel)

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 405a5341ed1e..e7ea9c4b89ff 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -620,27 +620,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 		goto out_tunnel;
 	}
 
-	/* Check that pseudowire-specific params are present */
-	switch (cfg.pw_type) {
-	case L2TP_PWTYPE_NONE:
-		break;
-	case L2TP_PWTYPE_ETH_VLAN:
-		if (!info->attrs[L2TP_ATTR_VLAN_ID]) {
-			ret = -EINVAL;
-			goto out_tunnel;
-		}
-		break;
-	case L2TP_PWTYPE_ETH:
-		break;
-	case L2TP_PWTYPE_PPP:
-	case L2TP_PWTYPE_PPP_AC:
-		break;
-	case L2TP_PWTYPE_IP:
-	default:
-		ret = -EPROTONOSUPPORT;
-		break;
-	}
-
 	ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel,
 							   session_id,
 							   peer_session_id,
-- 
cgit v1.2.3


From 60c2530696320ee6ffe4491c17079fa403790c98 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 17 Jan 2018 16:42:46 +0100
Subject: tipc: fix race between poll() and setsockopt()

Letting tipc_poll() dereference a socket's pointer to struct tipc_group
entails a race risk, as the group item may be deleted in a concurrent
tipc_sk_join() or tipc_sk_leave() thread.

We now move the 'open' flag in struct tipc_group to struct tipc_sock,
and let the former retain only a pointer to the moved field. This will
eliminate the race risk.

Reported-by: syzbot+799dafde0286795858ac@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c  | 19 ++++++++-----------
 net/tipc/group.h  |  4 ++--
 net/tipc/socket.c |  7 +++----
 3 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 497ee34bfab9..122162a31816 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -93,26 +93,21 @@ struct tipc_group {
 	u16 max_active;
 	u16 bc_snd_nxt;
 	u16 bc_ackers;
+	bool *open;
 	bool loopback;
 	bool events;
-	bool open;
 };
 
 static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 				  int mtyp, struct sk_buff_head *xmitq);
 
-bool tipc_group_is_open(struct tipc_group *grp)
-{
-	return grp->open;
-}
-
 static void tipc_group_open(struct tipc_member *m, bool *wakeup)
 {
 	*wakeup = false;
 	if (list_empty(&m->small_win))
 		return;
 	list_del_init(&m->small_win);
-	m->group->open = true;
+	*m->group->open = true;
 	*wakeup = true;
 }
 
@@ -170,7 +165,8 @@ int tipc_group_size(struct tipc_group *grp)
 }
 
 struct tipc_group *tipc_group_create(struct net *net, u32 portid,
-				     struct tipc_group_req *mreq)
+				     struct tipc_group_req *mreq,
+				     bool *group_is_open)
 {
 	u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS;
 	bool global = mreq->scope != TIPC_NODE_SCOPE;
@@ -192,6 +188,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	grp->scope = mreq->scope;
 	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
 	grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS;
+	grp->open = group_is_open;
 	filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE;
 	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0,
 				    filter, &grp->subid))
@@ -430,7 +427,7 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport,
 	if (m->window >= len)
 		return false;
 
-	grp->open = false;
+	*grp->open = false;
 
 	/* If not fully advertised, do it now to prevent mutual blocking */
 	adv = m->advertised;
@@ -453,7 +450,7 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len)
 
 	/* If prev bcast was replicast, reject until all receivers have acked */
 	if (grp->bc_ackers) {
-		grp->open = false;
+		*grp->open = false;
 		return true;
 	}
 	if (list_empty(&grp->small_win))
@@ -800,7 +797,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		if (--grp->bc_ackers)
 			return;
 		list_del_init(&m->small_win);
-		m->group->open = true;
+		*m->group->open = true;
 		*usr_wakeup = true;
 		tipc_group_update_member(m, 0);
 		return;
diff --git a/net/tipc/group.h b/net/tipc/group.h
index f4a596ed9848..5996af6e9f1d 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -43,7 +43,8 @@ struct tipc_member;
 struct tipc_msg;
 
 struct tipc_group *tipc_group_create(struct net *net, u32 portid,
-				     struct tipc_group_req *mreq);
+				     struct tipc_group_req *mreq,
+				     bool *group_is_open);
 void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcv_buf);
 void tipc_group_delete(struct net *net, struct tipc_group *grp);
 void tipc_group_add_member(struct tipc_group *grp, u32 node,
@@ -67,7 +68,6 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack);
 bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport,
 		     int len, struct tipc_member **m);
 bool tipc_group_bc_cong(struct tipc_group *grp, int len);
-bool tipc_group_is_open(struct tipc_group *grp);
 void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 			       u32 port, struct sk_buff_head *xmitq);
 u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index d799e50ff722..473a096b6fba 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -116,6 +116,7 @@ struct tipc_sock {
 	struct tipc_mc_method mc_method;
 	struct rcu_head rcu;
 	struct tipc_group *group;
+	bool group_is_open;
 };
 
 static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
@@ -715,7 +716,6 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	struct tipc_group *grp;
 	u32 revents = 0;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
@@ -736,8 +736,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			revents |= POLLIN | POLLRDNORM;
 		break;
 	case TIPC_OPEN:
-		grp = tsk->group;
-		if ((!grp || tipc_group_is_open(grp)) && !tsk->cong_link_cnt)
+		if (tsk->group_is_open && !tsk->cong_link_cnt)
 			revents |= POLLOUT;
 		if (!tipc_sk_type_connectionless(sk))
 			break;
@@ -2758,7 +2757,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 		return -EINVAL;
 	if (grp)
 		return -EACCES;
-	grp = tipc_group_create(net, tsk->portid, mreq);
+	grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open);
 	if (!grp)
 		return -ENOMEM;
 	tsk->group = grp;
-- 
cgit v1.2.3


From eb36be0fd55e0a6f2cb3226acd711b2c7a2d7d09 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 17 Jan 2018 12:11:00 -0800
Subject: tcp: avoid min-RTT overestimation from delayed ACKs

This patch avoids having TCP sender or congestion control
overestimate the min RTT by orders of magnitude. This happens when
all the samples in the windowed filter are one-packet transfers
like small requests and health-check like chit-chat, which is fairly
common for applications using persistent connections. This patch
tries to conservatively label and skip RTT samples obtained from
this type of workload.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ff71b18d9682..2c6797134553 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
+#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	*rexmit = REXMIT_LOST;
 }
 
-static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
 {
 	u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
+		/* If the remote keeps returning delayed ACKs, eventually
+		 * the min filter would pick it up and overestimate the
+		 * prop. delay when it expires. Skip suspected delayed ACKs.
+		 */
+		return;
+	}
 	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
 			   rtt_us ? : jiffies_to_usecs(1));
 }
@@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * always taken together with ACK, SACK, or TS-opts. Any negative
 	 * values will be skipped with the seq_rtt_us < 0 check above.
 	 */
-	tcp_update_rtt_min(sk, ca_rtt_us);
+	tcp_update_rtt_min(sk, ca_rtt_us, flag);
 	tcp_rtt_estimator(sk, seq_rtt_us);
 	tcp_set_rto(sk);
 
@@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
 		seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
 		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
+
+		if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
+		    last_in_flight && !prior_sacked && fully_acked &&
+		    sack->rate->prior_delivered + 1 == tp->delivered &&
+		    !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
+			/* Conservatively mark a delayed ACK. It's typically
+			 * from a lone runt packet over the round trip to
+			 * a receiver w/o out-of-order or CE events.
+			 */
+			flag |= FLAG_ACK_MAYBE_DELAYED;
+		}
 	}
 	if (sack->first_sackt) {
 		sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
-- 
cgit v1.2.3


From e42866031ff03c89a5bdd2056c76dd6cb41c3d35 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 17 Jan 2018 12:11:01 -0800
Subject: tcp: avoid min RTT bloat by skipping RTT from delayed-ACK in BBR

A persistent connection may send a tiny amount of data (e.g. health
checks) for a long period of time. BBR's windowed min RTT filter may
then only see RTT samples from delayed ACKs, causing BBR to grossly
over-estimate the path delay, depending on how much the ACK was delayed
at the receiver.

This patch skips RTT samples that are likely coming from delayed ACKs. Note
that it is possible the sender never obtains a valid measurement to set the
min RTT. In this case BBR will continue to set cwnd to the initial window,
which seems fine because the connection is a thin stream.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 1 +
 net/ipv4/tcp_bbr.c   | 3 ++-
 net/ipv4/tcp_input.c | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6939e69d3c37..5a1d26a18599 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -953,6 +953,7 @@ struct rate_sample {
 	u32  prior_in_flight;	/* in flight before this ACK */
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
+	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
 };
 
 struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 8322f26e770e..785712be5b0d 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -766,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 	filter_expired = after(tcp_jiffies32,
 			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
 	if (rs->rtt_us >= 0 &&
-	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+	    (rs->rtt_us <= bbr->min_rtt_us ||
+	     (filter_expired && !rs->is_ack_delayed))) {
 		bbr->min_rtt_us = rs->rtt_us;
 		bbr->min_rtt_stamp = tcp_jiffies32;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2c6797134553..cfa51cfd2d99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3633,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
+	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
-- 
cgit v1.2.3


From 8865fdd4e1538a775c5ac2157fb8eb45bee9dc18 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:49 -0500
Subject: net: sched: cls: fix code style issues

This patch fixes some code style issues pointed out by checkpatch
inside the TC cls subsystem.

Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/cls_matchall.c  | 2 +-
 net/sched/cls_u32.c       | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cfc19d0ba2ad..c90f5fe6bed9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -240,7 +240,8 @@ struct tcf_proto_ops {
 					struct tcf_proto*, unsigned long,
 					u32 handle, struct nlattr **,
 					void **, bool);
-	int			(*delete)(struct tcf_proto*, void *, bool*);
+	int			(*delete)(struct tcf_proto *tp, void *arg,
+					  bool *last);
 	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
 	void			(*bind_class)(void *, u32, unsigned long);
 
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index d0e57c86636f..aeae89eeed0d 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -203,7 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 		goto err_set_parms;
 
 	if (!tc_skip_hw(new->flags)) {
-		err = mall_replace_hw_filter(tp, new, (unsigned long) new);
+		err = mall_replace_hw_filter(tp, new, (unsigned long)new);
 		if (err)
 			goto err_replace_hw_filter;
 	}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 020d328d0afd..84129b3c14e5 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -783,7 +783,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 		if (handle) {
 			ht_down = u32_lookup_ht(ht->tp_c, handle);
 
-			if (ht_down == NULL)
+			if (!ht_down)
 				return -EINVAL;
 			ht_down->refcnt++;
 		}
@@ -907,7 +907,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	size_t size;
 #endif
 
-	if (opt == NULL)
+	if (!opt)
 		return handle ? -EINVAL : 0;
 
 	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL);
@@ -1011,7 +1011,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			htid = ht->handle;
 		} else {
 			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
-			if (ht == NULL)
+			if (!ht)
 				return -EINVAL;
 		}
 	} else {
@@ -1023,7 +1023,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		return -EINVAL;
 
 	if (handle) {
-		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
+		if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid))
 			return -EINVAL;
 		handle = htid | TC_U32_NODE(handle);
 		err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
-- 
cgit v1.2.3


From c35a4acc298528ac7ac8d21284af7cad981aa79d Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:50 -0500
Subject: net: sched: cls_api: handle generic cls errors

This patch adds extack support for generic cls handling. The extack
is passed down into each called function that is not part of the netdev
core API.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 52 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 41 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 86d6e9d2cf00..3e3841adbd5c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -122,7 +122,8 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 }
 
 static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
-					  u32 prio, struct tcf_chain *chain)
+					  u32 prio, struct tcf_chain *chain,
+					  struct netlink_ext_ack *extack)
 {
 	struct tcf_proto *tp;
 	int err;
@@ -148,6 +149,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
 			module_put(tp->ops->owner);
 			err = -EAGAIN;
 		} else {
+			NL_SET_ERR_MSG(extack, "TC classifier not found");
 			err = -ENOENT;
 		}
 		goto errout;
@@ -935,7 +937,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 			      struct nlmsghdr *n, struct tcf_proto *tp,
 			      struct tcf_block *block, struct Qdisc *q,
-			      u32 parent, void *fh, bool unicast, bool *last)
+			      u32 parent, void *fh, bool unicast, bool *last,
+			      struct netlink_ext_ack *extack)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -947,6 +950,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 
 	if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
 			  n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+		NL_SET_ERR_MSG(extack, "Failed to build del event notification");
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -960,8 +964,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 	if (unicast)
 		return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 
-	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
-			      n->nlmsg_flags & NLM_F_ECHO);
+	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+			     n->nlmsg_flags & NLM_F_ECHO);
+	if (err < 0)
+		NL_SET_ERR_MSG(extack, "Failed to send filter delete notification");
+	return err;
 }
 
 static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
@@ -1021,8 +1028,10 @@ replay:
 	if (prio == 0) {
 		switch (n->nlmsg_type) {
 		case RTM_DELTFILTER:
-			if (protocol || t->tcm_handle || tca[TCA_KIND])
+			if (protocol || t->tcm_handle || tca[TCA_KIND]) {
+				NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
 				return -ENOENT;
+			}
 			break;
 		case RTM_NEWTFILTER:
 			/* If no priority is provided by the user,
@@ -1035,6 +1044,7 @@ replay:
 			}
 			/* fall-through */
 		default:
+			NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
 			return -ENOENT;
 		}
 	}
@@ -1063,23 +1073,31 @@ replay:
 			parent = q->handle;
 		} else {
 			q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
-			if (!q)
+			if (!q) {
+				NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
 				return -EINVAL;
+			}
 		}
 
 		/* Is it classful? */
 		cops = q->ops->cl_ops;
-		if (!cops)
+		if (!cops) {
+			NL_SET_ERR_MSG(extack, "Qdisc not classful");
 			return -EINVAL;
+		}
 
-		if (!cops->tcf_block)
+		if (!cops->tcf_block) {
+			NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
 			return -EOPNOTSUPP;
+		}
 
 		/* Do we search for filter, attached to class? */
 		if (TC_H_MIN(parent)) {
 			cl = cops->find(q, parent);
-			if (cl == 0)
+			if (cl == 0) {
+				NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
 				return -ENOENT;
+			}
 		}
 
 		/* And the last stroke */
@@ -1097,12 +1115,14 @@ replay:
 
 	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
 	if (chain_index > TC_ACT_EXT_VAL_MASK) {
+		NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
 		err = -EINVAL;
 		goto errout;
 	}
 	chain = tcf_chain_get(block, chain_index,
 			      n->nlmsg_type == RTM_NEWTFILTER);
 	if (!chain) {
+		NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
 		err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL;
 		goto errout;
 	}
@@ -1118,6 +1138,7 @@ replay:
 	tp = tcf_chain_tp_find(chain, &chain_info, protocol,
 			       prio, prio_allocate);
 	if (IS_ERR(tp)) {
+		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
 		err = PTR_ERR(tp);
 		goto errout;
 	}
@@ -1126,12 +1147,14 @@ replay:
 		/* Proto-tcf does not exist, create new one */
 
 		if (tca[TCA_KIND] == NULL || !protocol) {
+			NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified");
 			err = -EINVAL;
 			goto errout;
 		}
 
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
 		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
 			err = -ENOENT;
 			goto errout;
 		}
@@ -1140,13 +1163,14 @@ replay:
 			prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
 
 		tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
-				      protocol, prio, chain);
+				      protocol, prio, chain, extack);
 		if (IS_ERR(tp)) {
 			err = PTR_ERR(tp);
 			goto errout;
 		}
 		tp_created = 1;
 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+		NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
 		err = -EINVAL;
 		goto errout;
 	}
@@ -1165,6 +1189,7 @@ replay:
 
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
 		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
 			err = -ENOENT;
 			goto errout;
 		}
@@ -1176,13 +1201,15 @@ replay:
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
 					tcf_proto_destroy(tp);
+				NL_SET_ERR_MSG(extack, "Filter already exists");
 				err = -EEXIST;
 				goto errout;
 			}
 			break;
 		case RTM_DELTFILTER:
 			err = tfilter_del_notify(net, skb, n, tp, block,
-						 q, parent, fh, false, &last);
+						 q, parent, fh, false, &last,
+						 extack);
 			if (err)
 				goto errout;
 			if (last) {
@@ -1193,8 +1220,11 @@ replay:
 		case RTM_GETTFILTER:
 			err = tfilter_notify(net, skb, n, tp, block, q, parent,
 					     fh, RTM_NEWTFILTER, true);
+			if (err < 0)
+				NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
 			goto errout;
 		default:
+			NL_SET_ERR_MSG(extack, "Invalid netlink message type");
 			err = -EINVAL;
 			goto errout;
 		}
-- 
cgit v1.2.3


From 7306db38a67cf6b8e1ca354b1d0c0117b7b880d5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:51 -0500
Subject: net: sched: cls: add extack support for change callback

This patch adds extack support for classifier change callback api. This
prepares to handle extack support inside each specific classifier
implementation.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/cls_api.c       | 3 ++-
 net/sched/cls_basic.c     | 3 ++-
 net/sched/cls_bpf.c       | 2 +-
 net/sched/cls_cgroup.c    | 3 ++-
 net/sched/cls_flow.c      | 2 +-
 net/sched/cls_flower.c    | 2 +-
 net/sched/cls_fw.c        | 2 +-
 net/sched/cls_matchall.c  | 2 +-
 net/sched/cls_route.c     | 3 ++-
 net/sched/cls_rsvp.h      | 2 +-
 net/sched/cls_tcindex.c   | 3 ++-
 net/sched/cls_u32.c       | 3 ++-
 13 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c90f5fe6bed9..ee398bcd46e7 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -239,7 +239,8 @@ struct tcf_proto_ops {
 	int			(*change)(struct net *net, struct sk_buff *,
 					struct tcf_proto*, unsigned long,
 					u32 handle, struct nlattr **,
-					void **, bool);
+					void **, bool,
+					struct netlink_ext_ack *);
 	int			(*delete)(struct tcf_proto *tp, void *arg,
 					  bool *last);
 	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3e3841adbd5c..06797c2e8102 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1231,7 +1231,8 @@ replay:
 	}
 
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
-			      n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
+			      n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
+			      extack);
 	if (err == 0) {
 		if (tp_created)
 			tcf_chain_tp_insert(chain, &chain_info, tp);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 5f169ded347e..2cc38cd71938 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -175,7 +175,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
 
 static int basic_change(struct net *net, struct sk_buff *in_skb,
 			struct tcf_proto *tp, unsigned long base, u32 handle,
-			struct nlattr **tca, void **arg, bool ovr)
+			struct nlattr **tca, void **arg, bool ovr,
+			struct netlink_ext_ack *extack)
 {
 	int err;
 	struct basic_head *head = rtnl_dereference(tp->root);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index cf72aefcf98d..e51eb503a23a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -452,7 +452,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 			  struct tcf_proto *tp, unsigned long base,
 			  u32 handle, struct nlattr **tca,
-			  void **arg, bool ovr)
+			  void **arg, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 	struct cls_bpf_prog *oldprog = *arg;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 309d5899265f..b74af0b55820 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -91,7 +91,8 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root)
 static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 			     struct tcf_proto *tp, unsigned long base,
 			     u32 handle, struct nlattr **tca,
-			     void **arg, bool ovr)
+			     void **arg, bool ovr,
+			     struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_CGROUP_MAX + 1];
 	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 28cd6fb52c16..faa0b6793a17 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -401,7 +401,7 @@ static void flow_destroy_filter(struct rcu_head *head)
 static int flow_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
-		       void **arg, bool ovr)
+		       void **arg, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *fold, *fnew;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f61df19b1026..7d92bbeeba54 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -853,7 +853,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 		     struct tcf_proto *tp, unsigned long base,
 		     u32 handle, struct nlattr **tca,
-		     void **arg, bool ovr)
+		     void **arg, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *fold = *arg;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 20f0de1a960a..72784491ce20 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -257,7 +257,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 static int fw_change(struct net *net, struct sk_buff *in_skb,
 		     struct tcf_proto *tp, unsigned long base,
 		     u32 handle, struct nlattr **tca, void **arg,
-		     bool ovr)
+		     bool ovr, struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f = *arg;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index aeae89eeed0d..689bd199aa14 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -160,7 +160,7 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 static int mall_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
-		       void **arg, bool ovr)
+		       void **arg, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct nlattr *tb[TCA_MATCHALL_MAX + 1];
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index a1f2b1b7c014..f436d4d894a9 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -471,7 +471,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 
 static int route4_change(struct net *net, struct sk_buff *in_skb,
 			 struct tcf_proto *tp, unsigned long base, u32 handle,
-			 struct nlattr **tca, void **arg, bool ovr)
+			 struct nlattr **tca, void **arg, bool ovr,
+			 struct netlink_ext_ack *extack)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	struct route4_filter __rcu **fp;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index cf325625c99d..d1f67529c01d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -486,7 +486,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle,
 		       struct nlattr **tca,
-		       void **arg, bool ovr)
+		       void **arg, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct rsvp_head *data = rtnl_dereference(tp->root);
 	struct rsvp_filter *f, *nfp;
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 67467ae24c97..0ec84cf2d6b7 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -520,7 +520,8 @@ errout:
 static int
 tcindex_change(struct net *net, struct sk_buff *in_skb,
 	       struct tcf_proto *tp, unsigned long base, u32 handle,
-	       struct nlattr **tca, void **arg, bool ovr)
+	       struct nlattr **tca, void **arg, bool ovr,
+	       struct netlink_ext_ack *extack)
 {
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 84129b3c14e5..d9cadebc7eaa 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -893,7 +893,8 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 
 static int u32_change(struct net *net, struct sk_buff *in_skb,
 		      struct tcf_proto *tp, unsigned long base, u32 handle,
-		      struct nlattr **tca, void **arg, bool ovr)
+		      struct nlattr **tca, void **arg, bool ovr,
+		      struct netlink_ext_ack *extack)
 {
 	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_hnode *ht;
-- 
cgit v1.2.3


From 50a561900e66a03f5127edac57487079bc0b8201 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:52 -0500
Subject: net: sched: cls: add extack support for tcf_exts_validate

The tcf_exts_validate function calls the act API change callback. To
prepare extack support for the act API, this patch adds the extack as a
parameter to this function, which is commonly used in cls implementations.

Furthermore the tcf_exts_validate will call action init callback which
prepares the TC action subsystem for extack support.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    |  3 ++-
 net/sched/cls_api.c      |  7 +++++--
 net/sched/cls_basic.c    |  8 +++++---
 net/sched/cls_bpf.c      |  8 +++++---
 net/sched/cls_cgroup.c   |  3 ++-
 net/sched/cls_flow.c     |  3 ++-
 net/sched/cls_flower.c   |  8 +++++---
 net/sched/cls_fw.c       | 10 ++++++----
 net/sched/cls_matchall.c |  8 +++++---
 net/sched/cls_route.c    |  6 +++---
 net/sched/cls_rsvp.h     |  2 +-
 net/sched/cls_tcindex.c  |  6 +++---
 net/sched/cls_u32.c      | 10 ++++++----
 13 files changed, 50 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 789d818c4a61..6dd009e10e5d 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -376,7 +376,8 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
 		      struct nlattr **tb, struct nlattr *rate_tlv,
-		      struct tcf_exts *exts, bool ovr);
+		      struct tcf_exts *exts, bool ovr,
+		      struct netlink_ext_ack *extack);
 void tcf_exts_destroy(struct tcf_exts *exts);
 void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src);
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 06797c2e8102..f365970dc68c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1423,7 +1423,8 @@ void tcf_exts_destroy(struct tcf_exts *exts)
 EXPORT_SYMBOL(tcf_exts_destroy);
 
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
-		      struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
+		      struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr,
+		      struct netlink_ext_ack *extack)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	{
@@ -1456,8 +1457,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 	}
 #else
 	if ((exts->action && tb[exts->action]) ||
-	    (exts->police && tb[exts->police]))
+	    (exts->police && tb[exts->police])) {
+		NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)");
 		return -EOPNOTSUPP;
+	}
 #endif
 
 	return 0;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 2cc38cd71938..b7bcf67641bf 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -152,11 +152,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
 static int basic_set_parms(struct net *net, struct tcf_proto *tp,
 			   struct basic_filter *f, unsigned long base,
 			   struct nlattr **tb,
-			   struct nlattr *est, bool ovr)
+			   struct nlattr *est, bool ovr,
+			   struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
 	if (err < 0)
 		return err;
 
@@ -222,7 +223,8 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 		fnew->handle = idr_index;
 	}
 
-	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
+	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
+			      extack);
 	if (err < 0) {
 		if (!fold)
 			idr_remove_ext(&head->handle_idr, fnew->handle);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index e51eb503a23a..c86072779b0a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -403,7 +403,8 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 
 static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 			     struct cls_bpf_prog *prog, unsigned long base,
-			     struct nlattr **tb, struct nlattr *est, bool ovr)
+			     struct nlattr **tb, struct nlattr *est, bool ovr,
+			     struct netlink_ext_ack *extack)
 {
 	bool is_bpf, is_ebpf, have_exts = false;
 	u32 gen_flags = 0;
@@ -414,7 +415,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
 		return -EINVAL;
 
-	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr);
+	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack);
 	if (ret < 0)
 		return ret;
 
@@ -500,7 +501,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		prog->handle = handle;
 	}
 
-	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr,
+				extack);
 	if (ret < 0)
 		goto errout_idr;
 
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index b74af0b55820..aaafcf6965f7 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -122,7 +122,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto errout;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr,
+				extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index faa0b6793a17..07816133cbb9 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -454,7 +454,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto err2;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr,
+				extack);
 	if (err < 0)
 		goto err2;
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 7d92bbeeba54..8235ed9143c4 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -827,11 +827,12 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
 static int fl_set_parms(struct net *net, struct tcf_proto *tp,
 			struct cls_fl_filter *f, struct fl_flow_mask *mask,
 			unsigned long base, struct nlattr **tb,
-			struct nlattr *est, bool ovr)
+			struct nlattr *est, bool ovr,
+			struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
 	if (err < 0)
 		return err;
 
@@ -916,7 +917,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
-	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
+	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr,
+			   extack);
 	if (err)
 		goto errout_idr;
 
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 72784491ce20..72a924a38753 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -218,13 +218,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
 
 static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 			struct fw_filter *f, struct nlattr **tb,
-			struct nlattr **tca, unsigned long base, bool ovr)
+			struct nlattr **tca, unsigned long base, bool ovr,
+			struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	u32 mask;
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr,
+				extack);
 	if (err < 0)
 		return err;
 
@@ -296,7 +298,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
-		err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr);
+		err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack);
 		if (err < 0) {
 			tcf_exts_destroy(&fnew->exts);
 			kfree(fnew);
@@ -345,7 +347,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 	f->id = handle;
 	f->tp = tp;
 
-	err = fw_set_parms(net, tp, f, tb, tca, base, ovr);
+	err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 689bd199aa14..9a0901ee6b74 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -142,11 +142,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
 static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 			  struct cls_mall_head *head,
 			  unsigned long base, struct nlattr **tb,
-			  struct nlattr *est, bool ovr)
+			  struct nlattr *est, bool ovr,
+			  struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack);
 	if (err < 0)
 		return err;
 
@@ -198,7 +199,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	new->handle = handle;
 	new->flags = flags;
 
-	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
+	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
+			     extack);
 	if (err)
 		goto err_set_parms;
 
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index f436d4d894a9..92d683851cb0 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -389,7 +389,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 			    unsigned long base, struct route4_filter *f,
 			    u32 handle, struct route4_head *head,
 			    struct nlattr **tb, struct nlattr *est, int new,
-			    bool ovr)
+			    bool ovr, struct netlink_ext_ack *extack)
 {
 	u32 id = 0, to = 0, nhandle = 0x8000;
 	struct route4_filter *fp;
@@ -397,7 +397,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 	struct route4_bucket *b;
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
 	if (err < 0)
 		return err;
 
@@ -516,7 +516,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	}
 
 	err = route4_set_parms(net, tp, base, f, handle, head, tb,
-			       tca[TCA_RATE], new, ovr);
+			       tca[TCA_RATE], new, ovr, extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index d1f67529c01d..c27d23694002 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -511,7 +511,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 	err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
 	if (err < 0)
 		return err;
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack);
 	if (err < 0)
 		goto errout2;
 
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 0ec84cf2d6b7..9d6621caa92f 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -322,7 +322,7 @@ static int
 tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 		  u32 handle, struct tcindex_data *p,
 		  struct tcindex_filter_result *r, struct nlattr **tb,
-		  struct nlattr *est, bool ovr)
+		  struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
 {
 	struct tcindex_filter_result new_filter_result, *old_r = r;
 	struct tcindex_filter_result cr;
@@ -334,7 +334,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 	err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
 	if (err < 0)
 		return err;
-	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack);
 	if (err < 0)
 		goto errout;
 
@@ -541,7 +541,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb,
 		return err;
 
 	return tcindex_set_parms(net, tp, base, handle, p, r, tb,
-				 tca[TCA_RATE], ovr);
+				 tca[TCA_RATE], ovr, extack);
 }
 
 static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d9cadebc7eaa..933058bbd0a9 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -765,11 +765,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 			 unsigned long base, struct tc_u_hnode *ht,
 			 struct tc_u_knode *n, struct nlattr **tb,
-			 struct nlattr *est, bool ovr)
+			 struct nlattr *est, bool ovr,
+			 struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr);
+	err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack);
 	if (err < 0)
 		return err;
 
@@ -937,7 +938,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		err = u32_set_parms(net, tp, base,
 				    rtnl_dereference(n->ht_up), new, tb,
-				    tca[TCA_RATE], ovr);
+				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
 			u32_destroy_key(tp, new, false);
@@ -1084,7 +1085,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	}
 #endif
 
-	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
+	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr,
+			    extack);
 	if (err == 0) {
 		struct tc_u_knode __rcu **ins;
 		struct tc_u_knode *pins;
-- 
cgit v1.2.3


From 571acf2106963d6c1c0ce1ed13e711bd296b2d25 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:53 -0500
Subject: net: sched: cls: add extack support for delete callback

This patch adds extack support for the classifier delete callback API.
This prepares for handling extack inside each specific classifier
implementation.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 3 ++-
 net/sched/cls_api.c       | 2 +-
 net/sched/cls_basic.c     | 3 ++-
 net/sched/cls_bpf.c       | 3 ++-
 net/sched/cls_cgroup.c    | 3 ++-
 net/sched/cls_flow.c      | 3 ++-
 net/sched/cls_flower.c    | 3 ++-
 net/sched/cls_fw.c        | 3 ++-
 net/sched/cls_matchall.c  | 3 ++-
 net/sched/cls_route.c     | 3 ++-
 net/sched/cls_rsvp.h      | 3 ++-
 net/sched/cls_tcindex.c   | 5 +++--
 net/sched/cls_u32.c       | 3 ++-
 13 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ee398bcd46e7..cd1be1f25c36 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -242,7 +242,8 @@ struct tcf_proto_ops {
 					void **, bool,
 					struct netlink_ext_ack *);
 	int			(*delete)(struct tcf_proto *tp, void *arg,
-					  bool *last);
+					  bool *last,
+					  struct netlink_ext_ack *);
 	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
 	void			(*bind_class)(void *, u32, unsigned long);
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f365970dc68c..f5d293416f46 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -955,7 +955,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 		return -EINVAL;
 	}
 
-	err = tp->ops->delete(tp, fh, last);
+	err = tp->ops->delete(tp, fh, last, extack);
 	if (err) {
 		kfree_skb(skb);
 		return err;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index b7bcf67641bf..6088be65d167 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -130,7 +130,8 @@ static void basic_destroy(struct tcf_proto *tp)
 	kfree_rcu(head, rcu);
 }
 
-static int basic_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
+			struct netlink_ext_ack *extack)
 {
 	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f = arg;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c86072779b0a..fc024fc3ec2f 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -295,7 +295,8 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 		__cls_bpf_delete_prog(prog);
 }
 
-static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
+			  struct netlink_ext_ack *extack)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index aaafcf6965f7..1b54fbfca414 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -156,7 +156,8 @@ static void cls_cgroup_destroy(struct tcf_proto *tp)
 	}
 }
 
-static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last,
+			     struct netlink_ext_ack *extack)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 07816133cbb9..64c24b488058 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -575,7 +575,8 @@ err1:
 	return err;
 }
 
-static int flow_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
+		       struct netlink_ext_ack *extack)
 {
 	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f = arg;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 8235ed9143c4..50145b873ff1 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -985,7 +985,8 @@ errout_tb:
 	return err;
 }
 
-static int fl_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
+		     struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f = arg;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 72a924a38753..bd21ed83eb07 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -172,7 +172,8 @@ static void fw_destroy(struct tcf_proto *tp)
 	kfree_rcu(head, rcu);
 }
 
-static int fw_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
+		     struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f = arg;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 9a0901ee6b74..f67d3d7fcf40 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -225,7 +225,8 @@ err_exts_init:
 	return err;
 }
 
-static int mall_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
+		       struct netlink_ext_ack *extack)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 92d683851cb0..55467c30d524 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -316,7 +316,8 @@ static void route4_destroy(struct tcf_proto *tp)
 	kfree_rcu(head, rcu);
 }
 
-static int route4_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
+			 struct netlink_ext_ack *extack)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	struct route4_filter *f = arg;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index c27d23694002..5cc0df690cff 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -350,7 +350,8 @@ static void rsvp_destroy(struct tcf_proto *tp)
 	kfree_rcu(data, rcu);
 }
 
-static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
+		       struct netlink_ext_ack *extack)
 {
 	struct rsvp_head *head = rtnl_dereference(tp->root);
 	struct rsvp_filter *nfp, *f = arg;
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9d6621caa92f..01a163e0b6aa 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -193,7 +193,8 @@ static void tcindex_destroy_fexts(struct rcu_head *head)
 	tcf_queue_work(&f->work);
 }
 
-static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
+			  struct netlink_ext_ack *extack)
 {
 	struct tcindex_data *p = rtnl_dereference(tp->root);
 	struct tcindex_filter_result *r = arg;
@@ -246,7 +247,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp,
 {
 	bool last;
 
-	return tcindex_delete(tp, arg, &last);
+	return tcindex_delete(tp, arg, &last, NULL);
 }
 
 static void __tcindex_destroy(struct rcu_head *head)
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 933058bbd0a9..7f772da8e627 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -672,7 +672,8 @@ static void u32_destroy(struct tcf_proto *tp)
 	tp->data = NULL;
 }
 
-static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
+static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
+		      struct netlink_ext_ack *extack)
 {
 	struct tc_u_hnode *ht = arg;
 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
-- 
cgit v1.2.3


From 1057c55f6b6cdc4fa3e8e29cfb9061c211e58395 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:54 -0500
Subject: net: sched: cls: add extack support for tcf_change_indev

This patch adds extack handling for the tcf_change_indev function, which
is commonly used by TC classifier implementations.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h  | 7 +++++--
 net/sched/cls_flower.c | 7 ++++---
 net/sched/cls_fw.c     | 2 +-
 net/sched/cls_u32.c    | 2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6dd009e10e5d..2e4b8e436d25 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -557,13 +557,16 @@ static inline int tcf_valid_offset(const struct sk_buff *skb,
 #include <net/net_namespace.h>
 
 static inline int
-tcf_change_indev(struct net *net, struct nlattr *indev_tlv)
+tcf_change_indev(struct net *net, struct nlattr *indev_tlv,
+		 struct netlink_ext_ack *extack)
 {
 	char indev[IFNAMSIZ];
 	struct net_device *dev;
 
-	if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ)
+	if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) {
+		NL_SET_ERR_MSG(extack, "Interface name too long");
 		return -EINVAL;
+	}
 	dev = __dev_get_by_name(net, indev);
 	if (!dev)
 		return -ENODEV;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 50145b873ff1..c6ac4a612c4a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -526,13 +526,14 @@ static void fl_set_key_ip(struct nlattr **tb,
 }
 
 static int fl_set_key(struct net *net, struct nlattr **tb,
-		      struct fl_flow_key *key, struct fl_flow_key *mask)
+		      struct fl_flow_key *key, struct fl_flow_key *mask,
+		      struct netlink_ext_ack *extack)
 {
 	__be16 ethertype;
 	int ret = 0;
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FLOWER_INDEV]) {
-		int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
+		int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
 		if (err < 0)
 			return err;
 		key->indev_ifindex = err;
@@ -841,7 +842,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
 		tcf_bind_filter(tp, &f->res, base);
 	}
 
-	err = fl_set_key(net, tb, &f->key, &mask->key);
+	err = fl_set_key(net, tb, &f->key, &mask->key, extack);
 	if (err)
 		return err;
 
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index bd21ed83eb07..94d159a8869a 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -239,7 +239,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FW_INDEV]) {
 		int ret;
-		ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
+		ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
 		if (ret < 0)
 			return ret;
 		f->ifindex = ret;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7f772da8e627..e8963ed35899 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -804,7 +804,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_U32_INDEV]) {
 		int ret;
-		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
+		ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
 		if (ret < 0)
 			return -EINVAL;
 		n->ifindex = ret;
-- 
cgit v1.2.3


From 4b981dbc22727fbb9162ec76a2d0a1c4ecb0831c Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 18 Jan 2018 11:20:55 -0500
Subject: net: sched: cls_u32: add extack support

This patch adds extack support for the u32 classifier as an example for
the delete and init callbacks.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 58 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e8963ed35899..57113e936155 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -689,13 +689,16 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 		goto out;
 	}
 
-	if (root_ht == ht)
+	if (root_ht == ht) {
+		NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
 		return -EINVAL;
+	}
 
 	if (ht->refcnt == 1) {
 		ht->refcnt--;
 		u32_destroy_hnode(tp, ht);
 	} else {
+		NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
 		return -EBUSY;
 	}
 
@@ -779,14 +782,18 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
 		struct tc_u_hnode *ht_down = NULL, *ht_old;
 
-		if (TC_U32_KEY(handle))
+		if (TC_U32_KEY(handle)) {
+			NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table");
 			return -EINVAL;
+		}
 
 		if (handle) {
 			ht_down = u32_lookup_ht(ht->tp_c, handle);
 
-			if (!ht_down)
+			if (!ht_down) {
+				NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
 				return -EINVAL;
+			}
 			ht_down->refcnt++;
 		}
 
@@ -910,28 +917,40 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	size_t size;
 #endif
 
-	if (!opt)
-		return handle ? -EINVAL : 0;
+	if (!opt) {
+		if (handle) {
+			NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options");
+			return -EINVAL;
+		} else {
+			return 0;
+		}
+	}
 
-	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL);
+	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack);
 	if (err < 0)
 		return err;
 
 	if (tb[TCA_U32_FLAGS]) {
 		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
-		if (!tc_flags_valid(flags))
+		if (!tc_flags_valid(flags)) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags");
 			return -EINVAL;
+		}
 	}
 
 	n = *arg;
 	if (n) {
 		struct tc_u_knode *new;
 
-		if (TC_U32_KEY(n->handle) == 0)
+		if (TC_U32_KEY(n->handle) == 0) {
+			NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero");
 			return -EINVAL;
+		}
 
-		if (n->flags != flags)
+		if (n->flags != flags) {
+			NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
 			return -EINVAL;
+		}
 
 		new = u32_init_knode(tp, n);
 		if (!new)
@@ -965,10 +984,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (tb[TCA_U32_DIVISOR]) {
 		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
 
-		if (--divisor > 0x100)
+		if (--divisor > 0x100) {
+			NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
 			return -EINVAL;
-		if (TC_U32_KEY(handle))
+		}
+		if (TC_U32_KEY(handle)) {
+			NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table");
 			return -EINVAL;
+		}
 		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
 		if (ht == NULL)
 			return -ENOBUFS;
@@ -1014,20 +1037,26 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			htid = ht->handle;
 		} else {
 			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
-			if (!ht)
+			if (!ht) {
+				NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found");
 				return -EINVAL;
+			}
 		}
 	} else {
 		ht = rtnl_dereference(tp->root);
 		htid = ht->handle;
 	}
 
-	if (ht->divisor < TC_U32_HASH(htid))
+	if (ht->divisor < TC_U32_HASH(htid)) {
+		NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value");
 		return -EINVAL;
+	}
 
 	if (handle) {
-		if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid))
+		if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) {
+			NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch");
 			return -EINVAL;
+		}
 		handle = htid | TC_U32_NODE(handle);
 		err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
 				    handle, handle + 1,
@@ -1038,6 +1067,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		handle = gen_new_kid(ht, htid);
 
 	if (tb[TCA_U32_SEL] == NULL) {
+		NL_SET_ERR_MSG_MOD(extack, "Selector not specified");
 		err = -EINVAL;
 		goto erridr;
 	}
-- 
cgit v1.2.3


From 205c380778d09291668da5267057a80385f89437 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:30 +0100
Subject: bpf: add csum_diff helper to xdp as well

Useful for porting cls_bpf programs w/o increasing program
complexity limits much at the same time, so add the helper
to XDP as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 30fafaaa90fa..e5178851536f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3456,6 +3456,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
 	case BPF_FUNC_xdp_adjust_meta:
-- 
cgit v1.2.3


From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:33 +0100
Subject: bpf: get rid of pure_initcall dependency to enable jits

Having a pure_initcall() callback just to permanently enable BPF
JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave
a small race window in future where JIT is still disabled on boot.
Since we know about the setting at compilation time anyway, just
initialize it properly there. Also consolidate all the individual
bpf_jit_enable variables into a single one and move them under one
location. Moreover, don't allow for setting unspecified garbage
values on them.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c         |  2 --
 arch/arm64/net/bpf_jit_comp.c     |  2 --
 arch/mips/net/bpf_jit.c           |  2 --
 arch/mips/net/ebpf_jit.c          |  2 --
 arch/powerpc/net/bpf_jit_comp.c   |  2 --
 arch/powerpc/net/bpf_jit_comp64.c |  2 --
 arch/s390/net/bpf_jit_comp.c      |  2 --
 arch/sparc/net/bpf_jit_comp_32.c  |  2 --
 arch/sparc/net/bpf_jit_comp_64.c  |  2 --
 arch/x86/net/bpf_jit_comp.c       |  2 --
 kernel/bpf/core.c                 | 19 ++++++++++++-------
 net/core/sysctl_net_core.c        | 18 ++++++++++++------
 net/socket.c                      |  9 ---------
 13 files changed, 24 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 4425189bb24c..a15e7cdf8754 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -25,8 +25,6 @@
 
 #include "bpf_jit_32.h"
 
-int bpf_jit_enable __read_mostly;
-
 #define STACK_OFFSET(k)	(k)
 #define TMP_REG_1	(MAX_BPF_JIT_REG + 0)	/* TEMP Register 1 */
 #define TMP_REG_2	(MAX_BPF_JIT_REG + 1)	/* TEMP Register 2 */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index acaa935ed977..8d456ee6dddc 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -31,8 +31,6 @@
 
 #include "bpf_jit.h"
 
-int bpf_jit_enable __read_mostly;
-
 #define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 #define TCALL_CNT (MAX_BPF_JIT_REG + 2)
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 44b925005dd3..4d8cb9bb8365 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1207,8 +1207,6 @@ jmp_cmp:
 	return 0;
 }
 
-int bpf_jit_enable __read_mostly;
-
 void bpf_jit_compile(struct bpf_prog *fp)
 {
 	struct jit_ctx ctx;
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 97069a1b6f43..4e347030ed2c 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx)
 		(ctx->idx * 4) - 4;
 }
 
-int bpf_jit_enable __read_mostly;
-
 enum which_ebpf_reg {
 	src_reg,
 	src_reg_no_fp,
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index f9941b3b5770..872d1f6dd11e 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -18,8 +18,6 @@
 
 #include "bpf_jit32.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline void bpf_flush_icache(void *start, void *end)
 {
 	smp_wmb();
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 6771c63b2bec..217a78e84865 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -21,8 +21,6 @@
 
 #include "bpf_jit64.h"
 
-int bpf_jit_enable __read_mostly;
-
 static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
 {
 	memset32(area, BREAKPOINT_INSTRUCTION, size/4);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 1dfadbd126f3..e50188773ff3 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -28,8 +28,6 @@
 #include <asm/set_memory.h>
 #include "bpf_jit.h"
 
-int bpf_jit_enable __read_mostly;
-
 struct bpf_jit {
 	u32 seen;		/* Flags to remember seen eBPF instructions */
 	u32 seen_reg[16];	/* Array to remember which registers are used */
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index 09e318eb34ee..3bd8ca95e521 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -11,8 +11,6 @@
 
 #include "bpf_jit_32.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 635fdefd4ae2..50a24d7bd4c5 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -12,8 +12,6 @@
 
 #include "bpf_jit_64.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 87f214fbe66e..b881a979efe1 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -15,8 +15,6 @@
 #include <asm/set_memory.h>
 #include <linux/bpf.h>
 
-int bpf_jit_enable __read_mostly;
-
 /*
  * assembly code in arch/x86/net/bpf_jit.S
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25e723b0dfd4..bc9c5b11d6a9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;
 
-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
 }
 
 #else
-static unsigned int __bpf_prog_ret0(const void *ctx,
-				    const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
 {
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
 	return 0;
 }
 #endif
@@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 #else
-	fp->bpf_func = __bpf_prog_ret0;
+	fp->bpf_func = __bpf_prog_ret0_warn;
 #endif
 
 	/* eBPF JITs can rewrite the program in case constant
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a47ad6cd41c0..6d39b4c01fc6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -25,6 +25,7 @@
 
 static int zero = 0;
 static int one = 1;
+static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
@@ -325,13 +326,14 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-		.proc_handler	= proc_dointvec
-#else
 		.proc_handler	= proc_dointvec_minmax,
+# ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
-#endif
+# else
+		.extra1		= &zero,
+		.extra2		= &two,
+# endif
 	},
 # ifdef CONFIG_HAVE_EBPF_JIT
 	{
@@ -339,14 +341,18 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "bpf_jit_kallsyms",
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
 	},
 # endif
 #endif
diff --git a/net/socket.c b/net/socket.c
index fbfae1ed3ff5..1536515b6437 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2613,15 +2613,6 @@ out_fs:
 
 core_initcall(sock_init);	/* early initcall */
 
-static int __init jit_init(void)
-{
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
-	bpf_jit_enable = 1;
-#endif
-	return 0;
-}
-pure_initcall(jit_init);
-
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-- 
cgit v1.2.3


From 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:34 +0100
Subject: bpf: restrict access to core bpf sysctls

Given BPF reaches far beyond just networking these days, it was
never intended to allow setting and in some cases reading those
knobs out of a user namespace root running without CAP_SYS_ADMIN,
thus tighten such access.

Also the bpf_jit_enable = 2 debugging mode should only be allowed
if kptr_restrict is not set since it otherwise can leak addresses
to the kernel log. Dump a note to the kernel log that this is for
debugging JITs only when enabled.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/sysctl_net_core.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6d39b4c01fc6..f2d0462611c3 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -251,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
+#ifdef CONFIG_BPF_JIT
+static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+					   void __user *buffer, size_t *lenp,
+					   loff_t *ppos)
+{
+	int ret, jit_enable = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &jit_enable;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		if (jit_enable < 2 ||
+		    (jit_enable == 2 && bpf_dump_raw_ok())) {
+			*(int *)table->data = jit_enable;
+			if (jit_enable == 2)
+				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
+		} else {
+			ret = -EPERM;
+		}
+	}
+	return ret;
+}
+
+# ifdef CONFIG_HAVE_EBPF_JIT
+static int
+proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+				    void __user *buffer, size_t *lenp,
+				    loff_t *ppos)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+# endif
+#endif
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -326,7 +366,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_enable,
 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
@@ -341,7 +381,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
@@ -350,7 +390,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-- 
cgit v1.2.3


From 1728a4f2ad6840746a6b1b9f01d652c5842f7e8d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:37 +0100
Subject: bpf: move event_output to const_size_or_zero for xdp/skb as well

Similar rationale as in a60dd35d2e39 ("bpf: change bpf_perf_event_output
arg5 type to ARG_CONST_SIZE_OR_ZERO"), change the type to CONST_SIZE_OR_ZERO
such that we can better deal with optimized code. No changes needed in
bpf_event_output() as it can also deal with 0 size entirely (e.g. as only
wake-up signal with empty frame in perf RB, or packet dumps w/o meta data
as another such possibility).

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index e5178851536f..9b9b70dee208 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2861,7 +2861,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -3150,7 +3150,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
-- 
cgit v1.2.3


From 9cb05f93d618ff414a9d0e982c0c93f3daba7463 Mon Sep 17 00:00:00 2001
From: Christopher Díaz Riveros <chrisadr@gentoo.org>
Date: Wed, 17 Jan 2018 16:03:00 -0500
Subject: debugfs_sta: Remove unneeded semicolons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trivial fix removes unneeded semicolons after switch blocks.

Signed-off-by: Christopher Díaz Riveros <chrisadr@gentoo.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs_sta.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index b15412c21ac9..444ea8d127fe 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -420,7 +420,7 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
 		default:
 			p += scnprintf(p, sizeof(buf) + buf - p,
 				       "\t\tMAX-MPDU-UNKNOWN\n");
-		};
+		}
 		switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
 		case 0:
 			p += scnprintf(p, sizeof(buf) + buf - p,
@@ -438,7 +438,7 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
 			p += scnprintf(p, sizeof(buf) + buf - p,
 				       "\t\tUNKNOWN-MHZ: 0x%x\n",
 				       (vhtc->cap >> 2) & 0x3);
-		};
+		}
 		PFLAG(RXLDPC, "RXLDPC");
 		PFLAG(SHORT_GI_80, "SHORT-GI-80");
 		PFLAG(SHORT_GI_160, "SHORT-GI-160");
-- 
cgit v1.2.3


From b75703de16301b80f1eedecafdf37bb02c9e155f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 22 Jan 2018 10:31:19 +0000
Subject: devlink: fix memory leak on 'resource'

Currently, if the call to devlink_resource_find returns null then
the error exit path does not free the devlink_resource 'resource'
and a memory leak occurs. Fix this by kfree'ing resource on the
error exit path.

Detected by CoverityScan, CID#1464184 ("Resource leak")

Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 66d36705fb9d..18d385ed8237 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3182,6 +3182,7 @@ int devlink_resource_register(struct devlink *devlink,
 			resource_list = &parent_resource->resource_list;
 			resource->parent = parent_resource;
 		} else {
+			kfree(resource);
 			err = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.2.3


From b2d3bcfa26a7a8de41f358a6cae8b848673b3c6e Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Thu, 18 Jan 2018 09:59:13 -0800
Subject: net: core: Expose number of link up/down transitions

Expose the number of times the link has gone UP or DOWN, and
update the "carrier_changes" counter to be the sum of these two counters.
While at it, also update the sysfs-class-net documentation to cover
carrier_changes (3.15), carrier_up_count (4.16) and carrier_down_count
(4.16).

Signed-off-by: David Decotigny <decot@googlers.com>
[Florian:
* rebase
* add documentation
* merge carrier_changes with up/down counters]
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-class-net | 24 ++++++++++++++++++++++++
 include/linux/netdevice.h                 |  6 ++++--
 include/uapi/linux/if_link.h              |  2 ++
 net/core/net-sysfs.c                      | 25 ++++++++++++++++++++++++-
 net/core/rtnetlink.c                      | 13 +++++++++++--
 net/sched/sch_generic.c                   |  4 ++--
 6 files changed, 67 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 6856da99b6f7..2f1788111cd9 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -259,3 +259,27 @@ Contact:	netdev@vger.kernel.org
 Description:
 		Symbolic link to the PHY device this network device is attached
 		to.
+
+What:		/sys/class/net/<iface>/carrier_changes
+Date:		Mar 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		seen a change from UP to DOWN and vice versa
+
+What:		/sys/class/net/<iface>/carrier_up_count
+Date:		Jan 2018
+KernelVersion:	4.16
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		been up
+
+What:		/sys/class/net/<iface>/carrier_down_count
+Date:		Jan 2018
+KernelVersion:	4.16
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		been down
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ed0799a12bf2..837e9cb7e358 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1680,8 +1680,6 @@ struct net_device {
 	unsigned long		base_addr;
 	int			irq;
 
-	atomic_t		carrier_changes;
-
 	/*
 	 *	Some hardware also needs these fields (state,dev_list,
 	 *	napi_list,unreg_list,close_list) but they are not
@@ -1719,6 +1717,10 @@ struct net_device {
 	atomic_long_t		tx_dropped;
 	atomic_long_t		rx_nohandler;
 
+	/* Stats to monitor link on/off, flapping */
+	atomic_t		carrier_up_count;
+	atomic_t		carrier_down_count;
+
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *wireless_handlers;
 	struct iw_public_data	*wireless_data;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f8f04fed6186..8616131e2c61 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -161,6 +161,8 @@ enum {
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
 	IFLA_IF_NETNSID,
+	IFLA_CARRIER_UP_COUNT,
+	IFLA_CARRIER_DOWN_COUNT,
 	__IFLA_MAX
 };
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7bf8b85ade16..c4a28f4667b6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev,
 	struct net_device *netdev = to_net_dev(dev);
 
 	return sprintf(buf, fmt_dec,
-		       atomic_read(&netdev->carrier_changes));
+		       atomic_read(&netdev->carrier_up_count) +
+		       atomic_read(&netdev->carrier_down_count));
 }
 static DEVICE_ATTR_RO(carrier_changes);
 
+static ssize_t carrier_up_count_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+}
+static DEVICE_ATTR_RO(carrier_up_count);
+
+static ssize_t carrier_down_count_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_down_count);
+
 /* read-write attributes */
 
 static int change_mtu(struct net_device *dev, unsigned long new_mtu)
@@ -547,6 +568,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
 	&dev_attr_proto_down.attr,
+	&dev_attr_carrier_up_count.attr,
+	&dev_attr_carrier_down_count.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 16d644a4f974..97874daa1336 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -990,6 +990,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
+	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
+	       + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
 	       + 0;
 }
 
@@ -1551,8 +1553,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
 	    nla_put_ifalias(skb, dev) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
-			atomic_read(&dev->carrier_changes)) ||
-	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+			atomic_read(&dev->carrier_up_count) +
+			atomic_read(&dev->carrier_down_count)) ||
+	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
+	    nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
+			atomic_read(&dev->carrier_up_count)) ||
+	    nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
+			atomic_read(&dev->carrier_down_count)))
 		goto nla_put_failure;
 
 	if (event != IFLA_EVENT_NONE) {
@@ -1656,6 +1663,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_EVENT]		= { .type = NLA_U32 },
 	[IFLA_GROUP]		= { .type = NLA_U32 },
 	[IFLA_IF_NETNSID]	= { .type = NLA_S32 },
+	[IFLA_CARRIER_UP_COUNT]	= { .type = NLA_U32 },
+	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ef8b4ecde2ac..1816bde47256 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -510,7 +510,7 @@ void netif_carrier_on(struct net_device *dev)
 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
-		atomic_inc(&dev->carrier_changes);
+		atomic_inc(&dev->carrier_up_count);
 		linkwatch_fire_event(dev);
 		if (netif_running(dev))
 			__netdev_watchdog_up(dev);
@@ -529,7 +529,7 @@ void netif_carrier_off(struct net_device *dev)
 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
-		atomic_inc(&dev->carrier_changes);
+		atomic_inc(&dev->carrier_down_count);
 		linkwatch_fire_event(dev);
 	}
 }
-- 
cgit v1.2.3


From 03aaa9e2677ee25bb31fb9f54ea72b4dacbe0df8 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Thu, 18 Jan 2018 17:37:45 -0600
Subject: bridge: return boolean instead of integer in br_multicast_is_router

Return statements in functions returning bool should use
true/false instead of 1/0.

This issue was detected with the help of Coccinelle.

Fixes: 85b352693264 ("bridge: Fix build error when IGMP_SNOOPING is not enabled")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_private.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 80559fd11b7e..8e13a64d8c99 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -760,7 +760,7 @@ static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
-	return 0;
+	return false;
 }
 
 static inline bool br_multicast_querier_exists(struct net_bridge *br,
-- 
cgit v1.2.3


From 41002038f946b36dd3b4a73f13d75e8730f81621 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 19 Jan 2018 17:44:43 -0800
Subject: net: sched: cls_flower: propagate extack support for filter offload

Propagate the extack pointer from the `->change()` classifier operation
to the function used for filter replacement in cls_flower. This makes it
possible to use netlink extack messages in the future at replacement
time for this filter, although it is not used at this point.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c6ac4a612c4a..f675a92e1b66 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -235,7 +235,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 static int fl_hw_replace_filter(struct tcf_proto *tp,
 				struct flow_dissector *dissector,
 				struct fl_flow_key *mask,
-				struct cls_fl_filter *f)
+				struct cls_fl_filter *f,
+				struct netlink_ext_ack *extack)
 {
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
@@ -943,7 +944,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		err = fl_hw_replace_filter(tp,
 					   &head->dissector,
 					   &mask.key,
-					   fnew);
+					   fnew,
+					   extack);
 		if (err)
 			goto errout_idr;
 	}
-- 
cgit v1.2.3


From 0279814055a0800aacacd18331bdc4ac4f3d2241 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 19 Jan 2018 17:44:44 -0800
Subject: net: sched: cls_matchall: propagate extack support for filter offload

Propagate the extack pointer from the `->change()` classifier operation
to the function used for filter replacement in cls_matchall. This makes
it possible to use netlink extack messages in the future at replacement
time for this filter, although it is not used at this point.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f67d3d7fcf40..b47929c15792 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -86,7 +86,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 
 static int mall_replace_hw_filter(struct tcf_proto *tp,
 				  struct cls_mall_head *head,
-				  unsigned long cookie)
+				  unsigned long cookie,
+				  struct netlink_ext_ack *extack)
 {
 	struct tc_cls_matchall_offload cls_mall = {};
 	struct tcf_block *block = tp->chain->block;
@@ -205,7 +206,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 		goto err_set_parms;
 
 	if (!tc_skip_hw(new->flags)) {
-		err = mall_replace_hw_filter(tp, new, (unsigned long)new);
+		err = mall_replace_hw_filter(tp, new, (unsigned long)new,
+					     extack);
 		if (err)
 			goto err_replace_hw_filter;
 	}
-- 
cgit v1.2.3


From 10a47e0f0930149664dbc44169edc1fef09c4257 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 19 Jan 2018 17:44:45 -0800
Subject: net: sched: cls_u32: propagate extack support for filter offload

Propagate the extack pointer from the `->change()` classifier operation
to the function used for filter replacement in cls_u32. This makes it
possible to use netlink extack messages in the future at replacement
time for this filter, although it is not used at this point.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 57113e936155..0206c210e25b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -501,7 +501,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 }
 
 static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
-				u32 flags)
+				u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
@@ -543,7 +543,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 }
 
 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
-				u32 flags)
+				u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
@@ -965,7 +965,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
-		err = u32_replace_hw_knode(tp, new, flags);
+		err = u32_replace_hw_knode(tp, new, flags, extack);
 		if (err) {
 			u32_destroy_key(tp, new, false);
 			return err;
@@ -1016,7 +1016,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		ht->prio = tp->prio;
 		idr_init(&ht->handle_idr);
 
-		err = u32_replace_hw_hnode(tp, ht, flags);
+		err = u32_replace_hw_hnode(tp, ht, flags, extack);
 		if (err) {
 			idr_remove_ext(&tp_c->handle_idr, handle);
 			kfree(ht);
@@ -1122,7 +1122,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		struct tc_u_knode __rcu **ins;
 		struct tc_u_knode *pins;
 
-		err = u32_replace_hw_knode(tp, n, flags);
+		err = u32_replace_hw_knode(tp, n, flags, extack);
 		if (err)
 			goto errhw;
 
-- 
cgit v1.2.3


From 631f65ff222211711715aad9a640fb5f954c81f5 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 19 Jan 2018 17:44:46 -0800
Subject: net: sched: cls_bpf: plumb extack support in filter for hardware
 offload

Pass the extack pointer obtained in the `->change()` filter operation to
cls_bpf_offload() and then to cls_bpf_offload_cmd(). This makes it
possible to use this extack pointer in drivers offloading BPF programs
in a future patch.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 988ad45d78b8..cd4194b1d5e4 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -147,7 +147,8 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
 }
 
 static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
-			       struct cls_bpf_prog *oldprog)
+			       struct cls_bpf_prog *oldprog,
+			       struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_bpf_offload cls_bpf = {};
@@ -173,7 +174,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
 	if (prog) {
 		if (err < 0) {
-			cls_bpf_offload_cmd(tp, oldprog, prog);
+			cls_bpf_offload_cmd(tp, oldprog, prog, extack);
 			return err;
 		} else if (err > 0) {
 			tcf_block_offload_inc(block, &prog->gen_flags);
@@ -192,7 +193,8 @@ static u32 cls_bpf_flags(u32 flags)
 }
 
 static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
-			   struct cls_bpf_prog *oldprog)
+			   struct cls_bpf_prog *oldprog,
+			   struct netlink_ext_ack *extack)
 {
 	if (prog && oldprog &&
 	    cls_bpf_flags(prog->gen_flags) !=
@@ -206,7 +208,7 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	if (!prog && !oldprog)
 		return 0;
 
-	return cls_bpf_offload_cmd(tp, prog, oldprog);
+	return cls_bpf_offload_cmd(tp, prog, oldprog, extack);
 }
 
 static void cls_bpf_stop_offload(struct tcf_proto *tp,
@@ -214,7 +216,7 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp,
 {
 	int err;
 
-	err = cls_bpf_offload_cmd(tp, NULL, prog);
+	err = cls_bpf_offload_cmd(tp, NULL, prog, NULL);
 	if (err)
 		pr_err("Stopping hardware offload failed: %d\n", err);
 }
@@ -514,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	if (ret < 0)
 		goto errout_idr;
 
-	ret = cls_bpf_offload(tp, prog, oldprog);
+	ret = cls_bpf_offload(tp, prog, oldprog, extack);
 	if (ret)
 		goto errout_parms;
 
-- 
cgit v1.2.3


From 8f0b425a712b82732127ff7880f92504f20fcc11 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 19 Jan 2018 17:44:47 -0800
Subject: net: sched: add extack support for offload via tc_cls_common_offload

Add extack support for hardware offload of classifiers. In order
to achieve this, a pointer to a struct netlink_ext_ack is added to the
struct tc_cls_common_offload that is passed to the callback for setting
up the classifier. Function tc_cls_common_offload_init() is updated to
support initialization of this new attribute.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    | 5 ++++-
 net/sched/cls_bpf.c      | 4 ++--
 net/sched/cls_flower.c   | 6 +++---
 net/sched/cls_matchall.c | 4 ++--
 net/sched/cls_u32.c      | 8 ++++----
 5 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2e4b8e436d25..f497f622580b 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -602,15 +602,18 @@ struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
 	u32 prio;
+	struct netlink_ext_ack *extack;
 };
 
 static inline void
 tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
-			   const struct tcf_proto *tp)
+			   const struct tcf_proto *tp,
+			   struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
+	cls_common->extack = extack;
 }
 
 struct tc_cls_u32_knode {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index cd4194b1d5e4..c11e0fe23a17 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -159,7 +159,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	skip_sw = prog && tc_skip_sw(prog->gen_flags);
 	obj = prog ?: oldprog;
 
-	tc_cls_common_offload_init(&cls_bpf.common, tp);
+	tc_cls_common_offload_init(&cls_bpf.common, tp, extack);
 	cls_bpf.command = TC_CLSBPF_OFFLOAD;
 	cls_bpf.exts = &obj->exts;
 	cls_bpf.prog = prog ? prog->filter : NULL;
@@ -227,7 +227,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_bpf_offload cls_bpf = {};
 
-	tc_cls_common_offload_init(&cls_bpf.common, tp);
+	tc_cls_common_offload_init(&cls_bpf.common, tp, NULL);
 	cls_bpf.command = TC_CLSBPF_STATS;
 	cls_bpf.exts = &prog->exts;
 	cls_bpf.prog = prog->filter;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f675a92e1b66..727c10378f37 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -223,7 +223,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp);
+	tc_cls_common_offload_init(&cls_flower.common, tp, NULL);
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
@@ -243,7 +243,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(f->flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp);
+	tc_cls_common_offload_init(&cls_flower.common, tp, extack);
 	cls_flower.command = TC_CLSFLOWER_REPLACE;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.dissector = dissector;
@@ -272,7 +272,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp);
+	tc_cls_common_offload_init(&cls_flower.common, tp, NULL);
 	cls_flower.command = TC_CLSFLOWER_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.exts = &f->exts;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index b47929c15792..d990d2a52c6d 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -76,7 +76,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	struct tc_cls_matchall_offload cls_mall = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_mall.common, tp);
+	tc_cls_common_offload_init(&cls_mall.common, tp, NULL);
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
@@ -94,7 +94,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(head->flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_mall.common, tp);
+	tc_cls_common_offload_init(&cls_mall.common, tp, extack);
 	cls_mall.command = TC_CLSMATCHALL_REPLACE;
 	cls_mall.exts = &head->exts;
 	cls_mall.cookie = cookie;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 0206c210e25b..7030240f8826 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp);
+	tc_cls_common_offload_init(&cls_u32.common, tp, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	bool offloaded = false;
 	int err;
 
-	tc_cls_common_offload_init(&cls_u32.common, tp);
+	tc_cls_common_offload_init(&cls_u32.common, tp, extack);
 	cls_u32.command = TC_CLSU32_NEW_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -534,7 +534,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp);
+	tc_cls_common_offload_init(&cls_u32.common, tp, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
@@ -550,7 +550,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	bool skip_sw = tc_skip_sw(flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_u32.common, tp);
+	tc_cls_common_offload_init(&cls_u32.common, tp, extack);
 	cls_u32.command = TC_CLSU32_REPLACE_KNODE;
 	cls_u32.knode.handle = n->handle;
 	cls_u32.knode.fshift = n->fshift;
-- 
cgit v1.2.3


From 5de30d5df95cfda14dfdbd6f8ed5021ab13be79b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 22 Jan 2018 19:14:27 -0800
Subject: net: core: Fix kernel-doc for call_netdevice_notifiers_info()

Remove the @dev comment, since the function has no net_device argument.
This fixes the following kernel-doc warning:
/net/core/dev.c:1707: warning: Excess function parameter 'dev'
description in 'call_netdevice_notifiers_info'

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 94435cd09072..7af0ef425ca3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1694,7 +1694,6 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 /**
  *	call_netdevice_notifiers_info - call all network notifier blocks
  *	@val: value passed unmodified to notifier function
- *	@dev: net_device pointer passed unmodified to notifier function
  *	@info: notifier information data
  *
  *	Call all network notifier blocks.  Parameters and return value
-- 
cgit v1.2.3


From 7a006d5988ebd99922784176d902a335b8eb5321 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 22 Jan 2018 19:14:28 -0800
Subject: net: core: Fix kernel-doc for netdev_upper_link()

Fixes the following warnings:
./net/core/dev.c:6438: warning: No description found for parameter 'extack'
./net/core/dev.c:6461: warning: No description found for parameter 'extack'

Fixes: 42ab19ee9029 ("net: Add extack to upper device linking")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7af0ef425ca3..77795f66c246 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6424,6 +6424,7 @@ rollback:
  * netdev_upper_dev_link - Add a link to the upper device
  * @dev: device
  * @upper_dev: new upper device
+ * @extack: netlink extended ack
  *
  * Adds a link to device which is upper to this one. The caller must hold
  * the RTNL lock. On a failure a negative errno code is returned.
@@ -6445,6 +6446,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  * @upper_dev: new upper device
  * @upper_priv: upper device private
  * @upper_info: upper info to be passed down via notifier
+ * @extack: netlink extended ack
  *
  * Adds a link to device which is upper to this one. In this case, only
  * one master upper device can be linked, although other non-master devices
-- 
cgit v1.2.3


From f53d77e19b6587527a3dd60a0e638f115e5cd7a9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 23 Jan 2018 18:22:25 +0800
Subject: sctp: reset ret in again path in sctp_for_each_transport

Commit 97a6ec4ac021 ("rhashtable: Change rhashtable_walk_start to
return void") initializes ret only once, at its declaration. When
execution jumps back to the again path, the next tsp could be NULL.
Without resetting ret, cb_done would then be called with tsp as NULL.

A kernel crash was caused by this when running sctpdiag testcase
in sctp-tests.

Note that this issue doesn't affect net.git yet.

Fixes: 97a6ec4ac021 ("rhashtable: Change rhashtable_walk_start to return void")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7ff444ecee75..a40fa53c93ef 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4860,9 +4860,10 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
 			    struct net *net, int *pos, void *p) {
 	struct rhashtable_iter hti;
 	struct sctp_transport *tsp;
-	int ret = 0;
+	int ret;
 
 again:
+	ret = 0;
 	sctp_transport_walk_start(&hti);
 
 	tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
-- 
cgit v1.2.3


From 33615367f378fed87aeba27cf86f83bb8d214eaf Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Date: Tue, 23 Jan 2018 16:03:46 +0100
Subject: net: dsa: Support internal phy on 'cpu' port

This adds support for enabling the internal PHY for a 'cpu' port.
It has been tested on GE B850v3, B650v3 and B450v3, which have a
built-in MV88E6240 switch hardwired to a PCIe based network card.
On these machines the internal PHY of the i210 network card and
the Marvell switch are connected to each other and must be enabled
for properly using the switch. While the i210 PHY will be enabled
when the network interface is enabled, the switch's port is not
exposed as network interface. Additionally the mv88e6xxx driver
resets the chip during probe, so the PHY is disabled without this
patch.

Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c     |   7 ++--
 net/dsa/dsa_priv.h |   4 +--
 net/dsa/legacy.c   |   4 +--
 net/dsa/port.c     | 103 ++++++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 86 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 21f9bed11988..adf50fbc4c13 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -271,13 +271,12 @@ static int dsa_port_setup(struct dsa_port *dp)
 		break;
 	case DSA_PORT_TYPE_CPU:
 	case DSA_PORT_TYPE_DSA:
-		err = dsa_port_fixed_link_register_of(dp);
+		err = dsa_port_link_register_of(dp);
 		if (err) {
-			dev_err(ds->dev, "failed to register fixed link for port %d.%d\n",
+			dev_err(ds->dev, "failed to setup link for port %d.%d\n",
 				ds->index, dp->index);
 			return err;
 		}
-
 		break;
 	case DSA_PORT_TYPE_USER:
 		err = dsa_slave_create(dp);
@@ -301,7 +300,7 @@ static void dsa_port_teardown(struct dsa_port *dp)
 		break;
 	case DSA_PORT_TYPE_CPU:
 	case DSA_PORT_TYPE_DSA:
-		dsa_port_fixed_link_unregister_of(dp);
+		dsa_port_link_unregister_of(dp);
 		break;
 	case DSA_PORT_TYPE_USER:
 		if (dp->slave) {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index cefb0c3c6d51..70de7895e5b8 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -166,8 +166,8 @@ int dsa_port_vlan_add(struct dsa_port *dp,
 		      struct switchdev_trans *trans);
 int dsa_port_vlan_del(struct dsa_port *dp,
 		      const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_fixed_link_register_of(struct dsa_port *dp);
-void dsa_port_fixed_link_unregister_of(struct dsa_port *dp);
+int dsa_port_link_register_of(struct dsa_port *dp);
+void dsa_port_link_unregister_of(struct dsa_port *dp);
 
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index aa56d3fb5da4..cb54b81d0bd9 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -86,7 +86,7 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds)
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 
-		ret = dsa_port_fixed_link_register_of(&ds->ports[port]);
+		ret = dsa_port_link_register_of(&ds->ports[port]);
 		if (ret)
 			return ret;
 	}
@@ -275,7 +275,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 	for (port = 0; port < ds->num_ports; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
-		dsa_port_fixed_link_unregister_of(&ds->ports[port]);
+		dsa_port_link_unregister_of(&ds->ports[port]);
 	}
 
 	if (ds->slave_mii_bus && ds->ops->phy_read)
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bb4be2679904..7acc1169d75e 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -273,7 +273,56 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 	return 0;
 }
 
-int dsa_port_fixed_link_register_of(struct dsa_port *dp)
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+	struct device_node *port_dn = dp->dn;
+	struct device_node *phy_dn;
+	struct dsa_switch *ds = dp->ds;
+	struct phy_device *phydev;
+	int port = dp->index;
+	int err = 0;
+
+	phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+	if (!phy_dn)
+		return 0;
+
+	phydev = of_phy_find_device(phy_dn);
+	if (!phydev) {
+		err = -EPROBE_DEFER;
+		goto err_put_of;
+	}
+
+	if (enable) {
+		err = genphy_config_init(phydev);
+		if (err < 0)
+			goto err_put_dev;
+
+		err = genphy_resume(phydev);
+		if (err < 0)
+			goto err_put_dev;
+
+		err = genphy_read_status(phydev);
+		if (err < 0)
+			goto err_put_dev;
+	} else {
+		err = genphy_suspend(phydev);
+		if (err < 0)
+			goto err_put_dev;
+	}
+
+	if (ds->ops->adjust_link)
+		ds->ops->adjust_link(ds, port, phydev);
+
+	dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev));
+
+err_put_dev:
+	put_device(&phydev->mdio.dev);
+err_put_of:
+	of_node_put(phy_dn);
+	return err;
+}
+
+static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
 {
 	struct device_node *dn = dp->dn;
 	struct dsa_switch *ds = dp->ds;
@@ -282,38 +331,44 @@ int dsa_port_fixed_link_register_of(struct dsa_port *dp)
 	int mode;
 	int err;
 
-	if (of_phy_is_fixed_link(dn)) {
-		err = of_phy_register_fixed_link(dn);
-		if (err) {
-			dev_err(ds->dev,
-				"failed to register the fixed PHY of port %d\n",
-				port);
-			return err;
-		}
+	err = of_phy_register_fixed_link(dn);
+	if (err) {
+		dev_err(ds->dev,
+			"failed to register the fixed PHY of port %d\n",
+			port);
+		return err;
+	}
 
-		phydev = of_phy_find_device(dn);
+	phydev = of_phy_find_device(dn);
 
-		mode = of_get_phy_mode(dn);
-		if (mode < 0)
-			mode = PHY_INTERFACE_MODE_NA;
-		phydev->interface = mode;
+	mode = of_get_phy_mode(dn);
+	if (mode < 0)
+		mode = PHY_INTERFACE_MODE_NA;
+	phydev->interface = mode;
 
-		genphy_config_init(phydev);
-		genphy_read_status(phydev);
+	genphy_config_init(phydev);
+	genphy_read_status(phydev);
 
-		if (ds->ops->adjust_link)
-			ds->ops->adjust_link(ds, port, phydev);
+	if (ds->ops->adjust_link)
+		ds->ops->adjust_link(ds, port, phydev);
 
-		put_device(&phydev->mdio.dev);
-	}
+	put_device(&phydev->mdio.dev);
 
 	return 0;
 }
 
-void dsa_port_fixed_link_unregister_of(struct dsa_port *dp)
+int dsa_port_link_register_of(struct dsa_port *dp)
 {
-	struct device_node *dn = dp->dn;
+	if (of_phy_is_fixed_link(dp->dn))
+		return dsa_port_fixed_link_register_of(dp);
+	else
+		return dsa_port_setup_phy_of(dp, true);
+}
 
-	if (of_phy_is_fixed_link(dn))
-		of_phy_deregister_fixed_link(dn);
+void dsa_port_link_unregister_of(struct dsa_port *dp)
+{
+	if (of_phy_is_fixed_link(dp->dn))
+		of_phy_deregister_fixed_link(dp->dn);
+	else
+		dsa_port_setup_phy_of(dp, false);
 }
-- 
cgit v1.2.3


From b76f4189df5c54a892ae54ac23908cc54ae7134f Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Mon, 22 Jan 2018 08:07:19 -0800
Subject: net: link_watch: mark bonding link events urgent

It takes 1 sec for a bond link down notification to reach user-space
when all slaves of the bond go down. 1 sec is too long for
protocol daemons in user-space that rely on bond notifications
to recover (e.g. multichassis LAG implementations in user-space).
Since the link event code already marks team device port link events
as urgent, this patch extends that check to cover all LAG ports and
masters.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/link_watch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 982861607f88..e38e641e98d5 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -92,7 +92,7 @@ static bool linkwatch_urgent_event(struct net_device *dev)
 	if (dev->ifindex != dev_get_iflink(dev))
 		return true;
 
-	if (dev->priv_flags & IFF_TEAM_PORT)
+	if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
 		return true;
 
 	return netif_carrier_ok(dev) &&	qdisc_tx_changing(dev);
-- 
cgit v1.2.3


From f6052cf2fc51772ea51e54c795b9ea234834ad9a Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 22 Jan 2018 18:14:31 +0100
Subject: net/sched: act_csum: use per-core statistics

Use per-CPU counters, like other TC actions do, instead of maintaining one
set of stats across all cores. This allows updating act_csum stats without
the need to protect them using spin_{,un}lock_bh() invocations.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_csum.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index af4b8ec60d9a..df22da365cd9 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -67,7 +67,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_idr_check(tn, parm->index, a, bind)) {
 		ret = tcf_idr_create(tn, parm->index, est, a,
-				     &act_csum_ops, bind, false);
+				     &act_csum_ops, bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -542,9 +542,9 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
 	int action;
 	u32 update_flags;
 
-	spin_lock(&p->tcf_lock);
 	tcf_lastuse_update(&p->tcf_tm);
-	bstats_update(&p->tcf_bstats, skb);
+	bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+	spin_lock(&p->tcf_lock);
 	action = p->tcf_action;
 	update_flags = p->update_flags;
 	spin_unlock(&p->tcf_lock);
@@ -566,9 +566,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
 	return action;
 
 drop:
-	spin_lock(&p->tcf_lock);
-	p->tcf_qstats.drops++;
-	spin_unlock(&p->tcf_lock);
+	qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
 	return TC_ACT_SHOT;
 }
 
-- 
cgit v1.2.3


From 9c5f69bbd75a7db80578782b037629c5f1e59dce Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 22 Jan 2018 18:14:32 +0100
Subject: net/sched: act_csum: don't use spinlock in the fast path

Use RCU instead of spin_{,un}lock_bh() to protect concurrent read/write on
act_csum configuration, to reduce the effects of contention in the data
path when multiple readers are present.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_csum.h | 16 ++++++++++--
 net/sched/act_csum.c         | 58 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 59 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 781f3433a0be..9470fd7e4350 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -6,10 +6,16 @@
 #include <net/act_api.h>
 #include <linux/tc_act/tc_csum.h>
 
+struct tcf_csum_params {
+	int action;
+	u32 update_flags;
+	struct rcu_head rcu;
+};
+
 struct tcf_csum {
 	struct tc_action common;
 
-	u32 update_flags;
+	struct tcf_csum_params __rcu *params;
 };
 #define to_tcf_csum(a) ((struct tcf_csum *)a)
 
@@ -24,7 +30,13 @@ static inline bool is_tcf_csum(const struct tc_action *a)
 
 static inline u32 tcf_csum_update_flags(const struct tc_action *a)
 {
-	return to_tcf_csum(a)->update_flags;
+	u32 update_flags;
+
+	rcu_read_lock();
+	update_flags = rcu_dereference(to_tcf_csum(a)->params)->update_flags;
+	rcu_read_unlock();
+
+	return update_flags;
 }
 
 #endif /* __NET_TC_CSUM_H */
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index df22da365cd9..b7ba9b06b147 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -49,6 +49,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 			 int bind)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
+	struct tcf_csum_params *params_old, *params_new;
 	struct nlattr *tb[TCA_CSUM_MAX + 1];
 	struct tc_csum *parm;
 	struct tcf_csum *p;
@@ -80,10 +81,21 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	}
 
 	p = to_tcf_csum(*a);
-	spin_lock_bh(&p->tcf_lock);
-	p->tcf_action = parm->action;
-	p->update_flags = parm->update_flags;
-	spin_unlock_bh(&p->tcf_lock);
+	ASSERT_RTNL();
+
+	params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+	if (unlikely(!params_new)) {
+		if (ret == ACT_P_CREATED)
+			tcf_idr_release(*a, bind);
+		return -ENOMEM;
+	}
+	params_old = rtnl_dereference(p->params);
+
+	params_new->action = parm->action;
+	params_new->update_flags = parm->update_flags;
+	rcu_assign_pointer(p->params, params_new);
+	if (params_old)
+		kfree_rcu(params_old, rcu);
 
 	if (ret == ACT_P_CREATED)
 		tcf_idr_insert(tn, *a);
@@ -539,19 +551,21 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
 	struct tcf_csum *p = to_tcf_csum(a);
-	int action;
+	struct tcf_csum_params *params;
 	u32 update_flags;
+	int action;
+
+	rcu_read_lock();
+	params = rcu_dereference(p->params);
 
 	tcf_lastuse_update(&p->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
-	spin_lock(&p->tcf_lock);
-	action = p->tcf_action;
-	update_flags = p->update_flags;
-	spin_unlock(&p->tcf_lock);
 
+	action = params->action;
 	if (unlikely(action == TC_ACT_SHOT))
-		goto drop;
+		goto drop_stats;
 
+	update_flags = params->update_flags;
 	switch (tc_skb_protocol(skb)) {
 	case cpu_to_be16(ETH_P_IP):
 		if (!tcf_csum_ipv4(skb, update_flags))
@@ -563,11 +577,16 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
 		break;
 	}
 
+unlock:
+	rcu_read_unlock();
 	return action;
 
 drop:
+	action = TC_ACT_SHOT;
+
+drop_stats:
 	qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
-	return TC_ACT_SHOT;
+	goto unlock;
 }
 
 static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -575,15 +594,18 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_csum *p = to_tcf_csum(a);
+	struct tcf_csum_params *params;
 	struct tc_csum opt = {
-		.update_flags = p->update_flags,
 		.index   = p->tcf_index,
-		.action  = p->tcf_action,
 		.refcnt  = p->tcf_refcnt - ref,
 		.bindcnt = p->tcf_bindcnt - bind,
 	};
 	struct tcf_t t;
 
+	params = rtnl_dereference(p->params);
+	opt.action = params->action;
+	opt.update_flags = params->update_flags;
+
 	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
@@ -598,6 +620,15 @@ nla_put_failure:
 	return -1;
 }
 
+static void tcf_csum_cleanup(struct tc_action *a)
+{
+	struct tcf_csum *p = to_tcf_csum(a);
+	struct tcf_csum_params *params;
+
+	params = rcu_dereference_protected(p->params, 1);
+	kfree_rcu(params, rcu);
+}
+
 static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
 			   const struct tc_action_ops *ops)
@@ -621,6 +652,7 @@ static struct tc_action_ops act_csum_ops = {
 	.act		= tcf_csum,
 	.dump		= tcf_csum_dump,
 	.init		= tcf_csum_init,
+	.cleanup	= tcf_csum_cleanup,
 	.walk		= tcf_csum_walker,
 	.lookup		= tcf_csum_search,
 	.size		= sizeof(struct tcf_csum),
-- 
cgit v1.2.3


From 3163c5071f25e36567608cde0df21b198ee5fbeb Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:12 +0100
Subject: net/smc: use local struct sock variables consistently

Cleanup to consistently exploit the local struct sock definitions.
No functional change.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c    | 33 +++++++++++++++++----------------
 net/smc/smc_close.c | 38 ++++++++++++++++++++------------------
 2 files changed, 37 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index daf8075f5a4c..eccccf743b9f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -581,39 +581,39 @@ out_err:
 
 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 {
-	struct sock *sk = &lsmc->sk;
-	struct socket *new_clcsock;
+	struct socket *new_clcsock = NULL;
+	struct sock *lsk = &lsmc->sk;
 	struct sock *new_sk;
 	int rc;
 
-	release_sock(&lsmc->sk);
-	new_sk = smc_sock_alloc(sock_net(sk), NULL);
+	release_sock(lsk);
+	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
 	if (!new_sk) {
 		rc = -ENOMEM;
-		lsmc->sk.sk_err = ENOMEM;
+		lsk->sk_err = ENOMEM;
 		*new_smc = NULL;
-		lock_sock(&lsmc->sk);
+		lock_sock(lsk);
 		goto out;
 	}
 	*new_smc = smc_sk(new_sk);
 
 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
-	lock_sock(&lsmc->sk);
+	lock_sock(lsk);
 	if  (rc < 0) {
-		lsmc->sk.sk_err = -rc;
+		lsk->sk_err = -rc;
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
-		sk->sk_prot->unhash(new_sk);
+		new_sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
 	}
-	if (lsmc->sk.sk_state == SMC_CLOSED) {
+	if (lsk->sk_state == SMC_CLOSED) {
 		if (new_clcsock)
 			sock_release(new_clcsock);
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
-		sk->sk_prot->unhash(new_sk);
+		new_sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -936,11 +936,12 @@ static void smc_tcp_listen_work(struct work_struct *work)
 {
 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
 					     tcp_listen_work);
+	struct sock *lsk = &lsmc->sk;
 	struct smc_sock *new_smc;
 	int rc = 0;
 
-	lock_sock(&lsmc->sk);
-	while (lsmc->sk.sk_state == SMC_LISTEN) {
+	lock_sock(lsk);
+	while (lsk->sk_state == SMC_LISTEN) {
 		rc = smc_clcsock_accept(lsmc, &new_smc);
 		if (rc)
 			goto out;
@@ -949,15 +950,15 @@ static void smc_tcp_listen_work(struct work_struct *work)
 
 		new_smc->listen_smc = lsmc;
 		new_smc->use_fallback = false; /* assume rdma capability first*/
-		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
 		schedule_work(&new_smc->smc_listen_work);
 	}
 
 out:
-	release_sock(&lsmc->sk);
-	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+	release_sock(lsk);
+	lsk->sk_data_ready(lsk); /* no more listening, wake accept */
 }
 
 static int smc_listen(struct socket *sock, int backlog)
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index e194c6cc308a..11793912f001 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -115,36 +115,38 @@ static int smc_close_abort(struct smc_connection *conn)
  */
 static void smc_close_active_abort(struct smc_sock *smc)
 {
+	struct sock *sk = &smc->sk;
+
 	struct smc_cdc_conn_state_flags *txflags =
 		&smc->conn.local_tx_ctrl.conn_state_flags;
 
-	smc->sk.sk_err = ECONNABORTED;
+	sk->sk_err = ECONNABORTED;
 	if (smc->clcsock && smc->clcsock->sk) {
 		smc->clcsock->sk->sk_err = ECONNABORTED;
 		smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
 	}
-	switch (smc->sk.sk_state) {
+	switch (sk->sk_state) {
 	case SMC_INIT:
 	case SMC_ACTIVE:
-		smc->sk.sk_state = SMC_PEERABORTWAIT;
+		sk->sk_state = SMC_PEERABORTWAIT;
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
 		txflags->peer_conn_abort = 1;
 		sock_release(smc->clcsock);
 		if (!smc_cdc_rxed_any_close(&smc->conn))
-			smc->sk.sk_state = SMC_PEERABORTWAIT;
+			sk->sk_state = SMC_PEERABORTWAIT;
 		else
-			smc->sk.sk_state = SMC_CLOSED;
+			sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 		if (!txflags->peer_conn_closed) {
-			smc->sk.sk_state = SMC_PEERABORTWAIT;
+			sk->sk_state = SMC_PEERABORTWAIT;
 			txflags->peer_conn_abort = 1;
 			sock_release(smc->clcsock);
 		} else {
-			smc->sk.sk_state = SMC_CLOSED;
+			sk->sk_state = SMC_CLOSED;
 		}
 		break;
 	case SMC_PROCESSABORT:
@@ -153,7 +155,7 @@ static void smc_close_active_abort(struct smc_sock *smc)
 			txflags->peer_conn_abort = 1;
 			sock_release(smc->clcsock);
 		}
-		smc->sk.sk_state = SMC_CLOSED;
+		sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERFINCLOSEWAIT:
 	case SMC_PEERABORTWAIT:
@@ -161,8 +163,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	}
 
-	sock_set_flag(&smc->sk, SOCK_DEAD);
-	smc->sk.sk_state_change(&smc->sk);
+	sock_set_flag(sk, SOCK_DEAD);
+	sk->sk_state_change(sk);
 }
 
 static inline bool smc_close_sent_any_close(struct smc_connection *conn)
@@ -278,7 +280,7 @@ again:
 	}
 
 	if (old_state != sk->sk_state)
-		sk->sk_state_change(&smc->sk);
+		sk->sk_state_change(sk);
 	return rc;
 }
 
@@ -331,7 +333,7 @@ static void smc_close_passive_work(struct work_struct *work)
 	struct sock *sk = &smc->sk;
 	int old_state;
 
-	lock_sock(&smc->sk);
+	lock_sock(sk);
 	old_state = sk->sk_state;
 
 	if (!conn->alert_token_local) {
@@ -340,7 +342,7 @@ static void smc_close_passive_work(struct work_struct *work)
 		goto wakeup;
 	}
 
-	rxflags = &smc->conn.local_rx_ctrl.conn_state_flags;
+	rxflags = &conn->local_rx_ctrl.conn_state_flags;
 	if (rxflags->peer_conn_abort) {
 		smc_close_passive_abort_received(smc);
 		goto wakeup;
@@ -348,7 +350,7 @@ static void smc_close_passive_work(struct work_struct *work)
 
 	switch (sk->sk_state) {
 	case SMC_INIT:
-		if (atomic_read(&smc->conn.bytes_to_rcv) ||
+		if (atomic_read(&conn->bytes_to_rcv) ||
 		    (rxflags->peer_done_writing &&
 		     !smc_cdc_rxed_any_close(conn)))
 			sk->sk_state = SMC_APPCLOSEWAIT1;
@@ -365,7 +367,7 @@ static void smc_close_passive_work(struct work_struct *work)
 		/* to check for closing */
 	case SMC_PEERCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
-		if (!smc_cdc_rxed_any_close(&smc->conn))
+		if (!smc_cdc_rxed_any_close(conn))
 			break;
 		if (sock_flag(sk, SOCK_DEAD) &&
 		    smc_close_sent_any_close(conn)) {
@@ -394,12 +396,12 @@ wakeup:
 		sk->sk_state_change(sk);
 		if ((sk->sk_state == SMC_CLOSED) &&
 		    (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
-			smc_conn_free(&smc->conn);
+			smc_conn_free(conn);
 			schedule_delayed_work(&smc->sock_put_work,
 					      SMC_CLOSE_SOCK_PUT_DELAY);
 		}
 	}
-	release_sock(&smc->sk);
+	release_sock(sk);
 }
 
 void smc_close_sock_put_work(struct work_struct *work)
@@ -462,7 +464,7 @@ again:
 	}
 
 	if (old_state != sk->sk_state)
-		sk->sk_state_change(&smc->sk);
+		sk->sk_state_change(sk);
 	return rc;
 }
 
-- 
cgit v1.2.3


From 35a6b17847175c4a6bdb0a16c1692627ca56b7f1 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:13 +0100
Subject: net/smc: simplify function smc_clcsock_accept()

Cleanup to avoid duplicate code in smc_clcsock_accept().
No functional change.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index eccccf743b9f..05cbcd3a6f60 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -599,16 +599,9 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 
 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
 	lock_sock(lsk);
-	if  (rc < 0) {
+	if  (rc < 0)
 		lsk->sk_err = -rc;
-		new_sk->sk_state = SMC_CLOSED;
-		sock_set_flag(new_sk, SOCK_DEAD);
-		new_sk->sk_prot->unhash(new_sk);
-		sock_put(new_sk);
-		*new_smc = NULL;
-		goto out;
-	}
-	if (lsk->sk_state == SMC_CLOSED) {
+	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
 		if (new_clcsock)
 			sock_release(new_clcsock);
 		new_sk->sk_state = SMC_CLOSED;
-- 
cgit v1.2.3


From 8429c1343519d7452b8ccd9af9716147512831c9 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:14 +0100
Subject: net/smc: get rid of tx_pend waits in socket closing

There is no need to wait for confirmation of pending tx requests
for a closing connection, since pending tx slots are dismissed
when finishing a connection.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c   | 11 -----------
 net/smc/smc_cdc.h   |  1 -
 net/smc/smc_close.c | 25 -------------------------
 net/smc/smc_wr.c    | 18 ------------------
 net/smc/smc_wr.h    |  2 --
 5 files changed, 57 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index d4155ff6acde..51805334e001 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -57,9 +57,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 			       cdcpend->conn);
 	}
 	smc_tx_sndbuf_nonfull(smc);
-	if (smc->sk.sk_state != SMC_ACTIVE)
-		/* wake up smc_close_wait_tx_pends() */
-		smc->sk.sk_state_change(&smc->sk);
 	bh_unlock_sock(&smc->sk);
 }
 
@@ -155,14 +152,6 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
 				(unsigned long)conn);
 }
 
-bool smc_cdc_tx_has_pending(struct smc_connection *conn)
-{
-	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
-
-	return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
-				     smc_cdc_tx_filter, (unsigned long)conn);
-}
-
 /********************************* receive ***********************************/
 
 static inline bool smc_cdc_before(u16 seq1, u16 seq2)
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 149ceda1b088..ab240b37ad11 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -214,7 +214,6 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
 		     struct smc_cdc_tx_pend *pend);
 int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
-bool smc_cdc_tx_has_pending(struct smc_connection *conn);
 int smc_cdc_init(void) __init;
 
 #endif /* SMC_CDC_H */
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 11793912f001..bc539ccb8fa0 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -19,8 +19,6 @@
 #include "smc_cdc.h"
 #include "smc_close.h"
 
-#define SMC_CLOSE_WAIT_TX_PENDS_TIME		(5 * HZ)
-
 static void smc_close_cleanup_listen(struct sock *parent)
 {
 	struct sock *sk;
@@ -30,26 +28,6 @@ static void smc_close_cleanup_listen(struct sock *parent)
 		smc_close_non_accepted(sk);
 }
 
-static void smc_close_wait_tx_pends(struct smc_sock *smc)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	struct sock *sk = &smc->sk;
-	signed long timeout;
-
-	timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
-	add_wait_queue(sk_sleep(sk), &wait);
-	while (!signal_pending(current) && timeout) {
-		int rc;
-
-		rc = sk_wait_event(sk, &timeout,
-				   !smc_cdc_tx_has_pending(&smc->conn),
-				   &wait);
-		if (rc)
-			break;
-	}
-	remove_wait_queue(sk_sleep(sk), &wait);
-}
-
 /* wait for sndbuf data being transmitted */
 static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
 {
@@ -230,7 +208,6 @@ again:
 			rc = smc_close_final(conn);
 		}
 		sk->sk_state = SMC_CLOSED;
-		smc_close_wait_tx_pends(smc);
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
@@ -251,7 +228,6 @@ again:
 		else
 			/* peer has just issued a shutdown write */
 			sk->sk_state = SMC_PEERFINCLOSEWAIT;
-		smc_close_wait_tx_pends(smc);
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
@@ -271,7 +247,6 @@ again:
 		lock_sock(sk);
 		smc_close_abort(conn);
 		sk->sk_state = SMC_CLOSED;
-		smc_close_wait_tx_pends(smc);
 		break;
 	case SMC_PEERABORTWAIT:
 	case SMC_CLOSED:
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index de4537f66832..a4477b4a9c18 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -319,24 +319,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
 	}
 }
 
-bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
-			   smc_wr_tx_filter filter, unsigned long data)
-{
-	struct smc_wr_tx_pend_priv *tx_pend;
-	struct smc_wr_rx_hdr *wr_rx;
-	int i;
-
-	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
-		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
-		if (wr_rx->type != wr_rx_hdr_type)
-			continue;
-		tx_pend = &link->wr_tx_pends[i].priv;
-		if (filter(tx_pend, data))
-			return true;
-	}
-	return false;
-}
-
 /****************************** receive queue ********************************/
 
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 2acf12b06063..ef0c3494c9cb 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -93,8 +93,6 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 int smc_wr_tx_send(struct smc_link *link,
 		   struct smc_wr_tx_pend_priv *wr_pend_priv);
 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
-bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
-			   smc_wr_tx_filter filter, unsigned long data);
 void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
 			     smc_wr_tx_filter filter,
 			     smc_wr_tx_dismisser dismisser,
-- 
cgit v1.2.3


From 86e780d3a312faad967d2cfd5281f6bae81c0e55 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:15 +0100
Subject: net/smc: make wait for work request uninterruptible

Work requests are needed for every ib_post_send(), among them the
ib_post_send() to signal closing. If an smc socket program is cancelled,
the smc connections should be cleaned up, and require sending of closing
signals to the peer. This may fail, if a wait for
a free work request is needed, but is cancelled immediately due to the
cancel interrupt. To guarantee notification of the peer, the wait for
a work request is changed to uninterruptible.

In addition, the area that receives work request completion info from
ib_poll_cq() is now cleared before each poll, and _tx_ variable names
are used in the _tx_ routines for the common demultiplexing type in
the header.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_wr.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index a4477b4a9c18..5ed94109d1d6 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -122,6 +122,7 @@ static void smc_wr_tx_tasklet_fn(unsigned long data)
 again:
 	polled++;
 	do {
+		memset(&wc, 0, sizeof(wc));
 		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
 		if (polled == 1) {
 			ib_req_notify_cq(dev->roce_cq_send,
@@ -185,7 +186,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 		if (rc)
 			return rc;
 	} else {
-		rc = wait_event_interruptible_timeout(
+		rc = wait_event_timeout(
 			link->wr_tx_wait,
 			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
 			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
@@ -198,8 +199,6 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 			smc_lgr_terminate(lgr);
 			return -EPIPE;
 		}
-		if (rc == -ERESTARTSYS)
-			return -EINTR;
 		if (idx == link->wr_tx_cnt)
 			return -EPIPE;
 	}
@@ -300,18 +299,18 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
 	return rc;
 }
 
-void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
+void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
 			     smc_wr_tx_filter filter,
 			     smc_wr_tx_dismisser dismisser,
 			     unsigned long data)
 {
 	struct smc_wr_tx_pend_priv *tx_pend;
-	struct smc_wr_rx_hdr *wr_rx;
+	struct smc_wr_rx_hdr *wr_tx;
 	int i;
 
 	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
-		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
-		if (wr_rx->type != wr_rx_hdr_type)
+		wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
+		if (wr_tx->type != wr_tx_hdr_type)
 			continue;
 		tx_pend = &link->wr_tx_pends[i].priv;
 		if (filter(tx_pend, data))
-- 
cgit v1.2.3


From bbb96bf2366e502fd16a8082f723c570e50269e8 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:16 +0100
Subject: net/smc: improve state change handling after close wait

When a socket is closed or shutdown, smc waits for data being transmitted
in certain states. If the state changes during this wait, the close
switch depending on state should be reentered.
In addition, state change is avoided if sending of close or shutdown fails.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index bc539ccb8fa0..1468a2a3cdf4 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -165,9 +165,9 @@ int smc_close_active(struct smc_sock *smc)
 		  0 : sock_flag(sk, SOCK_LINGER) ?
 		      sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
 
-again:
 	old_state = sk->sk_state;
-	switch (old_state) {
+again:
+	switch (sk->sk_state) {
 	case SMC_INIT:
 		sk->sk_state = SMC_CLOSED;
 		if (smc->smc_listen_work.func)
@@ -194,6 +194,8 @@ again:
 		if (sk->sk_state == SMC_ACTIVE) {
 			/* send close request */
 			rc = smc_close_final(conn);
+			if (rc)
+				break;
 			sk->sk_state = SMC_PEERCLOSEWAIT1;
 		} else {
 			/* peer event has changed the state */
@@ -206,6 +208,8 @@ again:
 		    !smc_close_sent_any_close(conn)) {
 			/* just shutdown wr done, send close request */
 			rc = smc_close_final(conn);
+			if (rc)
+				break;
 		}
 		sk->sk_state = SMC_CLOSED;
 		break;
@@ -216,12 +220,13 @@ again:
 		release_sock(sk);
 		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
-		if (sk->sk_err != ECONNABORTED) {
-			/* confirm close from peer */
-			rc = smc_close_final(conn);
-			if (rc)
-				break;
-		}
+		if (sk->sk_state != SMC_APPCLOSEWAIT1 &&
+		    sk->sk_state != SMC_APPCLOSEWAIT2)
+			goto again;
+		/* confirm close from peer */
+		rc = smc_close_final(conn);
+		if (rc)
+			break;
 		if (smc_cdc_rxed_any_close(conn))
 			/* peer has closed the socket already */
 			sk->sk_state = SMC_CLOSED;
@@ -235,6 +240,8 @@ again:
 		    !smc_close_sent_any_close(conn)) {
 			/* just shutdown wr done, send close request */
 			rc = smc_close_final(conn);
+			if (rc)
+				break;
 		}
 		/* peer sending PeerConnectionClosed will cause transition */
 		break;
@@ -401,20 +408,21 @@ int smc_close_shutdown_write(struct smc_sock *smc)
 		  0 : sock_flag(sk, SOCK_LINGER) ?
 		      sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
 
-again:
 	old_state = sk->sk_state;
-	switch (old_state) {
+again:
+	switch (sk->sk_state) {
 	case SMC_ACTIVE:
 		smc_close_stream_wait(smc, timeout);
 		release_sock(sk);
 		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
+		if (sk->sk_state != SMC_ACTIVE)
+			goto again;
 		/* send close wr request */
 		rc = smc_close_wr(conn);
-		if (sk->sk_state == SMC_ACTIVE)
-			sk->sk_state = SMC_PEERCLOSEWAIT1;
-		else
-			goto again;
+		if (rc)
+			break;
+		sk->sk_state = SMC_PEERCLOSEWAIT1;
 		break;
 	case SMC_APPCLOSEWAIT1:
 		/* passive close */
@@ -423,8 +431,12 @@ again:
 		release_sock(sk);
 		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
+		if (sk->sk_state != SMC_APPCLOSEWAIT1)
+			goto again;
 		/* confirm close from peer */
 		rc = smc_close_wr(conn);
+		if (rc)
+			break;
 		sk->sk_state = SMC_APPCLOSEWAIT2;
 		break;
 	case SMC_APPCLOSEWAIT2:
-- 
cgit v1.2.3


From aa377e682d0703452642d4059a94ab7fdaa90b89 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 10:28:17 +0100
Subject: net/smc: continue waiting if peer signals write_shutdown

If the peer sends a shutdown WRITE, this should not affect sending
in general, and waiting for send buffer space in particular.
Stop waiting of the local socket for send buffer space only, if peer
signals closing, but not if peer signals just shutdown WRITE.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 2e50fddf8ce9..fea6482233a6 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -86,7 +86,7 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 			rc = -EPIPE;
 			break;
 		}
-		if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+		if (smc_cdc_rxed_any_close(conn)) {
 			rc = -ECONNRESET;
 			break;
 		}
@@ -107,7 +107,7 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
-			      smc_cdc_rxed_any_close_or_senddone(conn) ||
+			      smc_cdc_rxed_any_close(conn) ||
 			      atomic_read(&conn->sndbuf_space),
 			      &wait);
 	}
-- 
cgit v1.2.3


From 57a5749b0fa3639a228d7c0ac080ee1704abe8fd Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:34 +0000
Subject: pktgen: Add missing !flag parameters

o FLOW_SEQ now can be disabled with pgset "flag !FLOW_SEQ"
o FLOW_SEQ and FLOW_RND are antonyms, as it's shown by pktgen_if_show()
o IPSEC now may be disabled

Note, that IPV6 is enabled with dst6/src6 parameters, not with
a flag parameter.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4fcfcb14e7c6..20f1c873a1ed 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1284,9 +1284,12 @@ static ssize_t pktgen_if_write(struct file *file,
 		else if (strcmp(f, "!SVID_RND") == 0)
 			pkt_dev->flags &= ~F_SVID_RND;
 
-		else if (strcmp(f, "FLOW_SEQ") == 0)
+		else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0)
 			pkt_dev->flags |= F_FLOW_SEQ;
 
+		else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0)
+			pkt_dev->flags &= ~F_FLOW_SEQ;
+
 		else if (strcmp(f, "QUEUE_MAP_RND") == 0)
 			pkt_dev->flags |= F_QUEUE_MAP_RND;
 
@@ -1301,6 +1304,9 @@ static ssize_t pktgen_if_write(struct file *file,
 #ifdef CONFIG_XFRM
 		else if (strcmp(f, "IPSEC") == 0)
 			pkt_dev->flags |= F_IPSEC_ON;
+
+		else if (strcmp(f, "!IPSEC") == 0)
+			pkt_dev->flags &= ~F_IPSEC_ON;
 #endif
 
 		else if (strcmp(f, "!IPV6") == 0)
-- 
cgit v1.2.3


From 6f107c741212ca5fbfb0724571395716420017d6 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:35 +0000
Subject: pktgen: Add behaviour flags macro to generate flags/names

PKT_FLAGS macro will be used to add packet behavior name definitions
to simplify the code that prints/reads pkt flags.
Sorted the array in order of printing the flags in pktgen_if_show()
Note: Renamed IPSEC_ON => IPSEC for simplicity.

No visible behavior change expected.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 57 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 20f1c873a1ed..e6d862214cc0 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -184,25 +184,36 @@
 
 #define func_enter() pr_debug("entering %s\n", __func__);
 
+#define PKT_FLAGS							\
+	pf(IPV6)		/* Interface in IPV6 Mode */		\
+	pf(IPSRC_RND)		/* IP-Src Random  */			\
+	pf(IPDST_RND)		/* IP-Dst Random  */			\
+	pf(TXSIZE_RND)		/* Transmit size is random */		\
+	pf(UDPSRC_RND)		/* UDP-Src Random */			\
+	pf(UDPDST_RND)		/* UDP-Dst Random */			\
+	pf(UDPCSUM)		/* Include UDP checksum */		\
+	pf(NO_TIMESTAMP)	/* Don't timestamp packets (default TS) */ \
+	pf(MPLS_RND)		/* Random MPLS labels */		\
+	pf(QUEUE_MAP_RND)	/* queue map Random */			\
+	pf(QUEUE_MAP_CPU)	/* queue map mirrors smp_processor_id() */ \
+	pf(FLOW_SEQ)		/* Sequential flows */			\
+	pf(IPSEC)		/* ipsec on for flows */		\
+	pf(MACSRC_RND)		/* MAC-Src Random */			\
+	pf(MACDST_RND)		/* MAC-Dst Random */			\
+	pf(VID_RND)		/* Random VLAN ID */			\
+	pf(SVID_RND)		/* Random SVLAN ID */			\
+	pf(NODE)		/* Node memory alloc*/			\
+
+#define pf(flag)		flag##_SHIFT,
+enum pkt_flags {
+	PKT_FLAGS
+};
+#undef pf
+
 /* Device flag bits */
-#define F_IPSRC_RND   (1<<0)	/* IP-Src Random  */
-#define F_IPDST_RND   (1<<1)	/* IP-Dst Random  */
-#define F_UDPSRC_RND  (1<<2)	/* UDP-Src Random */
-#define F_UDPDST_RND  (1<<3)	/* UDP-Dst Random */
-#define F_MACSRC_RND  (1<<4)	/* MAC-Src Random */
-#define F_MACDST_RND  (1<<5)	/* MAC-Dst Random */
-#define F_TXSIZE_RND  (1<<6)	/* Transmit size is random */
-#define F_IPV6        (1<<7)	/* Interface in IPV6 Mode */
-#define F_MPLS_RND    (1<<8)	/* Random MPLS labels */
-#define F_VID_RND     (1<<9)	/* Random VLAN ID */
-#define F_SVID_RND    (1<<10)	/* Random SVLAN ID */
-#define F_FLOW_SEQ    (1<<11)	/* Sequential flows */
-#define F_IPSEC_ON    (1<<12)	/* ipsec on for flows */
-#define F_QUEUE_MAP_RND (1<<13)	/* queue map Random */
-#define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */
-#define F_NODE          (1<<15)	/* Node memory alloc*/
-#define F_UDPCSUM       (1<<16)	/* Include UDP checksum */
-#define F_NO_TIMESTAMP  (1<<17)	/* Don't timestamp packets (default TS) */
+#define pf(flag)		static const __u32 F_##flag = (1<<flag##_SHIFT);
+PKT_FLAGS
+#undef pf
 
 /* Thread control flag bits */
 #define T_STOP        (1<<0)	/* Stop run */
@@ -672,7 +683,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 	}
 
 #ifdef CONFIG_XFRM
-	if (pkt_dev->flags & F_IPSEC_ON) {
+	if (pkt_dev->flags & F_IPSEC) {
 		seq_puts(seq,  "IPSEC  ");
 		if (pkt_dev->spi)
 			seq_printf(seq, "spi:%u", pkt_dev->spi);
@@ -1303,10 +1314,10 @@ static ssize_t pktgen_if_write(struct file *file,
 			pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
 #ifdef CONFIG_XFRM
 		else if (strcmp(f, "IPSEC") == 0)
-			pkt_dev->flags |= F_IPSEC_ON;
+			pkt_dev->flags |= F_IPSEC;
 
 		else if (strcmp(f, "!IPSEC") == 0)
-			pkt_dev->flags &= ~F_IPSEC_ON;
+			pkt_dev->flags &= ~F_IPSEC;
 #endif
 
 		else if (strcmp(f, "!IPV6") == 0)
@@ -2547,7 +2558,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
 				pkt_dev->flows[flow].cur_daddr =
 				    pkt_dev->cur_daddr;
 #ifdef CONFIG_XFRM
-				if (pkt_dev->flags & F_IPSEC_ON)
+				if (pkt_dev->flags & F_IPSEC)
 					get_ipsec_sa(pkt_dev, flow);
 #endif
 				pkt_dev->nflows++;
@@ -2652,7 +2663,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
 static int process_ipsec(struct pktgen_dev *pkt_dev,
 			      struct sk_buff *skb, __be16 protocol)
 {
-	if (pkt_dev->flags & F_IPSEC_ON) {
+	if (pkt_dev->flags & F_IPSEC) {
 		struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
 		int nhead = 0;
 		if (x) {
-- 
cgit v1.2.3


From 99c6d3d20d6287d7dd6e65686dba9e696de91f90 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:36 +0000
Subject: pktgen: Remove brute-force printing of flags

Add macro generated pkt_flag_names array, with a little help of which
the flags can be printed by using an index.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 77 ++++++++++++++-----------------------------------------
 1 file changed, 19 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e6d862214cc0..17fb036d7297 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -215,6 +215,14 @@ enum pkt_flags {
 PKT_FLAGS
 #undef pf
 
+#define pf(flag)		__stringify(flag),
+static char *pkt_flag_names[] = {
+	PKT_FLAGS
+};
+#undef pf
+
+#define NR_PKT_FLAGS		ARRAY_SIZE(pkt_flag_names)
+
 /* Thread control flag bits */
 #define T_STOP        (1<<0)	/* Stop run */
 #define T_RUN         (1<<1)	/* Start run */
@@ -545,6 +553,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 {
 	const struct pktgen_dev *pkt_dev = seq->private;
 	ktime_t stopped;
+	unsigned int i;
 	u64 idle;
 
 	seq_printf(seq,
@@ -606,7 +615,6 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 		   pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
 
 	if (pkt_dev->nr_labels) {
-		unsigned int i;
 		seq_puts(seq, "     mpls: ");
 		for (i = 0; i < pkt_dev->nr_labels; i++)
 			seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
@@ -642,68 +650,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 
 	seq_puts(seq, "     Flags: ");
 
-	if (pkt_dev->flags & F_IPV6)
-		seq_puts(seq, "IPV6  ");
-
-	if (pkt_dev->flags & F_IPSRC_RND)
-		seq_puts(seq, "IPSRC_RND  ");
-
-	if (pkt_dev->flags & F_IPDST_RND)
-		seq_puts(seq, "IPDST_RND  ");
-
-	if (pkt_dev->flags & F_TXSIZE_RND)
-		seq_puts(seq, "TXSIZE_RND  ");
-
-	if (pkt_dev->flags & F_UDPSRC_RND)
-		seq_puts(seq, "UDPSRC_RND  ");
-
-	if (pkt_dev->flags & F_UDPDST_RND)
-		seq_puts(seq, "UDPDST_RND  ");
-
-	if (pkt_dev->flags & F_UDPCSUM)
-		seq_puts(seq, "UDPCSUM  ");
-
-	if (pkt_dev->flags & F_NO_TIMESTAMP)
-		seq_puts(seq, "NO_TIMESTAMP  ");
-
-	if (pkt_dev->flags & F_MPLS_RND)
-		seq_puts(seq,  "MPLS_RND  ");
-
-	if (pkt_dev->flags & F_QUEUE_MAP_RND)
-		seq_puts(seq,  "QUEUE_MAP_RND  ");
-
-	if (pkt_dev->flags & F_QUEUE_MAP_CPU)
-		seq_puts(seq,  "QUEUE_MAP_CPU  ");
+	for (i = 0; i < NR_PKT_FLAGS; i++) {
+		if (i == F_FLOW_SEQ)
+			if (!pkt_dev->cflows)
+				continue;
 
-	if (pkt_dev->cflows) {
-		if (pkt_dev->flags & F_FLOW_SEQ)
-			seq_puts(seq,  "FLOW_SEQ  "); /*in sequence flows*/
-		else
-			seq_puts(seq,  "FLOW_RND  ");
-	}
+		if (pkt_dev->flags & (1 << i))
+			seq_printf(seq, "%s  ", pkt_flag_names[i]);
+		else if (i == F_FLOW_SEQ)
+			seq_puts(seq, "FLOW_RND  ");
 
 #ifdef CONFIG_XFRM
-	if (pkt_dev->flags & F_IPSEC) {
-		seq_puts(seq,  "IPSEC  ");
-		if (pkt_dev->spi)
+		if (i == F_IPSEC && pkt_dev->spi)
 			seq_printf(seq, "spi:%u", pkt_dev->spi);
-	}
 #endif
-
-	if (pkt_dev->flags & F_MACSRC_RND)
-		seq_puts(seq, "MACSRC_RND  ");
-
-	if (pkt_dev->flags & F_MACDST_RND)
-		seq_puts(seq, "MACDST_RND  ");
-
-	if (pkt_dev->flags & F_VID_RND)
-		seq_puts(seq, "VID_RND  ");
-
-	if (pkt_dev->flags & F_SVID_RND)
-		seq_puts(seq, "SVID_RND  ");
-
-	if (pkt_dev->flags & F_NODE)
-		seq_puts(seq, "NODE_ALLOC  ");
+	}
 
 	seq_puts(seq, "\n");
 
-- 
cgit v1.2.3


From 52e12d5daea48c92f0b0354f4cdc127a2c0a3c52 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:37 +0000
Subject: pktgen: Clean read user supplied flag mess

Don't use the error-prone brute-force way.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 144 +++++++++++++++---------------------------------------
 1 file changed, 39 insertions(+), 105 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 17fb036d7297..b8ab5c829511 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -830,6 +830,35 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
 	return i;
 }
 
+static __u32 pktgen_read_flag(const char *f, bool *disable)
+{
+	__u32 i;
+
+	if (f[0] == '!') {
+		*disable = true;
+		f++;
+	}
+
+	for (i = 0; i < NR_PKT_FLAGS; i++) {
+		if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT)
+			continue;
+
+		/* allow only disabling ipv6 flag */
+		if (!*disable && i == IPV6_SHIFT)
+			continue;
+
+		if (strcmp(f, pkt_flag_names[i]) == 0)
+			return 1 << i;
+	}
+
+	if (strcmp(f, "FLOW_RND") == 0) {
+		*disable = !*disable;
+		return F_FLOW_SEQ;
+	}
+
+	return 0;
+}
+
 static ssize_t pktgen_if_write(struct file *file,
 			       const char __user * user_buffer, size_t count,
 			       loff_t * offset)
@@ -1187,7 +1216,10 @@ static ssize_t pktgen_if_write(struct file *file,
 		return count;
 	}
 	if (!strcmp(name, "flag")) {
+		__u32 flag;
 		char f[32];
+		bool disable = false;
+
 		memset(f, 0, 32);
 		len = strn_len(&user_buffer[i], sizeof(f) - 1);
 		if (len < 0)
@@ -1196,113 +1228,15 @@ static ssize_t pktgen_if_write(struct file *file,
 		if (copy_from_user(f, &user_buffer[i], len))
 			return -EFAULT;
 		i += len;
-		if (strcmp(f, "IPSRC_RND") == 0)
-			pkt_dev->flags |= F_IPSRC_RND;
-
-		else if (strcmp(f, "!IPSRC_RND") == 0)
-			pkt_dev->flags &= ~F_IPSRC_RND;
-
-		else if (strcmp(f, "TXSIZE_RND") == 0)
-			pkt_dev->flags |= F_TXSIZE_RND;
-
-		else if (strcmp(f, "!TXSIZE_RND") == 0)
-			pkt_dev->flags &= ~F_TXSIZE_RND;
-
-		else if (strcmp(f, "IPDST_RND") == 0)
-			pkt_dev->flags |= F_IPDST_RND;
-
-		else if (strcmp(f, "!IPDST_RND") == 0)
-			pkt_dev->flags &= ~F_IPDST_RND;
-
-		else if (strcmp(f, "UDPSRC_RND") == 0)
-			pkt_dev->flags |= F_UDPSRC_RND;
-
-		else if (strcmp(f, "!UDPSRC_RND") == 0)
-			pkt_dev->flags &= ~F_UDPSRC_RND;
-
-		else if (strcmp(f, "UDPDST_RND") == 0)
-			pkt_dev->flags |= F_UDPDST_RND;
-
-		else if (strcmp(f, "!UDPDST_RND") == 0)
-			pkt_dev->flags &= ~F_UDPDST_RND;
-
-		else if (strcmp(f, "MACSRC_RND") == 0)
-			pkt_dev->flags |= F_MACSRC_RND;
-
-		else if (strcmp(f, "!MACSRC_RND") == 0)
-			pkt_dev->flags &= ~F_MACSRC_RND;
 
-		else if (strcmp(f, "MACDST_RND") == 0)
-			pkt_dev->flags |= F_MACDST_RND;
+		flag = pktgen_read_flag(f, &disable);
 
-		else if (strcmp(f, "!MACDST_RND") == 0)
-			pkt_dev->flags &= ~F_MACDST_RND;
-
-		else if (strcmp(f, "MPLS_RND") == 0)
-			pkt_dev->flags |= F_MPLS_RND;
-
-		else if (strcmp(f, "!MPLS_RND") == 0)
-			pkt_dev->flags &= ~F_MPLS_RND;
-
-		else if (strcmp(f, "VID_RND") == 0)
-			pkt_dev->flags |= F_VID_RND;
-
-		else if (strcmp(f, "!VID_RND") == 0)
-			pkt_dev->flags &= ~F_VID_RND;
-
-		else if (strcmp(f, "SVID_RND") == 0)
-			pkt_dev->flags |= F_SVID_RND;
-
-		else if (strcmp(f, "!SVID_RND") == 0)
-			pkt_dev->flags &= ~F_SVID_RND;
-
-		else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0)
-			pkt_dev->flags |= F_FLOW_SEQ;
-
-		else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0)
-			pkt_dev->flags &= ~F_FLOW_SEQ;
-
-		else if (strcmp(f, "QUEUE_MAP_RND") == 0)
-			pkt_dev->flags |= F_QUEUE_MAP_RND;
-
-		else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
-			pkt_dev->flags &= ~F_QUEUE_MAP_RND;
-
-		else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
-			pkt_dev->flags |= F_QUEUE_MAP_CPU;
-
-		else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
-			pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
-#ifdef CONFIG_XFRM
-		else if (strcmp(f, "IPSEC") == 0)
-			pkt_dev->flags |= F_IPSEC;
-
-		else if (strcmp(f, "!IPSEC") == 0)
-			pkt_dev->flags &= ~F_IPSEC;
-#endif
-
-		else if (strcmp(f, "!IPV6") == 0)
-			pkt_dev->flags &= ~F_IPV6;
-
-		else if (strcmp(f, "NODE_ALLOC") == 0)
-			pkt_dev->flags |= F_NODE;
-
-		else if (strcmp(f, "!NODE_ALLOC") == 0)
-			pkt_dev->flags &= ~F_NODE;
-
-		else if (strcmp(f, "UDPCSUM") == 0)
-			pkt_dev->flags |= F_UDPCSUM;
-
-		else if (strcmp(f, "!UDPCSUM") == 0)
-			pkt_dev->flags &= ~F_UDPCSUM;
-
-		else if (strcmp(f, "NO_TIMESTAMP") == 0)
-			pkt_dev->flags |= F_NO_TIMESTAMP;
-
-		else if (strcmp(f, "!NO_TIMESTAMP") == 0)
-			pkt_dev->flags &= ~F_NO_TIMESTAMP;
-
-		else {
+		if (flag) {
+			if (disable)
+				pkt_dev->flags &= ~flag;
+			else
+				pkt_dev->flags |= flag;
+		} else {
 			sprintf(pg_result,
 				"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
 				f,
-- 
cgit v1.2.3


From 715df5ecab0f22685930cb8bb0cc70ed8fb9279e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:13 -0800
Subject: net: sched: propagate extack to cls->destroy callbacks

Propagate extack to cls->destroy callbacks when called from
non-error paths.  On error paths pass NULL to avoid overwriting
the failure message.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  3 ++-
 net/sched/cls_api.c       | 15 ++++++++-------
 net/sched/cls_basic.c     |  2 +-
 net/sched/cls_bpf.c       |  3 ++-
 net/sched/cls_cgroup.c    |  3 ++-
 net/sched/cls_flow.c      |  2 +-
 net/sched/cls_flower.c    |  2 +-
 net/sched/cls_fw.c        |  2 +-
 net/sched/cls_matchall.c  |  2 +-
 net/sched/cls_route.c     |  2 +-
 net/sched/cls_rsvp.h      |  2 +-
 net/sched/cls_tcindex.c   |  3 ++-
 net/sched/cls_u32.c       |  2 +-
 13 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cd1be1f25c36..eac43e8ca96d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -233,7 +233,8 @@ struct tcf_proto_ops {
 					    const struct tcf_proto *,
 					    struct tcf_result *);
 	int			(*init)(struct tcf_proto*);
-	void			(*destroy)(struct tcf_proto*);
+	void			(*destroy)(struct tcf_proto *tp,
+					   struct netlink_ext_ack *extack);
 
 	void*			(*get)(struct tcf_proto*, u32 handle);
 	int			(*change)(struct net *net, struct sk_buff *,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f5d293416f46..bcb4ccb5f894 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -172,9 +172,10 @@ errout:
 	return ERR_PTR(err);
 }
 
-static void tcf_proto_destroy(struct tcf_proto *tp)
+static void tcf_proto_destroy(struct tcf_proto *tp,
+			      struct netlink_ext_ack *extack)
 {
-	tp->ops->destroy(tp);
+	tp->ops->destroy(tp, extack);
 	module_put(tp->ops->owner);
 	kfree_rcu(tp, rcu);
 }
@@ -223,7 +224,7 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 	tcf_chain_head_change(chain, NULL);
 	while (tp) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);
-		tcf_proto_destroy(tp);
+		tcf_proto_destroy(tp, NULL);
 		tp = rtnl_dereference(chain->filter_chain);
 		tcf_chain_put(chain);
 	}
@@ -1182,7 +1183,7 @@ replay:
 			tcf_chain_tp_remove(chain, &chain_info, tp);
 			tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 				       RTM_DELTFILTER, false);
-			tcf_proto_destroy(tp);
+			tcf_proto_destroy(tp, extack);
 			err = 0;
 			goto errout;
 		}
@@ -1200,7 +1201,7 @@ replay:
 		case RTM_NEWTFILTER:
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
-					tcf_proto_destroy(tp);
+					tcf_proto_destroy(tp, NULL);
 				NL_SET_ERR_MSG(extack, "Filter already exists");
 				err = -EEXIST;
 				goto errout;
@@ -1214,7 +1215,7 @@ replay:
 				goto errout;
 			if (last) {
 				tcf_chain_tp_remove(chain, &chain_info, tp);
-				tcf_proto_destroy(tp);
+				tcf_proto_destroy(tp, extack);
 			}
 			goto errout;
 		case RTM_GETTFILTER:
@@ -1240,7 +1241,7 @@ replay:
 			       RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
-			tcf_proto_destroy(tp);
+			tcf_proto_destroy(tp, NULL);
 	}
 
 errout:
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6088be65d167..d333f5c5101d 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -112,7 +112,7 @@ static void basic_delete_filter(struct rcu_head *head)
 	tcf_queue_work(&f->work);
 }
 
-static void basic_destroy(struct tcf_proto *tp)
+static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f, *n;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c11e0fe23a17..a562b9a39e71 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -314,7 +314,8 @@ static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
 	return 0;
 }
 
-static void cls_bpf_destroy(struct tcf_proto *tp)
+static void cls_bpf_destroy(struct tcf_proto *tp,
+			    struct netlink_ext_ack *extack)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 	struct cls_bpf_prog *prog, *tmp;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 1b54fbfca414..762da5c0cf5e 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -143,7 +143,8 @@ errout:
 	return err;
 }
 
-static void cls_cgroup_destroy(struct tcf_proto *tp)
+static void cls_cgroup_destroy(struct tcf_proto *tp,
+			       struct netlink_ext_ack *extack)
 {
 	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 64c24b488058..cd5fe383afdd 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -600,7 +600,7 @@ static int flow_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void flow_destroy(struct tcf_proto *tp)
+static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f, *next;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 727c10378f37..213be0e6f1d1 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -315,7 +315,7 @@ static void fl_destroy_rcu(struct rcu_head *rcu)
 	schedule_work(&head->work);
 }
 
-static void fl_destroy(struct tcf_proto *tp)
+static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f, *next;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 94d159a8869a..8b207723fbc2 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -149,7 +149,7 @@ static void fw_delete_filter(struct rcu_head *head)
 	tcf_queue_work(&f->work);
 }
 
-static void fw_destroy(struct tcf_proto *tp)
+static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index d990d2a52c6d..2de2338f4030 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -114,7 +114,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	return 0;
 }
 
-static void mall_destroy(struct tcf_proto *tp)
+static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 55467c30d524..21a03a8ee029 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -281,7 +281,7 @@ static void route4_delete_filter(struct rcu_head *head)
 	tcf_queue_work(&f->work);
 }
 
-static void route4_destroy(struct tcf_proto *tp)
+static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	int h1, h2;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 5cc0df690cff..4f1297657c27 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -322,7 +322,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
 		__rsvp_delete_filter(f);
 }
 
-static void rsvp_destroy(struct tcf_proto *tp)
+static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct rsvp_head *data = rtnl_dereference(tp->root);
 	int h1, h2;
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 01a163e0b6aa..b49cc990a000 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -581,7 +581,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
 	}
 }
 
-static void tcindex_destroy(struct tcf_proto *tp)
+static void tcindex_destroy(struct tcf_proto *tp,
+			    struct netlink_ext_ack *extack)
 {
 	struct tcindex_data *p = rtnl_dereference(tp->root);
 	struct tcf_walker walker;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7030240f8826..98cabe835fd8 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -638,7 +638,7 @@ static bool ht_empty(struct tc_u_hnode *ht)
 	return true;
 }
 
-static void u32_destroy(struct tcf_proto *tp)
+static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
-- 
cgit v1.2.3


From 34832e1c701553ed3eeefe5413fa93d185cff7f4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:14 -0800
Subject: net: sched: prepare for reimplementation of
 tc_cls_common_offload_init()

Rename the tc_cls_common_offload_init() helper function to
tc_cls_common_offload_init_deprecated() and add a new implementation
which also takes flags argument.  We will only set extack if flags
indicate that offload is forced (skip_sw) otherwise driver errors
should be ignored, as they don't influence the overall filter
installation.

Note that we need the tc_skip_sw() helper for the new version, therefore
it is added later in the file.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    | 18 +++++++++++++++---
 net/sched/cls_bpf.c      |  4 ++--
 net/sched/cls_flower.c   |  6 +++---
 net/sched/cls_matchall.c |  4 ++--
 net/sched/cls_u32.c      |  8 ++++----
 5 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2f8f16a4d88e..08815fe9314d 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -606,9 +606,9 @@ struct tc_cls_common_offload {
 };
 
 static inline void
-tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
-			   const struct tcf_proto *tp,
-			   struct netlink_ext_ack *extack)
+tc_cls_common_offload_init_deprecated(struct tc_cls_common_offload *cls_common,
+				      const struct tcf_proto *tp,
+				      struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
@@ -694,6 +694,18 @@ static inline bool tc_in_hw(u32 flags)
 	return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false;
 }
 
+static inline void
+tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
+			   const struct tcf_proto *tp, u32 flags,
+			   struct netlink_ext_ack *extack)
+{
+	cls_common->chain_index = tp->chain->index;
+	cls_common->protocol = tp->protocol;
+	cls_common->prio = tp->prio;
+	if (tc_skip_sw(flags))
+		cls_common->extack = extack;
+}
+
 enum tc_fl_command {
 	TC_CLSFLOWER_REPLACE,
 	TC_CLSFLOWER_DESTROY,
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index a562b9a39e71..0bffb189d646 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -159,7 +159,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	skip_sw = prog && tc_skip_sw(prog->gen_flags);
 	obj = prog ?: oldprog;
 
-	tc_cls_common_offload_init(&cls_bpf.common, tp, extack);
+	tc_cls_common_offload_init_deprecated(&cls_bpf.common, tp, extack);
 	cls_bpf.command = TC_CLSBPF_OFFLOAD;
 	cls_bpf.exts = &obj->exts;
 	cls_bpf.prog = prog ? prog->filter : NULL;
@@ -227,7 +227,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_bpf_offload cls_bpf = {};
 
-	tc_cls_common_offload_init(&cls_bpf.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_bpf.common, tp, NULL);
 	cls_bpf.command = TC_CLSBPF_STATS;
 	cls_bpf.exts = &prog->exts;
 	cls_bpf.prog = prog->filter;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 213be0e6f1d1..3f2654ca8ff7 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -223,7 +223,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, NULL);
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
@@ -243,7 +243,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(f->flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp, extack);
+	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, extack);
 	cls_flower.command = TC_CLSFLOWER_REPLACE;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.dissector = dissector;
@@ -272,7 +272,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, NULL);
 	cls_flower.command = TC_CLSFLOWER_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.exts = &f->exts;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2de2338f4030..a9a535a7a431 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -76,7 +76,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	struct tc_cls_matchall_offload cls_mall = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_mall.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_mall.common, tp, NULL);
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
@@ -94,7 +94,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(head->flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_mall.common, tp, extack);
+	tc_cls_common_offload_init_deprecated(&cls_mall.common, tp, extack);
 	cls_mall.command = TC_CLSMATCHALL_REPLACE;
 	cls_mall.exts = &head->exts;
 	cls_mall.cookie = cookie;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 98cabe835fd8..e2e8d08c4a0d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	bool offloaded = false;
 	int err;
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, extack);
+	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, extack);
 	cls_u32.command = TC_CLSU32_NEW_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -534,7 +534,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, NULL);
+	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
@@ -550,7 +550,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	bool skip_sw = tc_skip_sw(flags);
 	int err;
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, extack);
+	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, extack);
 	cls_u32.command = TC_CLSU32_REPLACE_KNODE;
 	cls_u32.knode.handle = n->handle;
 	cls_u32.knode.fshift = n->fshift;
-- 
cgit v1.2.3


From f558fdea03bf70f23a3ac63d8c7cdd9755797f80 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:15 -0800
Subject: cls_bpf: remove gen_flags from bpf_offload

cls_bpf now guarantees that only device-bound programs are
allowed with skip_sw.  The drivers no longer pay attention to
flags on filter load, therefore the bpf_offload member can be
removed.  If flags are needed again they should probably be
added to struct tc_cls_common_offload instead.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 1 -
 net/sched/cls_bpf.c   | 2 --
 2 files changed, 3 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 08815fe9314d..85cee929b9ce 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -748,7 +748,6 @@ struct tc_cls_bpf_offload {
 	struct bpf_prog *oldprog;
 	const char *name;
 	bool exts_integrated;
-	u32 gen_flags;
 };
 
 struct tc_mqprio_qopt_offload {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 0bffb189d646..b8f953d00c46 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -166,7 +166,6 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	cls_bpf.oldprog = oldprog ? oldprog->filter : NULL;
 	cls_bpf.name = obj->bpf_name;
 	cls_bpf.exts_integrated = obj->exts_integrated;
-	cls_bpf.gen_flags = obj->gen_flags;
 
 	if (oldprog)
 		tcf_block_offload_dec(block, &oldprog->gen_flags);
@@ -233,7 +232,6 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	cls_bpf.prog = prog->filter;
 	cls_bpf.name = prog->bpf_name;
 	cls_bpf.exts_integrated = prog->exts_integrated;
-	cls_bpf.gen_flags = prog->gen_flags;
 
 	tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
 }
-- 
cgit v1.2.3


From a6ffd6b5d6ffc1915f599e93bbf87c1cedd6abfc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:16 -0800
Subject: cls_bpf: pass offload flags to tc_cls_common_offload_init()

Pass offload flags to the new implementation of
tc_cls_common_offload_init().  Extack will now only
be set if user requested skip_sw.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b8f953d00c46..323b01f76a4c 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -159,7 +159,8 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	skip_sw = prog && tc_skip_sw(prog->gen_flags);
 	obj = prog ?: oldprog;
 
-	tc_cls_common_offload_init_deprecated(&cls_bpf.common, tp, extack);
+	tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags,
+				   extack);
 	cls_bpf.command = TC_CLSBPF_OFFLOAD;
 	cls_bpf.exts = &obj->exts;
 	cls_bpf.prog = prog ? prog->filter : NULL;
@@ -226,7 +227,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_bpf_offload cls_bpf = {};
 
-	tc_cls_common_offload_init_deprecated(&cls_bpf.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, NULL);
 	cls_bpf.command = TC_CLSBPF_STATS;
 	cls_bpf.exts = &prog->exts;
 	cls_bpf.prog = prog->filter;
-- 
cgit v1.2.3


From 0e908a450ad6a8445b04ec49807ca5fdc94956dd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:17 -0800
Subject: cls_bpf: propagate extack to offload delete callback

Propagate extack on removal of offloaded filter.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 323b01f76a4c..8e5326bc6440 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -212,11 +212,12 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 }
 
 static void cls_bpf_stop_offload(struct tcf_proto *tp,
-				 struct cls_bpf_prog *prog)
+				 struct cls_bpf_prog *prog,
+				 struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = cls_bpf_offload_cmd(tp, NULL, prog, NULL);
+	err = cls_bpf_offload_cmd(tp, NULL, prog, extack);
 	if (err)
 		pr_err("Stopping hardware offload failed: %d\n", err);
 }
@@ -289,12 +290,13 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
 	tcf_queue_work(&prog->work);
 }
 
-static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+			     struct netlink_ext_ack *extack)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
 	idr_remove_ext(&head->handle_idr, prog->handle);
-	cls_bpf_stop_offload(tp, prog);
+	cls_bpf_stop_offload(tp, prog, extack);
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
 	if (tcf_exts_get_net(&prog->exts))
@@ -308,7 +310,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
-	__cls_bpf_delete(tp, arg);
+	__cls_bpf_delete(tp, arg, extack);
 	*last = list_empty(&head->plist);
 	return 0;
 }
@@ -320,7 +322,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp,
 	struct cls_bpf_prog *prog, *tmp;
 
 	list_for_each_entry_safe(prog, tmp, &head->plist, link)
-		__cls_bpf_delete(tp, prog);
+		__cls_bpf_delete(tp, prog, extack);
 
 	idr_destroy(&head->handle_idr);
 	kfree_rcu(head, rcu);
-- 
cgit v1.2.3


From 93da52b567c24ce31938181fd55755b908cd0a5a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:18 -0800
Subject: cls_matchall: pass offload flags to tc_cls_common_offload_init()

Pass offload flags to the new implementation of
tc_cls_common_offload_init().  Extack will now only
be set if user requested skip_sw.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index a9a535a7a431..b0b8627b53d2 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -76,7 +76,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	struct tc_cls_matchall_offload cls_mall = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init_deprecated(&cls_mall.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL);
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
@@ -94,7 +94,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(head->flags);
 	int err;
 
-	tc_cls_common_offload_init_deprecated(&cls_mall.common, tp, extack);
+	tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
 	cls_mall.command = TC_CLSMATCHALL_REPLACE;
 	cls_mall.exts = &head->exts;
 	cls_mall.cookie = cookie;
-- 
cgit v1.2.3


From b505b29f68ba86586fd004d0b67bb6ba65e8d176 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:19 -0800
Subject: cls_matchall: propagate extack to delete callback

Propagate extack on removal of offloaded filter.  Don't pass
extack from error paths.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index b0b8627b53d2..2ba721a590a7 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -71,12 +71,13 @@ static void mall_destroy_rcu(struct rcu_head *rcu)
 
 static void mall_destroy_hw_filter(struct tcf_proto *tp,
 				   struct cls_mall_head *head,
-				   unsigned long cookie)
+				   unsigned long cookie,
+				   struct netlink_ext_ack *extack)
 {
 	struct tc_cls_matchall_offload cls_mall = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL);
+	tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
@@ -102,7 +103,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL,
 			       &cls_mall, skip_sw);
 	if (err < 0) {
-		mall_destroy_hw_filter(tp, head, cookie);
+		mall_destroy_hw_filter(tp, head, cookie, NULL);
 		return err;
 	} else if (err > 0) {
 		tcf_block_offload_inc(block, &head->flags);
@@ -122,7 +123,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 		return;
 
 	if (!tc_skip_hw(head->flags))
-		mall_destroy_hw_filter(tp, head, (unsigned long) head);
+		mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
 
 	if (tcf_exts_get_net(&head->exts))
 		call_rcu(&head->rcu, mall_destroy_rcu);
-- 
cgit v1.2.3


From ea2059409ca1eac35607d194905cf2c6969b3921 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:20 -0800
Subject: cls_flower: pass offload flags to tc_cls_common_offload_init()

Pass offload flags to the new implementation of
tc_cls_common_offload_init().  Extack will now only
be set if user requested skip_sw.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 3f2654ca8ff7..79aa5049f028 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -223,7 +223,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
@@ -243,7 +243,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(f->flags);
 	int err;
 
-	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, extack);
+	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
 	cls_flower.command = TC_CLSFLOWER_REPLACE;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.dissector = dissector;
@@ -272,7 +272,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init_deprecated(&cls_flower.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
 	cls_flower.command = TC_CLSFLOWER_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.exts = &f->exts;
-- 
cgit v1.2.3


From 1b0f80375c683a3baa0bba9b6f6d89cfb180f3dc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:21 -0800
Subject: cls_flower: propagate extack to delete callback

Propagate extack on removal of offloaded filter.  Don't pass
extack from error paths.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 79aa5049f028..dc9acaafc0a8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -218,12 +218,13 @@ static void fl_destroy_filter(struct rcu_head *head)
 	tcf_queue_work(&f->work);
 }
 
-static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
+				 struct netlink_ext_ack *extack)
 {
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 
-	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
+	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
@@ -255,7 +256,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
 			       &cls_flower, skip_sw);
 	if (err < 0) {
-		fl_hw_destroy_filter(tp, f);
+		fl_hw_destroy_filter(tp, f, NULL);
 		return err;
 	} else if (err > 0) {
 		tcf_block_offload_inc(block, &f->flags);
@@ -282,14 +283,15 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 			 &cls_flower, false);
 }
 
-static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
+static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+			struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 
 	idr_remove_ext(&head->handle_idr, f->handle);
 	list_del_rcu(&f->list);
 	if (!tc_skip_hw(f->flags))
-		fl_hw_destroy_filter(tp, f);
+		fl_hw_destroy_filter(tp, f, extack);
 	tcf_unbind_filter(tp, &f->res);
 	if (tcf_exts_get_net(&f->exts))
 		call_rcu(&f->rcu, fl_destroy_filter);
@@ -321,7 +323,7 @@ static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	struct cls_fl_filter *f, *next;
 
 	list_for_each_entry_safe(f, next, &head->filters, list)
-		__fl_delete(tp, f);
+		__fl_delete(tp, f, extack);
 	idr_destroy(&head->handle_idr);
 
 	__module_get(THIS_MODULE);
@@ -958,7 +960,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			rhashtable_remove_fast(&head->ht, &fold->ht_node,
 					       head->ht_params);
 		if (!tc_skip_hw(fold->flags))
-			fl_hw_destroy_filter(tp, fold);
+			fl_hw_destroy_filter(tp, fold, NULL);
 	}
 
 	*arg = fnew;
@@ -997,7 +999,7 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
 	if (!tc_skip_sw(f->flags))
 		rhashtable_remove_fast(&head->ht, &f->ht_node,
 				       head->ht_params);
-	__fl_delete(tp, f);
+	__fl_delete(tp, f, extack);
 	*last = list_empty(&head->filters);
 	return 0;
 }
-- 
cgit v1.2.3


From f40fe58d1355569472d14f2388f34d657641eed9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:22 -0800
Subject: cls_u32: pass offload flags to tc_cls_common_offload_init()

Pass offload flags to the new implementation of
tc_cls_common_offload_init().  Extack will now only
be set if user requested skip_sw.  hnodes need to
hold onto the flags now to be able to reuse them
on filter removal.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e2e8d08c4a0d..21e84abe4226 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -87,6 +87,7 @@ struct tc_u_hnode {
 	unsigned int		divisor;
 	struct idr		handle_idr;
 	struct rcu_head		rcu;
+	u32			flags;
 	/* The 'ht' field MUST be the last field in structure to allow for
 	 * more entries allocated at end of structure.
 	 */
@@ -491,7 +492,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -509,7 +510,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	bool offloaded = false;
 	int err;
 
-	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, extack);
+	tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack);
 	cls_u32.command = TC_CLSU32_NEW_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -534,7 +535,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, NULL);
+	tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, NULL);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
@@ -550,7 +551,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	bool skip_sw = tc_skip_sw(flags);
 	int err;
 
-	tc_cls_common_offload_init_deprecated(&cls_u32.common, tp, extack);
+	tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack);
 	cls_u32.command = TC_CLSU32_REPLACE_KNODE;
 	cls_u32.knode.handle = n->handle;
 	cls_u32.knode.fshift = n->fshift;
@@ -1015,6 +1016,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		ht->handle = handle;
 		ht->prio = tp->prio;
 		idr_init(&ht->handle_idr);
+		ht->flags = flags;
 
 		err = u32_replace_hw_hnode(tp, ht, flags, extack);
 		if (err) {
-- 
cgit v1.2.3


From 458e704d4d6e71ca28fb30eb7583dbb9169f9bc2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 24 Jan 2018 12:54:23 -0800
Subject: cls_u32: propagate extack to delete callback

Propagate extack on removal of offloaded filter.  Don't pass
extack from error paths.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 21e84abe4226..60c892c36a60 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -487,12 +487,13 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 	return 0;
 }
 
-static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+			       struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, NULL);
+	tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack);
 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -518,7 +519,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
-		u32_clear_hw_hnode(tp, h);
+		u32_clear_hw_hnode(tp, h, NULL);
 		return err;
 	} else if (err > 0) {
 		offloaded = true;
@@ -530,12 +531,13 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	return 0;
 }
 
-static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+				struct netlink_ext_ack *extack)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, NULL);
+	tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
@@ -569,7 +571,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
-		u32_remove_hw_knode(tp, n);
+		u32_remove_hw_knode(tp, n, NULL);
 		return err;
 	} else if (err > 0) {
 		tcf_block_offload_inc(block, &n->flags);
@@ -581,7 +583,8 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	return 0;
 }
 
-static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+			    struct netlink_ext_ack *extack)
 {
 	struct tc_u_knode *n;
 	unsigned int h;
@@ -591,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
-			u32_remove_hw_knode(tp, n);
+			u32_remove_hw_knode(tp, n, extack);
 			idr_remove_ext(&ht->handle_idr, n->handle);
 			if (tcf_exts_get_net(&n->exts))
 				call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
@@ -601,7 +604,8 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 	}
 }
 
-static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_hnode __rcu **hn;
@@ -609,14 +613,14 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 
 	WARN_ON(ht->refcnt);
 
-	u32_clear_hnode(tp, ht);
+	u32_clear_hnode(tp, ht, extack);
 
 	hn = &tp_c->hlist;
 	for (phn = rtnl_dereference(*hn);
 	     phn;
 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
 		if (phn == ht) {
-			u32_clear_hw_hnode(tp, ht);
+			u32_clear_hw_hnode(tp, ht, extack);
 			idr_destroy(&ht->handle_idr);
 			idr_remove_ext(&tp_c->handle_idr, ht->handle);
 			RCU_INIT_POINTER(*hn, ht->next);
@@ -647,7 +651,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	WARN_ON(root_ht == NULL);
 
 	if (root_ht && --root_ht->refcnt == 0)
-		u32_destroy_hnode(tp, root_ht);
+		u32_destroy_hnode(tp, root_ht, extack);
 
 	if (--tp_c->refcnt == 0) {
 		struct tc_u_hnode *ht;
@@ -658,7 +662,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 		     ht;
 		     ht = rtnl_dereference(ht->next)) {
 			ht->refcnt--;
-			u32_clear_hnode(tp, ht);
+			u32_clear_hnode(tp, ht, extack);
 		}
 
 		while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
@@ -685,7 +689,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 		goto out;
 
 	if (TC_U32_KEY(ht->handle)) {
-		u32_remove_hw_knode(tp, (struct tc_u_knode *)ht);
+		u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
 		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
 		goto out;
 	}
@@ -697,7 +701,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 
 	if (ht->refcnt == 1) {
 		ht->refcnt--;
-		u32_destroy_hnode(tp, ht);
+		u32_destroy_hnode(tp, ht, extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
 		return -EBUSY;
-- 
cgit v1.2.3


From 5c38bd1b82e1f76f9fa96c1e61c9897cabf1ce45 Mon Sep 17 00:00:00 2001
From: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
Date: Tue, 23 Jan 2018 16:46:24 +1300
Subject: ip_tunnel: Use mark in skb by default

This allows marks set by connmark in iptables
to be used for route lookups.

Signed-off-by: Thomas Winter <thomas.winter@alliedtelesis.co.nz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5ddb1cb52bd4..141f5e865731 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -711,9 +711,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
-			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
-			 tunnel->fwmark);
+	if (tunnel->fwmark) {
+		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
+				 tunnel->fwmark);
+	}
+	else {
+		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
+				 skb->mark);
+	}
 
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 		goto tx_error;
-- 
cgit v1.2.3


From 36fd633ec98acd2028585c22128fcaa3da6d5770 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jun 2017 13:19:16 -0400
Subject: net: separate SIOCGIFCONF handling from dev_ioctl()

Only two of dev_ioctl() callers may pass SIOCGIFCONF to it.
Separating that codepath from the rest of dev_ioctl() allows both
to simplify dev_ioctl() itself (all other cases work with struct ifreq *)
*and* seriously simplify the compat side of that beast: all it takes
is passing to inet_gifconf() an extra argument - the size of individual
records (sizeof(struct ifreq) or sizeof(struct compat_ifreq)).  With
dev_ifconf() called directly from sock_do_ioctl()/compat_dev_ifconf()
that's easy to arrange.

As a result, the compat side of SIOCGIFCONF doesn't need any
allocations, copy_in_user() back and forth, etc.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/netdevice.h |  4 ++-
 net/core/dev_ioctl.c      | 29 +++++------------
 net/ipv4/devinet.c        | 16 +++++-----
 net/socket.c              | 79 ++++++++++++++---------------------------------
 4 files changed, 42 insertions(+), 86 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 581495f4e487..df5565d0369c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2761,7 +2761,8 @@ static inline bool dev_validate_header(const struct net_device *dev,
 	return false;
 }
 
-typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len);
+typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr,
+			   int len, int size);
 int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
 static inline int unregister_gifconf(unsigned int family)
 {
@@ -3315,6 +3316,7 @@ void netdev_rx_handler_unregister(struct net_device *dev);
 
 bool dev_valid_name(const char *name);
 int dev_ioctl(struct net *net, unsigned int cmd, void __user *);
+int dev_ifconf(struct net *net, struct ifconf *, int);
 int dev_ethtool(struct net *net, struct ifreq *);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *, unsigned int flags);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7e690d0ccd05..5cdec23dd28e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -66,9 +66,8 @@ EXPORT_SYMBOL(register_gifconf);
  *	Thus we will need a 'compatibility mode'.
  */
 
-static int dev_ifconf(struct net *net, char __user *arg)
+int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
 {
-	struct ifconf ifc;
 	struct net_device *dev;
 	char __user *pos;
 	int len;
@@ -79,11 +78,8 @@ static int dev_ifconf(struct net *net, char __user *arg)
 	 *	Fetch the caller's info block.
 	 */
 
-	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
-		return -EFAULT;
-
-	pos = ifc.ifc_buf;
-	len = ifc.ifc_len;
+	pos = ifc->ifc_buf;
+	len = ifc->ifc_len;
 
 	/*
 	 *	Loop over the interfaces, and write an info block for each.
@@ -95,10 +91,10 @@ static int dev_ifconf(struct net *net, char __user *arg)
 			if (gifconf_list[i]) {
 				int done;
 				if (!pos)
-					done = gifconf_list[i](dev, NULL, 0);
+					done = gifconf_list[i](dev, NULL, 0, size);
 				else
 					done = gifconf_list[i](dev, pos + total,
-							       len - total);
+							       len - total, size);
 				if (done < 0)
 					return -EFAULT;
 				total += done;
@@ -109,12 +105,12 @@ static int dev_ifconf(struct net *net, char __user *arg)
 	/*
 	 *	All done.  Write the updated control block back to the caller.
 	 */
-	ifc.ifc_len = total;
+	ifc->ifc_len = total;
 
 	/*
 	 * 	Both BSD and Solaris return 0 here, so we do too.
 	 */
-	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+	return 0;
 }
 
 /*
@@ -412,17 +408,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	int ret;
 	char *colon;
 
-	/* One special case: SIOCGIFCONF takes ifconf argument
-	   and requires shared lock, because it sleeps writing
-	   to user space.
-	 */
-
-	if (cmd == SIOCGIFCONF) {
-		rtnl_lock();
-		ret = dev_ifconf(net, (char __user *) arg);
-		rtnl_unlock();
-		return ret;
-	}
 	if (cmd == SIOCGIFNAME)
 		return dev_ifname(net, (struct ifreq __user *)arg);
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7a93359fbc72..1771549d2438 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1188,22 +1188,25 @@ rarok:
 	goto out;
 }
 
-static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
 {
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 	struct in_ifaddr *ifa;
 	struct ifreq ifr;
 	int done = 0;
 
+	if (WARN_ON(size > sizeof(struct ifreq)))
+		goto out;
+
 	if (!in_dev)
 		goto out;
 
 	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
 		if (!buf) {
-			done += sizeof(ifr);
+			done += size;
 			continue;
 		}
-		if (len < (int) sizeof(ifr))
+		if (len < size)
 			break;
 		memset(&ifr, 0, sizeof(struct ifreq));
 		strcpy(ifr.ifr_name, ifa->ifa_label);
@@ -1212,13 +1215,12 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
 								ifa->ifa_local;
 
-		if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+		if (copy_to_user(buf + done, &ifr, size)) {
 			done = -EFAULT;
 			break;
 		}
-		buf  += sizeof(struct ifreq);
-		len  -= sizeof(struct ifreq);
-		done += sizeof(struct ifreq);
+		len  -= size;
+		done += size;
 	}
 out:
 	return done;
diff --git a/net/socket.c b/net/socket.c
index 1536515b6437..96e5b23a2a2e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -961,10 +961,22 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	 * If this ioctl is unknown try to hand it down
 	 * to the NIC driver.
 	 */
-	if (err == -ENOIOCTLCMD)
-		err = dev_ioctl(net, cmd, argp);
+	if (err != -ENOIOCTLCMD)
+		return err;
 
-	return err;
+	if (cmd == SIOCGIFCONF) {
+		struct ifconf ifc;
+		if (copy_from_user(&ifc, argp, sizeof(struct ifconf)))
+			return -EFAULT;
+		rtnl_lock();
+		err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
+		rtnl_unlock();
+		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
+			err = -EFAULT;
+		return err;
+	}
+
+	return dev_ioctl(net, cmd, argp);
 }
 
 /*
@@ -2673,70 +2685,25 @@ static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32)
 	return 0;
 }
 
-static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
+static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 {
 	struct compat_ifconf ifc32;
 	struct ifconf ifc;
-	struct ifconf __user *uifc;
-	struct compat_ifreq __user *ifr32;
-	struct ifreq __user *ifr;
-	unsigned int i, j;
 	int err;
 
 	if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
 		return -EFAULT;
 
-	memset(&ifc, 0, sizeof(ifc));
-	if (ifc32.ifcbuf == 0) {
-		ifc32.ifc_len = 0;
-		ifc.ifc_len = 0;
-		ifc.ifc_req = NULL;
-		uifc = compat_alloc_user_space(sizeof(struct ifconf));
-	} else {
-		size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) *
-			sizeof(struct ifreq);
-		uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
-		ifc.ifc_len = len;
-		ifr = ifc.ifc_req = (void __user *)(uifc + 1);
-		ifr32 = compat_ptr(ifc32.ifcbuf);
-		for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) {
-			if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq)))
-				return -EFAULT;
-			ifr++;
-			ifr32++;
-		}
-	}
-	if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
-		return -EFAULT;
+	ifc.ifc_len = ifc32.ifc_len;
+	ifc.ifc_req = compat_ptr(ifc32.ifcbuf);
 
-	err = dev_ioctl(net, SIOCGIFCONF, uifc);
+	rtnl_lock();
+	err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq));
+	rtnl_unlock();
 	if (err)
 		return err;
 
-	if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
-		return -EFAULT;
-
-	ifr = ifc.ifc_req;
-	ifr32 = compat_ptr(ifc32.ifcbuf);
-	for (i = 0, j = 0;
-	     i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
-	     i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) {
-		if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq)))
-			return -EFAULT;
-		ifr32++;
-		ifr++;
-	}
-
-	if (ifc32.ifcbuf == 0) {
-		/* Translate from 64-bit structure multiple to
-		 * a 32-bit one.
-		 */
-		i = ifc.ifc_len;
-		i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq));
-		ifc32.ifc_len = i;
-	} else {
-		ifc32.ifc_len = i;
-	}
+	ifc32.ifc_len = ifc.ifc_len;
 	if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
 		return -EFAULT;
 
@@ -3133,7 +3100,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGIFNAME:
 		return dev_ifname32(net, argp);
 	case SIOCGIFCONF:
-		return dev_ifconf(net, argp);
+		return compat_dev_ifconf(net, argp);
 	case SIOCETHTOOL:
 		return ethtool_ioctl(net, argp);
 	case SIOCWANDEV:
-- 
cgit v1.2.3


From 03aef17bb79b3dc02b1352ee2f55fca799dbad7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jul 2017 07:53:12 -0400
Subject: devinet_ioctl(): take copyin/copyout to caller

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/inetdevice.h |  2 +-
 net/ipv4/af_inet.c         | 21 ++++++++++++++++-----
 net/ipv4/devinet.c         | 41 +++++++++++++++--------------------------
 net/ipv4/ipconfig.c        | 17 +++--------------
 4 files changed, 35 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 1ac5bf95bfdd..e16fe7d44a71 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -173,7 +173,7 @@ static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
 }
 
 int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
-int devinet_ioctl(struct net *net, unsigned int cmd, void __user *);
+int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *);
 void devinet_init(void);
 struct in_device *inetdev_by_index(struct net *, int);
 __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 54cccdd8b1e3..1c2bfee2e249 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -872,6 +872,8 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	struct sock *sk = sock->sk;
 	int err = 0;
 	struct net *net = sock_net(sk);
+	void __user *p = (void __user *)arg;
+	struct ifreq ifr;
 
 	switch (cmd) {
 	case SIOCGSTAMP:
@@ -891,17 +893,26 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		err = arp_ioctl(net, cmd, (void __user *)arg);
 		break;
 	case SIOCGIFADDR:
-	case SIOCSIFADDR:
 	case SIOCGIFBRDADDR:
-	case SIOCSIFBRDADDR:
 	case SIOCGIFNETMASK:
-	case SIOCSIFNETMASK:
 	case SIOCGIFDSTADDR:
+	case SIOCGIFPFLAGS:
+		if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+			return -EFAULT;
+		err = devinet_ioctl(net, cmd, &ifr);
+		if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq)))
+			err = -EFAULT;
+		break;
+
+	case SIOCSIFADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCSIFNETMASK:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFPFLAGS:
-	case SIOCGIFPFLAGS:
 	case SIOCSIFFLAGS:
-		err = devinet_ioctl(net, cmd, (void __user *)arg);
+		if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+			return -EFAULT;
+		err = devinet_ioctl(net, cmd, &ifr);
 		break;
 	default:
 		if (sk->sk_prot->ioctl)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 1771549d2438..e056c0067f2c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -946,11 +946,10 @@ static int inet_abc_len(__be32 addr)
 }
 
 
-int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 {
-	struct ifreq ifr;
 	struct sockaddr_in sin_orig;
-	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
 	struct in_device *in_dev;
 	struct in_ifaddr **ifap = NULL;
 	struct in_ifaddr *ifa = NULL;
@@ -959,22 +958,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	int ret = -EFAULT;
 	int tryaddrmatch = 0;
 
-	/*
-	 *	Fetch the caller's info block into kernel space
-	 */
-
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		goto out;
-	ifr.ifr_name[IFNAMSIZ - 1] = 0;
+	ifr->ifr_name[IFNAMSIZ - 1] = 0;
 
 	/* save original address for comparison */
 	memcpy(&sin_orig, sin, sizeof(*sin));
 
-	colon = strchr(ifr.ifr_name, ':');
+	colon = strchr(ifr->ifr_name, ':');
 	if (colon)
 		*colon = 0;
 
-	dev_load(net, ifr.ifr_name);
+	dev_load(net, ifr->ifr_name);
 
 	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
@@ -1014,7 +1007,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 
 	ret = -ENODEV;
-	dev = __dev_get_by_name(net, ifr.ifr_name);
+	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (!dev)
 		goto done;
 
@@ -1031,7 +1024,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 			   This is checked above. */
 			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
 			     ifap = &ifa->ifa_next) {
-				if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+				if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
 				    sin_orig.sin_addr.s_addr ==
 							ifa->ifa_local) {
 					break; /* found */
@@ -1044,7 +1037,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
 			     ifap = &ifa->ifa_next)
-				if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+				if (!strcmp(ifr->ifr_name, ifa->ifa_label))
 					break;
 		}
 	}
@@ -1056,19 +1049,19 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
 		sin->sin_addr.s_addr = ifa->ifa_local;
-		goto rarok;
+		break;
 
 	case SIOCGIFBRDADDR:	/* Get the broadcast address */
 		sin->sin_addr.s_addr = ifa->ifa_broadcast;
-		goto rarok;
+		break;
 
 	case SIOCGIFDSTADDR:	/* Get the destination address */
 		sin->sin_addr.s_addr = ifa->ifa_address;
-		goto rarok;
+		break;
 
 	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
 		sin->sin_addr.s_addr = ifa->ifa_mask;
-		goto rarok;
+		break;
 
 	case SIOCSIFFLAGS:
 		if (colon) {
@@ -1076,11 +1069,11 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 			if (!ifa)
 				break;
 			ret = 0;
-			if (!(ifr.ifr_flags & IFF_UP))
+			if (!(ifr->ifr_flags & IFF_UP))
 				inet_del_ifa(in_dev, ifap, 1);
 			break;
 		}
-		ret = dev_change_flags(dev, ifr.ifr_flags);
+		ret = dev_change_flags(dev, ifr->ifr_flags);
 		break;
 
 	case SIOCSIFADDR:	/* Set interface address (and family) */
@@ -1095,7 +1088,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 				break;
 			INIT_HLIST_NODE(&ifa->hash);
 			if (colon)
-				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+				memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
 			else
 				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
 		} else {
@@ -1182,10 +1175,6 @@ done:
 	rtnl_unlock();
 out:
 	return ret;
-rarok:
-	rtnl_unlock();
-	ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
-	goto out;
 }
 
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e9e488e72900..6895fff609b1 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -329,17 +329,6 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
 	sin->sin_port = port;
 }
 
-static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg)
-{
-	int res;
-
-	mm_segment_t oldfs = get_fs();
-	set_fs(get_ds());
-	res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
-	set_fs(oldfs);
-	return res;
-}
-
 static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
 {
 	int res;
@@ -375,19 +364,19 @@ static int __init ic_setup_if(void)
 	memset(&ir, 0, sizeof(ir));
 	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
 	set_sockaddr(sin, ic_myaddr, 0);
-	if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
+	if ((err = devinet_ioctl(&init_net, SIOCSIFADDR, &ir)) < 0) {
 		pr_err("IP-Config: Unable to set interface address (%d)\n",
 		       err);
 		return -1;
 	}
 	set_sockaddr(sin, ic_netmask, 0);
-	if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+	if ((err = devinet_ioctl(&init_net, SIOCSIFNETMASK, &ir)) < 0) {
 		pr_err("IP-Config: Unable to set interface netmask (%d)\n",
 		       err);
 		return -1;
 	}
 	set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
-	if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+	if ((err = devinet_ioctl(&init_net, SIOCSIFBRDADDR, &ir)) < 0) {
 		pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
 		       err);
 		return -1;
-- 
cgit v1.2.3


From ca25c30040f93c127ff1651aa636c0174f1e0cdb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jul 2017 08:03:10 -0400
Subject: ip_rt_ioctl(): take copyin to caller

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/route.h     |  2 +-
 net/ipv4/af_inet.c      |  7 ++++++-
 net/ipv4/fib_frontend.c |  8 ++------
 net/ipv4/ipconfig.c     | 13 +------------
 4 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/net/route.h b/include/net/route.h
index d538e6db1afe..1eb9ce470e25 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -217,7 +217,7 @@ unsigned int inet_addr_type_dev_table(struct net *net,
 				      const struct net_device *dev,
 				      __be32 addr);
 void ip_rt_multicast_event(struct in_device *);
-int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
+int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
 struct rtable *rt_dst_alloc(struct net_device *dev,
 			     unsigned int flags, u16 type,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1c2bfee2e249..c24008daa3d8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -874,6 +874,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	struct net *net = sock_net(sk);
 	void __user *p = (void __user *)arg;
 	struct ifreq ifr;
+	struct rtentry rt;
 
 	switch (cmd) {
 	case SIOCGSTAMP:
@@ -884,8 +885,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 	case SIOCADDRT:
 	case SIOCDELRT:
+		if (copy_from_user(&rt, p, sizeof(struct rtentry)))
+			return -EFAULT;
+		err = ip_rt_ioctl(net, cmd, &rt);
+		break;
 	case SIOCRTMSG:
-		err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+		err = -EINVAL;
 		break;
 	case SIOCDARP:
 	case SIOCGARP:
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 08259d078b1c..f05afaf3235c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -587,10 +587,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
  * Handle IP routing ioctl calls.
  * These are used to manipulate the routing tables
  */
-int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 {
 	struct fib_config cfg;
-	struct rtentry rt;
 	int err;
 
 	switch (cmd) {
@@ -599,11 +598,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
-		if (copy_from_user(&rt, arg, sizeof(rt)))
-			return -EFAULT;
-
 		rtnl_lock();
-		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
+		err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 		if (err == 0) {
 			struct fib_table *tb;
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 6895fff609b1..5f396afaa08d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -340,17 +340,6 @@ static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
 	return res;
 }
 
-static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
-{
-	int res;
-
-	mm_segment_t oldfs = get_fs();
-	set_fs(get_ds());
-	res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg);
-	set_fs(oldfs);
-	return res;
-}
-
 /*
  *	Set up interface addresses and routes.
  */
@@ -412,7 +401,7 @@ static int __init ic_setup_routes(void)
 		set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
 		set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
 		rm.rt_flags = RTF_UP | RTF_GATEWAY;
-		if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+		if ((err = ip_rt_ioctl(&init_net, SIOCADDRT, &rm)) < 0) {
 			pr_err("IP-Config: Cannot add default route (%d)\n",
 			       err);
 			return -1;
-- 
cgit v1.2.3


From bf4405737f9f85a06db2b0ce5d76a818b61992e2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Sep 2017 19:31:15 -0400
Subject: kill dev_ifsioc()

Once upon a time net/socket.c:dev_ifsioc() used to handle SIOCSHWTSTAMP and
SIOCSIFMAP.  These have different native and compat layouts, so format
conversion was needed.  In 2009 these two cases were taken out,
turning the rest into a convoluted way of calling sock_do_ioctl().  We copy
the compat structure into a native one, call sock_do_ioctl() on that and copy
the result back for the in/out ioctls.  No layout transformation anywhere,
so we might as well just call sock_do_ioctl() and skip all the headache with
copying.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/socket.c | 38 --------------------------------------
 1 file changed, 38 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 96e5b23a2a2e..fd593a86fa76 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2906,42 +2906,6 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 	return dev_ioctl(net, cmd, u_ifreq64);
 }
 
-static int dev_ifsioc(struct net *net, struct socket *sock,
-			 unsigned int cmd, struct compat_ifreq __user *uifr32)
-{
-	struct ifreq __user *uifr;
-	int err;
-
-	uifr = compat_alloc_user_space(sizeof(*uifr));
-	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
-		return -EFAULT;
-
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
-
-	if (!err) {
-		switch (cmd) {
-		case SIOCGIFFLAGS:
-		case SIOCGIFMETRIC:
-		case SIOCGIFMTU:
-		case SIOCGIFMEM:
-		case SIOCGIFHWADDR:
-		case SIOCGIFINDEX:
-		case SIOCGIFADDR:
-		case SIOCGIFBRDADDR:
-		case SIOCGIFDSTADDR:
-		case SIOCGIFNETMASK:
-		case SIOCGIFPFLAGS:
-		case SIOCGIFTXQLEN:
-		case SIOCGMIIPHY:
-		case SIOCGMIIREG:
-			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
-				err = -EFAULT;
-			break;
-		}
-	}
-	return err;
-}
-
 static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 			struct compat_ifreq __user *uifr32)
 {
@@ -3172,8 +3136,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSMIIREG:
-		return dev_ifsioc(net, sock, cmd, argp);
-
 	case SIOCSARP:
 	case SIOCGARP:
 	case SIOCDARP:
-- 
cgit v1.2.3


From f92d4fc95341cd6634c0fcd6d7cc201c9c1126fb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Sep 2017 19:32:17 -0400
Subject: kill bond_ioctl()

Same story as with dev_ifsioc(), except that the last cases with non-trivial
conversions had been taken out in 2013...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/socket.c | 36 ++++--------------------------------
 1 file changed, 4 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index fd593a86fa76..823520e41c6c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2852,33 +2852,6 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	return dev_ioctl(net, SIOCWANDEV, uifr);
 }
 
-static int bond_ioctl(struct net *net, unsigned int cmd,
-			 struct compat_ifreq __user *ifr32)
-{
-	struct ifreq kifr;
-	mm_segment_t old_fs;
-	int err;
-
-	switch (cmd) {
-	case SIOCBONDENSLAVE:
-	case SIOCBONDRELEASE:
-	case SIOCBONDSETHWADDR:
-	case SIOCBONDCHANGEACTIVE:
-		if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq)))
-			return -EFAULT;
-
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		err = dev_ioctl(net, cmd,
-				(struct ifreq __user __force *) &kifr);
-		set_fs(old_fs);
-
-		return err;
-	default:
-		return -ENOIOCTLCMD;
-	}
-}
-
 /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
 static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 				 struct compat_ifreq __user *u_ifreq32)
@@ -3072,11 +3045,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGIFMAP:
 	case SIOCSIFMAP:
 		return compat_sioc_ifmap(net, cmd, argp);
-	case SIOCBONDENSLAVE:
-	case SIOCBONDRELEASE:
-	case SIOCBONDSETHWADDR:
-	case SIOCBONDCHANGEACTIVE:
-		return bond_ioctl(net, cmd, argp);
 	case SIOCADDRT:
 	case SIOCDELRT:
 		return routing_ioctl(net, sock, cmd, argp);
@@ -3140,6 +3108,10 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGARP:
 	case SIOCDARP:
 	case SIOCATMARK:
+	case SIOCBONDENSLAVE:
+	case SIOCBONDRELEASE:
+	case SIOCBONDSETHWADDR:
+	case SIOCBONDCHANGEACTIVE:
 		return sock_do_ioctl(net, sock, cmd, arg);
 	}
 
-- 
cgit v1.2.3


From 4cf808e7ac32120b3db95d824acd52f586bfdd1d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Oct 2017 21:12:09 -0400
Subject: kill dev_ifname32()

same story...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/socket.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 823520e41c6c..026271ff2853 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2666,25 +2666,6 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
 	return err;
 }
 
-static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32)
-{
-	struct ifreq __user *uifr;
-	int err;
-
-	uifr = compat_alloc_user_space(sizeof(struct ifreq));
-	if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
-		return -EFAULT;
-
-	err = dev_ioctl(net, SIOCGIFNAME, uifr);
-	if (err)
-		return err;
-
-	if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq)))
-		return -EFAULT;
-
-	return 0;
-}
-
 static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 {
 	struct compat_ifconf ifc32;
@@ -3034,8 +3015,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCSIFBR:
 	case SIOCGIFBR:
 		return old_bridge_ioctl(argp);
-	case SIOCGIFNAME:
-		return dev_ifname32(net, argp);
 	case SIOCGIFCONF:
 		return compat_dev_ifconf(net, argp);
 	case SIOCETHTOOL:
@@ -3112,6 +3091,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
+	case SIOCGIFNAME:
 		return sock_do_ioctl(net, sock, cmd, arg);
 	}
 
-- 
cgit v1.2.3


From b1b0c245067268043e0e832432f3d537a5cae33b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Oct 2017 20:13:08 -0400
Subject: lift handling of SIOCIW... out of dev_ioctl()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/wext.h       |  4 ++--
 net/core/dev_ioctl.c     | 18 ------------------
 net/socket.c             |  2 +-
 net/wireless/wext-core.c | 13 +++++++++----
 4 files changed, 12 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/net/wext.h b/include/net/wext.h
index e51f067fdb3a..aa192a670304 100644
--- a/include/net/wext.h
+++ b/include/net/wext.h
@@ -7,7 +7,7 @@
 struct net;
 
 #ifdef CONFIG_WEXT_CORE
-int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
+int wext_handle_ioctl(struct net *net, unsigned int cmd,
 		      void __user *arg);
 int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
 			     unsigned long arg);
@@ -15,7 +15,7 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
 struct iw_statistics *get_wireless_stats(struct net_device *dev);
 int call_commit_handler(struct net_device *dev);
 #else
-static inline int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
+static inline int wext_handle_ioctl(struct net *net, unsigned int cmd,
 				    void __user *arg)
 {
 	return -EINVAL;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 5cdec23dd28e..d262f159f9fd 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -411,24 +411,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (cmd == SIOCGIFNAME)
 		return dev_ifname(net, (struct ifreq __user *)arg);
 
-	/*
-	 * Take care of Wireless Extensions. Unfortunately struct iwreq
-	 * isn't a proper subset of struct ifreq (it's 8 byte shorter)
-	 * so we need to treat it specially, otherwise applications may
-	 * fault if the struct they're passing happens to land at the
-	 * end of a mapped page.
-	 */
-	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
-		struct iwreq iwr;
-
-		if (copy_from_user(&iwr, arg, sizeof(iwr)))
-			return -EFAULT;
-
-		iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0;
-
-		return wext_handle_ioctl(net, &iwr, cmd, arg);
-	}
-
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
diff --git a/net/socket.c b/net/socket.c
index 026271ff2853..1ad02d9edbef 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1005,7 +1005,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	} else
 #ifdef CONFIG_WEXT_CORE
 	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
-		err = dev_ioctl(net, cmd, argp);
+		err = wext_handle_ioctl(net, cmd, argp);
 	} else
 #endif
 		switch (cmd) {
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6cdb054484d6..9efbfc753347 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1035,18 +1035,23 @@ static int ioctl_standard_call(struct net_device *	dev,
 }
 
 
-int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
-		      void __user *arg)
+int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 {
 	struct iw_request_info info = { .cmd = cmd, .flags = 0 };
+	struct iwreq iwr;
 	int ret;
 
-	ret = wext_ioctl_dispatch(net, iwr, cmd, &info,
+	if (copy_from_user(&iwr, arg, sizeof(iwr)))
+		return -EFAULT;
+
+	iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0;
+
+	ret = wext_ioctl_dispatch(net, &iwr, cmd, &info,
 				  ioctl_standard_call,
 				  ioctl_private_call);
 	if (ret >= 0 &&
 	    IW_IS_GET(cmd) &&
-	    copy_to_user(arg, iwr, sizeof(struct iwreq)))
+	    copy_to_user(arg, &iwr, sizeof(struct iwreq)))
 		return -EFAULT;
 
 	return ret;
-- 
cgit v1.2.3


From 6a88fbe7257282c19c777d5fe310166e5b3089e8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Oct 2017 20:27:01 -0400
Subject: ipconfig: use dev_set_mtu()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/ipv4/ipconfig.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 5f396afaa08d..f75802ad960f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -329,17 +329,6 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
 	sin->sin_port = port;
 }
 
-static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
-{
-	int res;
-
-	mm_segment_t oldfs = get_fs();
-	set_fs(get_ds());
-	res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
-	set_fs(oldfs);
-	return res;
-}
-
 /*
  *	Set up interface addresses and routes.
  */
@@ -375,11 +364,11 @@ static int __init ic_setup_if(void)
 	 * out, we'll try to muddle along.
 	 */
 	if (ic_dev_mtu != 0) {
-		strcpy(ir.ifr_name, ic_dev->dev->name);
-		ir.ifr_mtu = ic_dev_mtu;
-		if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
+		rtnl_lock();
+		if ((err = dev_set_mtu(ic_dev->dev, ic_dev_mtu)) < 0)
 			pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
 			       ic_dev_mtu, err);
+		rtnl_unlock();
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 44c02a2c3dc55835e9f0d8ef73966406cd805001 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Oct 2017 12:59:44 -0400
Subject: dev_ioctl(): move copyin/copyout to callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/netdevice.h |  3 +-
 net/core/dev_ioctl.c      | 85 +++++++++++++------------------------------
 net/socket.c              | 91 +++++++++++++++++++++++------------------------
 3 files changed, 71 insertions(+), 108 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index df5565d0369c..24a62d590350 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3315,7 +3315,8 @@ int netdev_rx_handler_register(struct net_device *dev,
 void netdev_rx_handler_unregister(struct net_device *dev);
 
 bool dev_valid_name(const char *name);
-int dev_ioctl(struct net *net, unsigned int cmd, void __user *);
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+		bool *need_copyout);
 int dev_ifconf(struct net *net, struct ifconf *, int);
 int dev_ethtool(struct net *net, struct ifreq *);
 unsigned int dev_get_flags(const struct net_device *);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index d262f159f9fd..0ab1af04296c 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -18,26 +18,10 @@
  *	match.  --pb
  */
 
-static int dev_ifname(struct net *net, struct ifreq __user *arg)
+static int dev_ifname(struct net *net, struct ifreq *ifr)
 {
-	struct ifreq ifr;
-	int error;
-
-	/*
-	 *	Fetch the caller's info block.
-	 */
-
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		return -EFAULT;
-	ifr.ifr_name[IFNAMSIZ-1] = 0;
-
-	error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
-	if (error)
-		return error;
-
-	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
-		return -EFAULT;
-	return 0;
+	ifr->ifr_name[IFNAMSIZ-1] = 0;
+	return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
 }
 
 static gifconf_func_t *gifconf_list[NPROTO];
@@ -402,24 +386,24 @@ EXPORT_SYMBOL(dev_load);
  *	positive or a negative errno code on error.
  */
 
-int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
 {
-	struct ifreq ifr;
 	int ret;
 	char *colon;
 
+	if (need_copyout)
+		*need_copyout = true;
 	if (cmd == SIOCGIFNAME)
-		return dev_ifname(net, (struct ifreq __user *)arg);
-
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		return -EFAULT;
+		return dev_ifname(net, ifr);
 
-	ifr.ifr_name[IFNAMSIZ-1] = 0;
+	ifr->ifr_name[IFNAMSIZ-1] = 0;
 
-	colon = strchr(ifr.ifr_name, ':');
+	colon = strchr(ifr->ifr_name, ':');
 	if (colon)
 		*colon = 0;
 
+	dev_load(net, ifr->ifr_name);
+
 	/*
 	 *	See which interface the caller is talking about.
 	 */
@@ -439,31 +423,19 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCGIFMAP:
 	case SIOCGIFINDEX:
 	case SIOCGIFTXQLEN:
-		dev_load(net, ifr.ifr_name);
 		rcu_read_lock();
-		ret = dev_ifsioc_locked(net, &ifr, cmd);
+		ret = dev_ifsioc_locked(net, ifr, cmd);
 		rcu_read_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	case SIOCETHTOOL:
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ethtool(net, &ifr);
+		ret = dev_ethtool(net, ifr);
 		rtnl_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	/*
@@ -477,17 +449,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCSIFNAME:
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ifsioc(net, &ifr, cmd);
+		ret = dev_ifsioc(net, ifr, cmd);
 		rtnl_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	/*
@@ -528,10 +494,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		/* fall through */
 	case SIOCBONDSLAVEINFOQUERY:
 	case SIOCBONDINFOQUERY:
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ifsioc(net, &ifr, cmd);
+		ret = dev_ifsioc(net, ifr, cmd);
 		rtnl_unlock();
+		if (need_copyout)
+			*need_copyout = false;
 		return ret;
 
 	case SIOCGIFMEM:
@@ -551,13 +518,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		    cmd == SIOCGHWTSTAMP ||
 		    (cmd >= SIOCDEVPRIVATE &&
 		     cmd <= SIOCDEVPRIVATE + 15)) {
-			dev_load(net, ifr.ifr_name);
 			rtnl_lock();
-			ret = dev_ifsioc(net, &ifr, cmd);
+			ret = dev_ifsioc(net, ifr, cmd);
 			rtnl_unlock();
-			if (!ret && copy_to_user(arg, &ifr,
-						 sizeof(struct ifreq)))
-				ret = -EFAULT;
 			return ret;
 		}
 		return -ENOTTY;
diff --git a/net/socket.c b/net/socket.c
index 1ad02d9edbef..45d51555ce47 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -973,10 +973,17 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 		rtnl_unlock();
 		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
 			err = -EFAULT;
-		return err;
+	} else {
+		struct ifreq ifr;
+		bool need_copyout;
+		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+			return -EFAULT;
+		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+		if (!err && need_copyout)
+			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+				return -EFAULT;
 	}
-
-	return dev_ioctl(net, cmd, argp);
+	return err;
 }
 
 /*
@@ -1000,8 +1007,15 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	sock = file->private_data;
 	sk = sock->sk;
 	net = sock_net(sk);
-	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
-		err = dev_ioctl(net, cmd, argp);
+	if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
+		struct ifreq ifr;
+		bool need_copyout;
+		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+			return -EFAULT;
+		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+		if (!err && need_copyout)
+			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+				return -EFAULT;
 	} else
 #ifdef CONFIG_WEXT_CORE
 	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
@@ -2695,9 +2709,9 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 {
 	struct compat_ethtool_rxnfc __user *compat_rxnfc;
 	bool convert_in = false, convert_out = false;
-	size_t buf_size = ALIGN(sizeof(struct ifreq), 8);
-	struct ethtool_rxnfc __user *rxnfc;
-	struct ifreq __user *ifr;
+	size_t buf_size = 0;
+	struct ethtool_rxnfc __user *rxnfc = NULL;
+	struct ifreq ifr;
 	u32 rule_cnt = 0, actual_rule_cnt;
 	u32 ethcmd;
 	u32 data;
@@ -2734,18 +2748,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 	case ETHTOOL_SRXCLSRLDEL:
 		buf_size += sizeof(struct ethtool_rxnfc);
 		convert_in = true;
+		rxnfc = compat_alloc_user_space(buf_size);
 		break;
 	}
 
-	ifr = compat_alloc_user_space(buf_size);
-	rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8);
-
-	if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
+	if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ))
 		return -EFAULT;
 
-	if (put_user(convert_in ? rxnfc : compat_ptr(data),
-		     &ifr->ifr_ifru.ifru_data))
-		return -EFAULT;
+	ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc;
 
 	if (convert_in) {
 		/* We expect there to be holes between fs.m_ext and
@@ -2773,7 +2783,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 			return -EFAULT;
 	}
 
-	ret = dev_ioctl(net, SIOCETHTOOL, ifr);
+	ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
 	if (ret)
 		return ret;
 
@@ -2814,50 +2824,43 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
 {
-	void __user *uptr;
 	compat_uptr_t uptr32;
-	struct ifreq __user *uifr;
+	struct ifreq ifr;
+	void __user *saved;
+	int err;
 
-	uifr = compat_alloc_user_space(sizeof(*uifr));
-	if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
+	if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq)))
 		return -EFAULT;
 
 	if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
 		return -EFAULT;
 
-	uptr = compat_ptr(uptr32);
-
-	if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc))
-		return -EFAULT;
+	saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
+	ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32);
 
-	return dev_ioctl(net, SIOCWANDEV, uifr);
+	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
+	if (!err) {
+		ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved;
+		if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq)))
+			err = -EFAULT;
+	}
+	return err;
 }
 
 /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
 static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 				 struct compat_ifreq __user *u_ifreq32)
 {
-	struct ifreq __user *u_ifreq64;
-	char tmp_buf[IFNAMSIZ];
-	void __user *data64;
+	struct ifreq ifreq;
 	u32 data32;
 
-	if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
-			   IFNAMSIZ))
+	if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
 		return -EFAULT;
-	if (get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
+	if (get_user(data32, &u_ifreq32->ifr_data))
 		return -EFAULT;
-	data64 = compat_ptr(data32);
+	ifreq.ifr_data = compat_ptr(data32);
 
-	u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
-
-	if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
-			 IFNAMSIZ))
-		return -EFAULT;
-	if (put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	return dev_ioctl(net, cmd, u_ifreq64);
+	return dev_ioctl(net, cmd, &ifreq, NULL);
 }
 
 static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
@@ -2865,7 +2868,6 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 {
 	struct ifreq ifr;
 	struct compat_ifmap __user *uifmap32;
-	mm_segment_t old_fs;
 	int err;
 
 	uifmap32 = &uifr32->ifr_ifru.ifru_map;
@@ -2879,10 +2881,7 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 	if (err)
 		return -EFAULT;
 
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	err = dev_ioctl(net, cmd, (void  __user __force *)&ifr);
-	set_fs(old_fs);
+	err = dev_ioctl(net, cmd, &ifr, NULL);
 
 	if (cmd == SIOCGIFMAP && !err) {
 		err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
-- 
cgit v1.2.3


From 5c59e564e46dcbab2ee7a4e9e0243562a39679a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jul 2017 18:46:30 -0400
Subject: kill kernel_sock_ioctl()

no users since 2014

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/net.h |  1 -
 net/socket.c        | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/net.h b/include/linux/net.h
index caeb159abda5..68acc54976bf 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -306,7 +306,6 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags);
 int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			   size_t size, int flags);
-int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
 int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how);
 
 /* Routine returns the IP overhead imposed by a (caller-protected) socket. */
diff --git a/net/socket.c b/net/socket.c
index 45d51555ce47..11cc2cd0f37b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3245,19 +3245,6 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
 }
 EXPORT_SYMBOL(kernel_sendpage_locked);
 
-int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
-{
-	mm_segment_t oldfs = get_fs();
-	int err;
-
-	set_fs(KERNEL_DS);
-	err = sock->ops->ioctl(sock, cmd, arg);
-	set_fs(oldfs);
-
-	return err;
-}
-EXPORT_SYMBOL(kernel_sock_ioctl);
-
 int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
 {
 	return sock->ops->shutdown(sock, how);
-- 
cgit v1.2.3


From fb07a820fe3fedabffc57863e0f823c912d81bad Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 19 Jan 2018 19:14:53 +0300
Subject: net: Move net:netns_ids destruction out of rtnl_lock() and document
 locking scheme

Currently, we unhash a dying net from the netns_ids lists
under rtnl_lock(). It's a leftover from the time when
net::netns_ids was introduced. There was no net::nsid_lock,
and rtnl_lock() was mostly needed to order modifications
of the nsid idr of alive nets, i.e. for:
	for_each_net(tmp) {
		...
		id = __peernet2id(tmp, net);
		idr_remove(&tmp->netns_ids, id);
		...
	}

Since we have net::nsid_lock, the modifications are
protected by this local lock, and now we may introduce
a better scheme of netns_ids destruction.

Let's look at the functions peernet2id_alloc() and
get_net_ns_by_id(). Previous commits taught these
functions to work well with a dying net acquired from
rtnl-unlocked lists. And they are the only functions
which can hash a net to netns_ids or obtain one from there.
And, as is easy to check, the other functions operating on
netns_ids work with ids, not with net pointers. So, we do not
need rtnl_lock to synchronize cleanup_net() with all of them.

Another property, which is used in the patch,
is that a net is unhashed from net_namespace_list
in only one place and by only one process. So,
we avoid excess rcu_read_lock() or rtnl_lock()
when we are iterating over the list in unhash_nsid().

All the above makes it possible to keep rtnl_lock() held
only for net->list deletion, and completely avoid it
for netns_ids unhashing and destruction. As these two
operations may take a long time (e.g., memory allocation
to send an skb), the patch should improve scalability
and significantly decrease the time for which
rtnl_lock() is held in cleanup_net().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 62 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1ccb953b3b09..3cad5f51afd3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -439,13 +439,40 @@ struct net *copy_net_ns(unsigned long flags,
 	return net;
 }
 
+static void unhash_nsid(struct net *net, struct net *last)
+{
+	struct net *tmp;
+	/* This function is only called from cleanup_net() work,
+	 * and this work is the only process, that may delete
+	 * a net from net_namespace_list. So, when the below
+	 * is executing, the list may only grow. Thus, we do not
+	 * use for_each_net_rcu() or rtnl_lock().
+	 */
+	for_each_net(tmp) {
+		int id;
+
+		spin_lock_bh(&tmp->nsid_lock);
+		id = __peernet2id(tmp, net);
+		if (id >= 0)
+			idr_remove(&tmp->netns_ids, id);
+		spin_unlock_bh(&tmp->nsid_lock);
+		if (id >= 0)
+			rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+		if (tmp == last)
+			break;
+	}
+	spin_lock_bh(&net->nsid_lock);
+	idr_destroy(&net->netns_ids);
+	spin_unlock_bh(&net->nsid_lock);
+}
+
 static DEFINE_SPINLOCK(cleanup_list_lock);
 static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
 
 static void cleanup_net(struct work_struct *work)
 {
 	const struct pernet_operations *ops;
-	struct net *net, *tmp;
+	struct net *net, *tmp, *last;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
 
@@ -458,26 +485,25 @@ static void cleanup_net(struct work_struct *work)
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+	list_for_each_entry(net, &net_kill_list, cleanup_list)
 		list_del_rcu(&net->list);
-		list_add_tail(&net->exit_list, &net_exit_list);
-		for_each_net(tmp) {
-			int id;
-
-			spin_lock_bh(&tmp->nsid_lock);
-			id = __peernet2id(tmp, net);
-			if (id >= 0)
-				idr_remove(&tmp->netns_ids, id);
-			spin_unlock_bh(&tmp->nsid_lock);
-			if (id >= 0)
-				rtnl_net_notifyid(tmp, RTM_DELNSID, id);
-		}
-		spin_lock_bh(&net->nsid_lock);
-		idr_destroy(&net->netns_ids);
-		spin_unlock_bh(&net->nsid_lock);
+	/* Cache last net. After we unlock rtnl, no one new net
+	 * added to net_namespace_list can assign nsid pointer
+	 * to a net from net_kill_list (see peernet2id_alloc()).
+	 * So, we skip them in unhash_nsid().
+	 *
+	 * Note, that unhash_nsid() does not delete nsid links
+	 * between net_kill_list's nets, as they've already
+	 * deleted from net_namespace_list. But, this would be
+	 * useless anyway, as netns_ids are destroyed there.
+	 */
+	last = list_last_entry(&net_namespace_list, struct net, list);
+	rtnl_unlock();
 
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		unhash_nsid(net, last);
+		list_add_tail(&net->exit_list, &net_exit_list);
 	}
-	rtnl_unlock();
 
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
-- 
cgit v1.2.3


From 611b63a12732635ba7bdc8578b42a1cc40f56a13 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:31 +0100
Subject: net/smc: cancel tx worker in case of socket aborts

If an SMC socket is aborted, the tx worker should be cancelled.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 1468a2a3cdf4..6de909612bd0 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -107,6 +107,9 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	case SMC_INIT:
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_PEERABORTWAIT;
+		release_sock(sk);
+		cancel_delayed_work_sync(&smc->conn.tx_work);
+		lock_sock(sk);
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
@@ -116,6 +119,9 @@ static void smc_close_active_abort(struct smc_sock *smc)
 			sk->sk_state = SMC_PEERABORTWAIT;
 		else
 			sk->sk_state = SMC_CLOSED;
+		release_sock(sk);
+		cancel_delayed_work_sync(&smc->conn.tx_work);
+		lock_sock(sk);
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
@@ -249,9 +255,6 @@ again:
 		/* peer sending PeerConnectionClosed will cause transition */
 		break;
 	case SMC_PROCESSABORT:
-		release_sock(sk);
-		cancel_delayed_work_sync(&conn->tx_work);
-		lock_sock(sk);
 		smc_close_abort(conn);
 		sk->sk_state = SMC_CLOSED;
 		break;
@@ -327,6 +330,9 @@ static void smc_close_passive_work(struct work_struct *work)
 	rxflags = &conn->local_rx_ctrl.conn_state_flags;
 	if (rxflags->peer_conn_abort) {
 		smc_close_passive_abort_received(smc);
+		release_sock(&smc->sk);
+		cancel_delayed_work_sync(&conn->tx_work);
+		lock_sock(&smc->sk);
 		goto wakeup;
 	}
 
-- 
cgit v1.2.3


From 5ac92a00aa743e76e86503d58f092830486af39b Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:32 +0100
Subject: net/smc: handle state SMC_PEERFINCLOSEWAIT correctly

A state transition from closing state SMC_PEERFINCLOSEWAIT to closing
state SMC_APPFINCLOSEWAIT is not allowed. Once a closing indication
from the peer has been received, the socket reaches state SMC_CLOSED.

And receiving a peer_conn_abort just changes the state of the socket
into one of the states SMC_PROCESSABORT or SMC_CLOSED;
sending a peer_conn_abort occurs in smc_close_active() for state
SMC_PROCESSABORT only.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 6de909612bd0..3dc109f5db56 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -280,7 +280,6 @@ static void smc_close_passive_abort_received(struct smc_sock *smc)
 	case SMC_APPFINCLOSEWAIT:
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
-		smc_close_abort(&smc->conn);
 		sk->sk_state = SMC_PROCESSABORT;
 		break;
 	case SMC_PEERCLOSEWAIT1:
@@ -288,7 +287,6 @@ static void smc_close_passive_abort_received(struct smc_sock *smc)
 		if (txflags->peer_done_writing &&
 		    !smc_close_sent_any_close(&smc->conn)) {
 			/* just shutdown, but not yet closed locally */
-			smc_close_abort(&smc->conn);
 			sk->sk_state = SMC_PROCESSABORT;
 		} else {
 			sk->sk_state = SMC_CLOSED;
@@ -354,7 +352,6 @@ static void smc_close_passive_work(struct work_struct *work)
 		/* fall through */
 		/* to check for closing */
 	case SMC_PEERCLOSEWAIT2:
-	case SMC_PEERFINCLOSEWAIT:
 		if (!smc_cdc_rxed_any_close(conn))
 			break;
 		if (sock_flag(sk, SOCK_DEAD) &&
@@ -366,6 +363,10 @@ static void smc_close_passive_work(struct work_struct *work)
 			sk->sk_state = SMC_APPFINCLOSEWAIT;
 		}
 		break;
+	case SMC_PEERFINCLOSEWAIT:
+		if (smc_cdc_rxed_any_close(conn))
+			sk->sk_state = SMC_CLOSED;
+		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
 	case SMC_APPFINCLOSEWAIT:
-- 
cgit v1.2.3


From b4772b3a87b772401e2af1c894fef323fb5c6e7c Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:33 +0100
Subject: net/smc: terminate link group for ib_post_send problems

If ib_post_send() fails, terminate all connections of this
link group.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 4 +++-
 net/smc/smc_wr.c | 8 +++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index fea6482233a6..71b7d9f079f0 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -248,8 +248,10 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
 		peer_rmbe_offset;
 	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
 	rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
-	if (rc)
+	if (rc) {
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+		smc_lgr_terminate(lgr);
+	}
 	return rc;
 }
 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 5ed94109d1d6..621c65850a18 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -248,8 +248,14 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
 	pend = container_of(priv, struct smc_wr_tx_pend, priv);
 	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
 			  &failed_wr);
-	if (rc)
+	if (rc) {
+		struct smc_link_group *lgr =
+			container_of(link, struct smc_link_group,
+				     lnk[SMC_SINGLE_LINK]);
+
 		smc_wr_tx_put_slot(link, priv);
+		smc_lgr_terminate(lgr);
+	}
 	return rc;
 }
 
-- 
cgit v1.2.3


From 610db66f377cf99aa07a2b1990727238f2e2d6d3 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:34 +0100
Subject: net/smc: do not reuse a linkgroup with setup problems

Once a linkgroup is created successfully, it stays alive for a
certain time to service more connections potentially created.
If one of the initialization steps for a new linkgroup fails,
the linkgroup should not be reused by other connections following.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 17 +++++++++++++++++
 net/smc/smc_core.c |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 05cbcd3a6f60..cf0e11978b66 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -377,6 +377,15 @@ static void smc_link_save_peer_info(struct smc_link *link,
 	link->peer_mtu = clc->qp_mtu;
 }
 
+static void smc_lgr_forget(struct smc_link_group *lgr)
+{
+	spin_lock_bh(&smc_lgr_list.lock);
+	/* do not use this link group for new connections */
+	if (!list_empty(&lgr->list))
+		list_del_init(&lgr->list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc)
 {
@@ -513,6 +522,8 @@ out_connected:
 	return rc ? rc : local_contact;
 
 decline_rdma_unlock:
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_conn_free(&smc->conn);
 decline_rdma:
@@ -526,6 +537,8 @@ decline_rdma:
 	goto out_connected;
 
 out_err_unlock:
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_conn_free(&smc->conn);
 out_err:
@@ -906,6 +919,8 @@ enqueue:
 	return;
 
 decline_rdma_unlock:
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(new_smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 decline_rdma:
 	/* RDMA setup failed, switch back to TCP */
@@ -918,6 +933,8 @@ decline_rdma:
 	goto out_connected;
 
 out_err_unlock:
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(new_smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 out_err:
 	newsmcsk->sk_state = SMC_CLOSED;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 94f21116dac5..7406cbb41856 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -128,6 +128,8 @@ static void smc_lgr_free_work(struct work_struct *work)
 	bool conns;
 
 	spin_lock_bh(&smc_lgr_list.lock);
+	if (list_empty(&lgr->list))
+		goto free;
 	read_lock_bh(&lgr->conns_lock);
 	conns = RB_EMPTY_ROOT(&lgr->conns_all);
 	read_unlock_bh(&lgr->conns_lock);
@@ -136,6 +138,7 @@ static void smc_lgr_free_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
+free:
 	spin_unlock_bh(&smc_lgr_list.lock);
 	smc_lgr_free(lgr);
 }
-- 
cgit v1.2.3


From 732720fafdb04ad8daec36ba7f09b44641ebc13f Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:35 +0100
Subject: net/smc: wake up wr_reg_wait when terminating a link group

If a new connection with a new rmb is added to a link group, its
memory region is registered. If a link group is terminated, a pending
registration requires a wake up.

And consolidate setting of tx_flag peer_conn_abort in smc_lgr_terminate().

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 14 ++++++--------
 net/smc/smc_core.c  |  8 +++++---
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 3dc109f5db56..babe05d385e7 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -89,7 +89,7 @@ static int smc_close_abort(struct smc_connection *conn)
 }
 
 /* terminate smc socket abnormally - active abort
- * RDMA communication no longer possible
+ * link group is terminated, i.e. RDMA communication no longer possible
  */
 static void smc_close_active_abort(struct smc_sock *smc)
 {
@@ -113,7 +113,6 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
-		txflags->peer_conn_abort = 1;
 		sock_release(smc->clcsock);
 		if (!smc_cdc_rxed_any_close(&smc->conn))
 			sk->sk_state = SMC_PEERABORTWAIT;
@@ -127,7 +126,6 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	case SMC_PEERCLOSEWAIT2:
 		if (!txflags->peer_conn_closed) {
 			sk->sk_state = SMC_PEERABORTWAIT;
-			txflags->peer_conn_abort = 1;
 			sock_release(smc->clcsock);
 		} else {
 			sk->sk_state = SMC_CLOSED;
@@ -135,10 +133,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	case SMC_PROCESSABORT:
 	case SMC_APPFINCLOSEWAIT:
-		if (!txflags->peer_conn_closed) {
-			txflags->peer_conn_abort = 1;
+		if (!txflags->peer_conn_closed)
 			sock_release(smc->clcsock);
-		}
 		sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERFINCLOSEWAIT:
@@ -303,8 +299,9 @@ static void smc_close_passive_abort_received(struct smc_sock *smc)
 	}
 }
 
-/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
- * or peer_done_writing.
+/* Either some kind of closing has been received: peer_conn_closed,
+ * peer_conn_abort, or peer_done_writing
+ * or the link group of the connection terminates abnormally.
  */
 static void smc_close_passive_work(struct work_struct *work)
 {
@@ -327,6 +324,7 @@ static void smc_close_passive_work(struct work_struct *work)
 
 	rxflags = &conn->local_rx_ctrl.conn_state_flags;
 	if (rxflags->peer_conn_abort) {
+		/* peer has not received all data */
 		smc_close_passive_abort_received(smc);
 		release_sock(&smc->sk);
 		cancel_delayed_work_sync(&conn->tx_work);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 7406cbb41856..ed5b46d1fe41 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -234,9 +234,7 @@ static void smc_buf_unuse(struct smc_connection *conn)
 /* remove a finished connection from its link group */
 void smc_conn_free(struct smc_connection *conn)
 {
-	struct smc_link_group *lgr = conn->lgr;
-
-	if (!lgr)
+	if (!conn->lgr)
 		return;
 	smc_cdc_tx_dismiss_slots(conn);
 	smc_lgr_unregister_conn(conn);
@@ -331,12 +329,16 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 		conn = rb_entry(node, struct smc_connection, alert_node);
 		smc = container_of(conn, struct smc_sock, conn);
 		sock_hold(&smc->sk);
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
 		__smc_lgr_unregister_conn(conn);
+		write_unlock_bh(&lgr->conns_lock);
 		schedule_work(&conn->close_work);
+		write_lock_bh(&lgr->conns_lock);
 		sock_put(&smc->sk);
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
+	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
 }
 
 /* Determine vlan of internal TCP socket.
-- 
cgit v1.2.3


From 1a0a04c7a82c4c4667ab5a9660dc37f6d365d9d3 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 25 Jan 2018 11:15:36 +0100
Subject: net/smc: check for healthy link group resp. connections

If a problem for at least one connection of a link group is detected,
the whole link group and all its connections are terminated.
This patch adds a check for healthy link group when trying to reserve
a work request, and checks for healthy connections before starting
a tx worker.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c  |  9 +++++++--
 net/smc/smc_diag.c |  6 ++++--
 net/smc/smc_tx.c   | 15 ++++++++++++---
 net/smc/smc_wr.c   | 11 ++++++-----
 4 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 51805334e001..6e8f5fbe0f09 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -65,9 +65,14 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
 			  struct smc_cdc_tx_pend **pend)
 {
 	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+	int rc;
 
-	return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
-				       (struct smc_wr_tx_pend_priv **)pend);
+	rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+				     (struct smc_wr_tx_pend_priv **)pend);
+	if (!conn->alert_token_local)
+		/* abnormal termination */
+		rc = -EPIPE;
+	return rc;
 }
 
 static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index d2d01cf70224..427b91c1c964 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -86,7 +86,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
 		goto errout;
 
-	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
+	    smc->conn.alert_token_local) {
 		struct smc_connection *conn = &smc->conn;
 		struct smc_diag_conninfo cinfo = {
 			.token = conn->alert_token_local,
@@ -124,7 +125,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 			goto errout;
 	}
 
-	if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+	if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr &&
+	    !list_empty(&smc->conn.lgr->list)) {
 		struct smc_diag_lgrinfo linfo = {
 			.role = smc->conn.lgr->role,
 			.lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 71b7d9f079f0..838bce20c361 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -408,8 +408,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 				goto out_unlock;
 			}
 			rc = 0;
-			schedule_delayed_work(&conn->tx_work,
-					      SMC_TX_WORK_DELAY);
+			if (conn->alert_token_local) /* connection healthy */
+				schedule_delayed_work(&conn->tx_work,
+						      SMC_TX_WORK_DELAY);
 		}
 		goto out_unlock;
 	}
@@ -440,10 +441,17 @@ static void smc_tx_work(struct work_struct *work)
 	int rc;
 
 	lock_sock(&smc->sk);
+	if (smc->sk.sk_err ||
+	    !conn->alert_token_local ||
+	    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+		goto out;
+
 	rc = smc_tx_sndbuf_nonempty(conn);
 	if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
 	    !atomic_read(&conn->bytes_to_rcv))
 		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+
+out:
 	release_sock(&smc->sk);
 }
 
@@ -464,7 +472,8 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmbe_size / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
-		if (smc_cdc_get_slot_and_msg_send(conn) < 0) {
+		if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
+		    conn->alert_token_local) { /* connection healthy */
 			schedule_delayed_work(&conn->tx_work,
 					      SMC_TX_WORK_DELAY);
 			return;
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 621c65850a18..1b8af23e6e2b 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -174,9 +174,9 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 			    struct smc_wr_tx_pend_priv **wr_pend_priv)
 {
 	struct smc_wr_tx_pend *wr_pend;
+	u32 idx = link->wr_tx_cnt;
 	struct ib_send_wr *wr_ib;
 	u64 wr_id;
-	u32 idx;
 	int rc;
 
 	*wr_buf = NULL;
@@ -186,16 +186,17 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 		if (rc)
 			return rc;
 	} else {
+		struct smc_link_group *lgr;
+
+		lgr = container_of(link, struct smc_link_group,
+				   lnk[SMC_SINGLE_LINK]);
 		rc = wait_event_timeout(
 			link->wr_tx_wait,
+			list_empty(&lgr->list) || /* lgr terminated */
 			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
 			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
 		if (!rc) {
 			/* timeout - terminate connections */
-			struct smc_link_group *lgr;
-
-			lgr = container_of(link, struct smc_link_group,
-					   lnk[SMC_SINGLE_LINK]);
 			smc_lgr_terminate(lgr);
 			return -EPIPE;
 		}
-- 
cgit v1.2.3


From 955ec4cb3b54c7c389a9f830be7d3ae2056b9212 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 24 Jan 2018 19:45:29 -0800
Subject: net/ipv6: Do not allow route add with a device that is down

IPv6 allows routes to be installed when the device is not up (admin up).
Worse, it does not mark it as LINKDOWN. IPv4 does not allow it and really
there is no reason for IPv6 to allow it, so check the flags and deny if
device is admin down.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f85da2f1e729..aa4411c81e7e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2734,6 +2734,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!dev)
 		goto out;
 
+	if (!(dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+		err = -ENETDOWN;
+		goto out;
+	}
+
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
 			NL_SET_ERR_MSG(extack, "Invalid source address");
-- 
cgit v1.2.3


From 2585cd62f0986a6e6d9c83363ed6dbcc66bc9f32 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:05 -0800
Subject: bpf: Only reply field should be writeable

Currently, a sock_ops BPF program can write the op field and all the
reply fields (reply and replylong). This is a bug. The op field should
not have been writeable and there is currently no way to use replylong
field for indices >= 1. This patch enforces that only the reply field
(which equals replylong[0]) is writeable.

Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops")
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 18da42a81d0c..bf9bb755e369 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3845,8 +3845,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct bpf_sock_ops, op) ...
-		     offsetof(struct bpf_sock_ops, replylong[3]):
+		case offsetof(struct bpf_sock_ops, reply):
 			break;
 		default:
 			return false;
-- 
cgit v1.2.3


From a33de3973488680def6a900e74ec67ba2bef226c Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:06 -0800
Subject: bpf: Make SOCK_OPS_GET_TCP size independent

Make the SOCK_OPS_GET_TCP helper macro size independent (previously it
only worked with 4-byte fields).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index bf9bb755e369..62e78748d95f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4470,9 +4470,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 /* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4484,16 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
+						   FIELD_NAME), si->dst_reg,  \
+				      si->dst_reg,			      \
 				      offsetof(struct tcp_sock, FIELD_NAME)); \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_TCP(snd_cwnd);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_TCP(srtt_us);
 		break;
 	}
 	return insn - insn_buf;
-- 
cgit v1.2.3


From 34d367c59233464dbd1f07445c4665099a7128ec Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:07 -0800
Subject: bpf: Make SOCK_OPS_GET_TCP struct independent

Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2
arguments so now it can also work with struct sock fields.
The first argument is the name of the field in the bpf_sock_ops
struct, the 2nd argument is the name of the field in the OBJ struct.

Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New:      SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)

Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). BPF_FIELD is the name of the field in the bpf_sock_ops
struct and OBJ_FIELD is the name of the field in the OBJ struct.

Although the field names are currently the same, the kernel struct names
could change in the future and this change makes it easier to support
that.

Note that adding access to tcp_sock fields in sock_ops programs does
not preclude the tcp_sock fields from being removed as long as we are
willing to do one of the following:

  1) Return a fixed value (e.g. 0 or 0xffffffff), or
  2) Make the verifier fail if that field is accessed (i.e. program
    fails to load) so the user will know that field is no longer
    supported.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 62e78748d95f..dbb6d2f60680 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4469,11 +4469,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
-			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4485,18 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
-						   FIELD_NAME), si->dst_reg,  \
-				      si->dst_reg,			      \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       OBJ_FIELD),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
 	}
 	return insn - insn_buf;
-- 
cgit v1.2.3


From b73042b8a28e2603ac178295ab96c876ba5a97a1 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:08 -0800
Subject: bpf: Add write access to tcp_sock and sock fields

This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.

Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  9 +++++++++
 include/net/tcp.h      |  2 +-
 net/core/filter.c      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 425056c7f96c..daa5a676335f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1007,6 +1007,15 @@ struct bpf_sock_ops_kern {
 		u32 replylong[4];
 	};
 	u32	is_fullsock;
+	u64	temp;			/* temp and everything after is not
+					 * initialized to 0 before calling
+					 * the BPF program. New fields that
+					 * should be initialized to 0 should
+					 * be inserted before temp.
+					 * temp is scratch storage used by
+					 * sock_ops_convert_ctx_access
+					 * as temporary storage of a register.
+					 */
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5a1d26a18599..6092eaff61cf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
 	if (sk_fullsock(sk)) {
 		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index dbb6d2f60680..c356ec02b1a5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4491,6 +4491,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+		else							      \
+			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+	} while (0)
+
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
 		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
-- 
cgit v1.2.3


From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:09 -0800
Subject: bpf: Support passing args to sock_ops bpf function

Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reuses the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        | 40 +++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/bpf.h |  5 +++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_nv.c        |  2 +-
 net/ipv4/tcp_output.c    |  2 +-
 6 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index daa5a676335f..20384c4bed25 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1003,6 +1003,7 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6092eaff61cf..093e967a2960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs * sizeof(*args));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406c19d6016b..8d5874c2c4ff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -952,8 +952,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..88b62441e7e9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05bd82e3..ddbce73edae8 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 95461f02ac9a..d12f7f71c1c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3469,7 +3469,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
-- 
cgit v1.2.3


From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:10 -0800
Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock

Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set it, per connection and
as necessary, when the connection is established.

It also adds support for reading and writing the field within a
sock_ops BPF program. Reading is done by accessing the field directly.
However, writing is done through the helper function
bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program
is trying to set a callback that is not supported in the current kernel
(i.e. running an older kernel). The helper function returns 0 if it was
able to set all of the bits set in the argument, a positive number
containing the bits that could not be set, or -EINVAL if the socket is
not a full TCP socket.

Examples of where one could call the bpf program:

1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tcp.h      | 11 +++++++++++
 include/uapi/linux/bpf.h | 17 ++++++++++++++++-
 net/core/filter.c        | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4f93f0953c41..8f4c54986f97 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -335,6 +335,17 @@ struct tcp_sock {
 
 	int			linger2;
 
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
+					 * values defined in uapi/linux/tcp.h
+					 */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
+
 /* Receiver side RTT estimation */
 	struct {
 		u32	rtt_us;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8d5874c2c4ff..aa128407c44d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -642,6 +642,14 @@ union bpf_attr {
  *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ *     Set callback flags for sock_ops
+ *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ *     @flags: flags value
+ *     Return: 0 for no error
+ *             -EINVAL if there is no full tcp socket
+ *             bits in flags that are not supported by current kernel
+ *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
  *     Grow or shrink room in sk_buff.
  *     @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
 	FN(getsockopt),			\
-	FN(override_return),
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -969,8 +978,14 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
 };
 
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0		/* Mask of all currently
+							 * supported cb flags
+							 */
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index c356ec02b1a5..6936d19ac736 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, argval)
+{
+	struct sock *sk = bpf_sock->sk;
+	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+	if (!sk_fullsock(sk))
+		return -EINVAL;
+
+#ifdef CONFIG_INET
+	if (val)
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+#else
+	return -EINVAL;
+#endif
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+	.func		= bpf_sock_ops_cb_flags_set,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3510,6 +3537,8 @@ static const struct bpf_func_proto *
 		return &bpf_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
 		return &bpf_getsockopt_proto;
+	case BPF_FUNC_sock_ops_cb_flags_set:
+		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
 	default:
@@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, srtt_us):
 		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+				   struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From f89013f66d0f1a0dad44c513318efb706399a36b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:11 -0800
Subject: bpf: Add sock_ops RTO callback

Adds an optional call to the sock_ops BPF program based on whether the
BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_cb_flags.
The BPF program is passed 3 arguments: icsk_retransmits, icsk_rto, and
whether the RTO has expired.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 8 +++++++-
 net/ipv4/tcp_timer.c     | 7 +++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa128407c44d..c8cecf9cf5bd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -982,7 +982,8 @@ struct bpf_sock_ops {
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0		/* Mask of all currently
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x1		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1019,6 +1020,11 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124cdbda..257abdde23b0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk)
 						icsk->icsk_user_timeout);
 	}
 	tcp_fastopen_active_detect_blackhole(sk, expired);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, (int)expired);
+
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:12 -0800
Subject: bpf: Add support for reading sk_state and more

Add support for reading many more tcp_sock fields

  state		same as sk->sk_state
  rtt_min	same as tcp_sk(sk)->rtt_min.s[0].v (current rtt_min)
  snd_ssthresh
  rcv_nxt
  snd_nxt
  snd_una
  mss_cache
  ecn_flags
  rate_delivered
  rate_interval_us
  packets_out
  retrans_out
  total_retrans
  segs_in
  data_segs_in
  segs_out
  data_segs_out
  lost_out
  sacked_out
  sk_txhash
  bytes_received (__u64)
  bytes_acked    (__u64)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  22 ++++++++
 net/core/filter.c        | 143 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 154 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8cecf9cf5bd..46520eae37fa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -979,6 +979,28 @@ struct bpf_sock_ops {
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6936d19ac736..a858ebc4ece4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static bool __is_valid_sock_ops_access(int off, int size)
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
-
-	return true;
-}
 
-static bool sock_ops_is_valid_access(int off, int size,
-				     enum bpf_access_type type,
-				     struct bpf_insn_access_aux *info)
-{
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, reply):
+			if (size != size_default)
+				return false;
 			break;
 		default:
 			return false;
 		}
+	} else {
+		switch (off) {
+		case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+					bytes_acked):
+			if (size != sizeof(__u64))
+				return false;
+			break;
+		default:
+			if (size != size_default)
+				return false;
+			break;
+		}
 	}
 
-	return __is_valid_sock_ops_access(off, size);
+	return true;
 }
 
 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
@@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
 				   struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, lost_out):
+		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sacked_out):
+		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From 6f9bd3d731aac0d2ac21dd78a642af5df38fb5c5 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:13 -0800
Subject: bpf: Add sock_ops R/W access to tclass

Adds direct write access to sk_txhash and access to tclass for ipv6
flows through getsockopt and setsockopt. Sample usage for tclass:

  bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v))

where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index a858ebc4ece4..fe2c7937351f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3232,6 +3232,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct ipv6_pinfo *np = inet6_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				np->tclass = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
+#endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
 		if (optname == TCP_CONGESTION) {
@@ -3241,7 +3264,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, reinit);
+			ret = tcp_set_congestion_control(sk, name, false,
+							 reinit);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3307,6 +3331,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			*((int *)optval) = (int)np->tclass;
+			break;
+		default:
+			goto err_clear;
+		}
+#endif
 	} else {
 		goto err_clear;
 	}
@@ -3871,6 +3911,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, reply):
+		case offsetof(struct bpf_sock_ops, sk_txhash):
 			if (size != size_default)
 				return false;
 			break;
@@ -4690,7 +4731,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct bpf_sock_ops, sk_txhash):
-		SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock);
+		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+					  struct sock, type);
 		break;
 
 	case offsetof(struct bpf_sock_ops, bytes_received):
@@ -4701,6 +4743,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, bytes_acked):
 		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
 		break;
+
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From a31ad29e6a30cb0b9084a9425b819cdcd97273ce Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:14 -0800
Subject: bpf: Add BPF_SOCK_OPS_RETRANS_CB

Adds support for calling sock_ops BPF program when there is a
retransmission. Three arguments are used; one for the sequence number,
another for the number of segments retransmitted, and the last one for
the return value of tcp_transmit_skb (0 => success).
Does not include syn-ack retransmissions.

New op: BPF_SOCK_OPS_RETRANS_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 9 ++++++++-
 net/ipv4/tcp_output.c    | 4 ++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 46520eae37fa..31c93a0bdbc2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1005,7 +1005,8 @@ struct bpf_sock_ops {
 
 /* Definitions for bpf_sock_ops_cb_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x1		/* Mask of all currently
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x3		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1047,6 +1048,12 @@ enum {
 					 * Arg2: value of icsk_rto
 					 * Arg3: whether RTO has expired
 					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d12f7f71c1c4..e9f985e42405 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
 
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+				  TCP_SKB_CB(skb)->seq, segs, err);
+
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
-- 
cgit v1.2.3


From d44874910a26f3a8f81edf873a2473363f07f660 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:15 -0800
Subject: bpf: Add BPF_SOCK_OPS_STATE_CB

Adds support for calling sock_ops BPF program when there is a TCP state
change. Two arguments are used; one for the old state and another for
the new state.

There is a new enum in include/uapi/linux/bpf.h that exports the TCP
states by prepending BPF_ to the current TCP state names. If it is ever
necessary to change the internal TCP state values (other than adding
more to the end), then it will become necessary to convert from the
internal TCP state value to the BPF value before calling the BPF
sock_ops function. There is a set of compile checks added in tcp.c
to detect if the internal and BPF values differ so we can make the
necessary fixes.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 29 ++++++++++++++++++++++++++++-
 net/ipv4/tcp.c           | 24 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 31c93a0bdbc2..db6bdc375126 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1006,7 +1006,8 @@ struct bpf_sock_ops {
 /* Definitions for bpf_sock_ops_cb_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x3		/* Mask of all currently
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1054,6 +1055,32 @@ enum {
 					 * Arg3: return value of
 					 *       tcp_transmit_skb (0 => success)
 					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 88b62441e7e9..f013ddc191e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
-- 
cgit v1.2.3


From c69de58ba84f480879de64571d9dae5102d10ed6 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 25 Jan 2018 13:20:09 -0800
Subject: net: erspan: use bitfield instead of mask and offset

Originally the erspan fields were packed as a group into a __be16 field,
with masks and offsets used to access each field.  This is more costly due
to calling ntohs/htons.  The patch changes it to use bitfields.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h | 127 ++++++++++++++++++++++++++++++++++++++-------------
 net/ipv4/ip_gre.c    |  38 ++++++---------
 net/ipv6/ip6_gre.c   |  36 ++++++---------
 3 files changed, 121 insertions(+), 80 deletions(-)

(limited to 'net')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index 712ea1b1f4db..6d30fe898286 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -65,16 +65,30 @@
 #define GRA_MASK	0x0006
 #define O_MASK		0x0001
 
+#define HWID_OFFSET    4
+#define DIR_OFFSET     3
+
 /* ERSPAN version 2 metadata header */
 struct erspan_md2 {
 	__be32 timestamp;
 	__be16 sgt;	/* security group tag */
-	__be16 flags;
-#define P_OFFSET	15
-#define FT_OFFSET	10
-#define HWID_OFFSET	4
-#define DIR_OFFSET	3
-#define GRA_OFFSET	1
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8	hwid_upper:2,
+		ft:5,
+		p:1;
+	__u8	o:1,
+		gra:2,
+		dir:1,
+		hwid:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8	p:1,
+		ft:5,
+		hwid_upper:2;
+	__u8	hwid:4,
+		dir:1,
+		gra:2,
+		o:1;
+#endif
 };
 
 enum erspan_encap_type {
@@ -95,15 +109,62 @@ struct erspan_metadata {
 };
 
 struct erspan_base_hdr {
-	__be16 ver_vlan;
-#define VER_OFFSET  12
-	__be16 session_id;
-#define COS_OFFSET  13
-#define EN_OFFSET   11
-#define BSO_OFFSET  EN_OFFSET
-#define T_OFFSET    10
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8	vlan_upper:4,
+		ver:4;
+	__u8	vlan:8;
+	__u8	session_id_upper:2,
+		t:1,
+		en:2,
+		cos:3;
+	__u8	session_id:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8	ver: 4,
+		vlan_upper:4;
+	__u8	vlan:8;
+	__u8	cos:3,
+		en:2,
+		t:1,
+		session_id_upper:2;
+	__u8	session_id:8;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
 };
 
+static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id)
+{
+	ershdr->session_id = id & 0xff;
+	ershdr->session_id_upper = (id >> 8) & 0x3;
+}
+
+static inline u16 get_session_id(const struct erspan_base_hdr *ershdr)
+{
+	return (ershdr->session_id_upper << 8) + ershdr->session_id;
+}
+
+static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan)
+{
+	ershdr->vlan = vlan & 0xff;
+	ershdr->vlan_upper = (vlan >> 8) & 0xf;
+}
+
+static inline u16 get_vlan(const struct erspan_base_hdr *ershdr)
+{
+	return (ershdr->vlan_upper << 8) + ershdr->vlan;
+}
+
+static inline void set_hwid(struct erspan_md2 *md2, u8 hwid)
+{
+	md2->hwid = hwid & 0xf;
+	md2->hwid_upper = (hwid >> 4) & 0x3;
+}
+
+static inline u8 get_hwid(const struct erspan_md2 *md2)
+{
+	return (md2->hwid_upper << 4) + md2->hwid;
+}
+
 static inline int erspan_hdr_len(int version)
 {
 	return sizeof(struct erspan_base_hdr) +
@@ -120,7 +181,7 @@ static inline u8 tos_to_cos(u8 tos)
 }
 
 static inline void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index,
+				u32 id, u32 index,
 				bool truncate, bool is_ipv4)
 {
 	struct ethhdr *eth = (struct ethhdr *)skb->data;
@@ -154,12 +215,12 @@ static inline void erspan_build_header(struct sk_buff *skb,
 	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE);
 
 	/* Build base header */
-	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
-				 (ERSPAN_VERSION << VER_OFFSET));
-	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
-			   ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
-			   (enc_type << EN_OFFSET & EN_MASK) |
-			   ((truncate << T_OFFSET) & T_MASK));
+	ershdr->ver = ERSPAN_VERSION;
+	ershdr->cos = tos_to_cos(tos);
+	ershdr->en = enc_type;
+	ershdr->t = truncate;
+	set_vlan(ershdr, vlan_tci);
+	set_session_id(ershdr, id);
 
 	/* Build metadata */
 	ersmd = (struct erspan_metadata *)(ershdr + 1);
@@ -187,7 +248,7 @@ static inline __be32 erspan_get_timestamp(void)
 }
 
 static inline void erspan_build_header_v2(struct sk_buff *skb,
-					  __be32 id, u8 direction, u16 hwid,
+					  u32 id, u8 direction, u16 hwid,
 					  bool truncate, bool is_ipv4)
 {
 	struct ethhdr *eth = (struct ethhdr *)skb->data;
@@ -198,7 +259,6 @@ static inline void erspan_build_header_v2(struct sk_buff *skb,
 		__be16 tci;
 	} *qp;
 	u16 vlan_tci = 0;
-	u16 session_id;
 	u8 gra = 0; /* 100 usec */
 	u8 bso = 0; /* Bad/Short/Oversized */
 	u8 sgt = 0;
@@ -221,22 +281,23 @@ static inline void erspan_build_header_v2(struct sk_buff *skb,
 	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
 
 	/* Build base header */
-	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
-				 (ERSPAN_VERSION2 << VER_OFFSET));
-	session_id = (u16)(ntohl(id) & ID_MASK) |
-		     ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
-		     (bso << BSO_OFFSET & BSO_MASK) |
-		     ((truncate << T_OFFSET) & T_MASK);
-	ershdr->session_id = htons(session_id);
+	ershdr->ver = ERSPAN_VERSION2;
+	ershdr->cos = tos_to_cos(tos);
+	ershdr->en = bso;
+	ershdr->t = truncate;
+	set_vlan(ershdr, vlan_tci);
+	set_session_id(ershdr, id);
 
 	/* Build metadata */
 	md = (struct erspan_metadata *)(ershdr + 1);
 	md->u.md2.timestamp = erspan_get_timestamp();
 	md->u.md2.sgt = htons(sgt);
-	md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) |
-				((hwid << HWID_OFFSET) & HWID_MASK) |
-				((direction << DIR_OFFSET) & DIR_MASK) |
-				((gra << GRA_OFFSET) & GRA_MASK));
+	md->u.md2.p = 1;
+	md->u.md2.ft = 0;
+	md->u.md2.dir = direction;
+	md->u.md2.gra = gra;
+	md->u.md2.o = 0;
+	set_hwid(&md->u.md2, hwid);
 }
 
 #endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b61f2285816d..6ec670fbbbdd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -114,7 +114,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index,
+				u32 id, u32 index,
 				bool truncate, bool is_ipv4);
 
 static unsigned int ipgre_net_id __read_mostly;
@@ -273,12 +273,12 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 
 	iph = ip_hdr(skb);
 	ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
-	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
+	ver = ershdr->ver;
 
 	/* The original GRE header does not have key field,
 	 * Use ERSPAN 10-bit session ID as key.
 	 */
-	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+	tpi->key = cpu_to_be32(get_session_id(ershdr));
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 				  tpi->flags | TUNNEL_KEY,
 				  iph->saddr, iph->daddr, tpi->key);
@@ -324,14 +324,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			if (ver == 1) {
 				tunnel->index = ntohl(pkt_md->u.index);
 			} else {
-				u16 md2_flags;
-				u16 dir, hwid;
-
-				md2_flags = ntohs(pkt_md->u.md2.flags);
-				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
-				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
-				tunnel->dir = dir;
-				tunnel->hwid = hwid;
+				tunnel->dir = pkt_md->u.md2.dir;
+				tunnel->hwid = get_hwid(&pkt_md->u.md2);
 			}
 
 		}
@@ -615,19 +609,14 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	if (version == 1) {
-		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 				    ntohl(md->u.index), truncate, true);
 	} else if (version == 2) {
-		u16 md2_flags;
-		u8 direction;
-		u16 hwid;
-
-		md2_flags = ntohs(md->u.md2.flags);
-		direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
-		hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
-
-		erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
-				       direction, hwid,	truncate, true);
+		erspan_build_header_v2(skb,
+				       ntohl(tunnel_id_to_key32(key->tun_id)),
+				       md->u.md2.dir,
+				       get_hwid(&md->u.md2),
+				       truncate, true);
 	} else {
 		goto err_free_rt;
 	}
@@ -733,10 +722,11 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 
 	/* Push ERSPAN header */
 	if (tunnel->erspan_ver == 1)
-		erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
+				    tunnel->index,
 				    truncate, true);
 	else
-		erspan_build_header_v2(skb, tunnel->parms.o_key,
+		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
 				       tunnel->dir, tunnel->hwid,
 				       truncate, true);
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a88480193d77..05f070e123e4 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -513,8 +513,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 
 	ipv6h = ipv6_hdr(skb);
 	ershdr = (struct erspan_base_hdr *)skb->data;
-	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
-	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+	ver = ershdr->ver;
+	tpi->key = cpu_to_be32(get_session_id(ershdr));
 
 	tunnel = ip6gre_tunnel_lookup(skb->dev,
 				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
@@ -565,14 +565,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 			if (ver == 1) {
 				tunnel->parms.index = ntohl(pkt_md->u.index);
 			} else {
-				u16 md2_flags;
-				u16 dir, hwid;
-
-				md2_flags = ntohs(pkt_md->u.md2.flags);
-				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
-				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
-				tunnel->parms.dir = dir;
-				tunnel->parms.hwid = hwid;
+				tunnel->parms.dir = pkt_md->u.md2.dir;
+				tunnel->parms.hwid = get_hwid(&pkt_md->u.md2);
 			}
 
 			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
@@ -925,6 +919,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		struct ip_tunnel_info *tun_info;
 		const struct ip_tunnel_key *key;
 		struct erspan_metadata *md;
+		__be32 tun_id;
 
 		tun_info = skb_tunnel_info(skb);
 		if (unlikely(!tun_info ||
@@ -944,23 +939,18 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		if (!md)
 			goto tx_err;
 
+		tun_id = tunnel_id_to_key32(key->tun_id);
 		if (md->version == 1) {
 			erspan_build_header(skb,
-					    tunnel_id_to_key32(key->tun_id),
+					    ntohl(tun_id),
 					    ntohl(md->u.index), truncate,
 					    false);
 		} else if (md->version == 2) {
-			u16 md2_flags;
-			u16 dir, hwid;
-
-			md2_flags = ntohs(md->u.md2.flags);
-			dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
-			hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
-
 			erspan_build_header_v2(skb,
-					       tunnel_id_to_key32(key->tun_id),
-					       dir, hwid, truncate,
-					       false);
+					       ntohl(tun_id),
+					       md->u.md2.dir,
+					       get_hwid(&md->u.md2),
+					       truncate, false);
 		}
 	} else {
 		switch (skb->protocol) {
@@ -982,11 +972,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		}
 
 		if (t->parms.erspan_ver == 1)
-			erspan_build_header(skb, t->parms.o_key,
+			erspan_build_header(skb, ntohl(t->parms.o_key),
 					    t->parms.index,
 					    truncate, false);
 		else
-			erspan_build_header_v2(skb, t->parms.o_key,
+			erspan_build_header_v2(skb, ntohl(t->parms.o_key),
 					       t->parms.dir,
 					       t->parms.hwid,
 					       truncate, false);
-- 
cgit v1.2.3


From fc1372f89ffe1f58b589643b75f679e452350703 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 25 Jan 2018 13:20:11 -0800
Subject: openvswitch: add erspan version I and II support

The patch adds support for openvswitch to configure erspan
v1 and v2.  The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr is added
to uapi as a binary blob to support all ERSPAN v1 and v2's
fields.  Note that the previous commit "openvswitch: Add erspan tunnel
support." was reverted since it was not designed properly.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/flow_netlink.c   | 52 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index dcfab5e3b55c..713e56ce681f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -363,6 +363,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_SRC,		/* struct in6_addr src IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
+	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* struct erspan_metadata */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index eb55f1b3d047..7322aa1e382e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -49,6 +49,7 @@
 #include <net/mpls.h>
 #include <net/vxlan.h>
 #include <net/tun_proto.h>
+#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -329,7 +330,8 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_CSUM */
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_OAM */
 		+ nla_total_size(256)  /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
-		/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with
+		/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and
+		 * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with
 		 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
 		 */
 		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
@@ -400,6 +402,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 						.next = ovs_vxlan_ext_key_lens },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
+	[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = OVS_ATTR_VARIABLE },
 };
 
 static const struct ovs_len_tbl
@@ -631,6 +634,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
 	return 0;
 }
 
+static int erspan_tun_opt_from_nlattr(const struct nlattr *a,
+				      struct sw_flow_match *match, bool is_mask,
+				      bool log)
+{
+	unsigned long opt_key_offset;
+
+	BUILD_BUG_ON(sizeof(struct erspan_metadata) >
+		     sizeof(match->key->tun_opts));
+
+	if (nla_len(a) > sizeof(match->key->tun_opts)) {
+		OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).",
+			  nla_len(a), sizeof(match->key->tun_opts));
+		return -EINVAL;
+	}
+
+	if (!is_mask)
+		SW_FLOW_KEY_PUT(match, tun_opts_len,
+				sizeof(struct erspan_metadata), false);
+	else
+		SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+
+	opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
+	SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
+				  nla_len(a), is_mask);
+	return 0;
+}
+
 static int ip_tun_from_nlattr(const struct nlattr *attr,
 			      struct sw_flow_match *match, bool is_mask,
 			      bool log)
@@ -738,6 +768,20 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_PAD:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			if (opts_type) {
+				OVS_NLERR(log, "Multiple metadata blocks provided");
+				return -EINVAL;
+			}
+
+			err = erspan_tun_opt_from_nlattr(a, match, is_mask,
+							 log);
+			if (err)
+				return err;
+
+			tun_flags |= TUNNEL_ERSPAN_OPT;
+			opts_type = type;
+			break;
 		default:
 			OVS_NLERR(log, "Unknown IP tunnel attribute %d",
 				  type);
@@ -862,6 +906,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
 			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
 			return -EMSGSIZE;
+		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+			 nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
+				 swkey_tun_opts_len, tun_opts))
+			return -EMSGSIZE;
 	}
 
 	return 0;
@@ -2486,6 +2534,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			break;
 		}
 	}
 
-- 
cgit v1.2.3


From 9515a2e082f91457db0ecff4b65371d0fb5d9aad Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 24 Jan 2018 19:37:38 -0800
Subject: net/ipv4: Allow send to local broadcast from a socket bound to a VRF

Message sends to the local broadcast address (255.255.255.255) require
uc_index or sk_bound_dev_if to be set to an egress device. However,
responses are only received if the socket is bound to the device. This
is overly constraining for processes running in an L3 domain. This
patch allows a socket bound to the VRF device to send to the local
broadcast address by using IP_UNICAST_IF to set the egress interface
with packet receipt handled by the VRF binding.

Similar to IP_MULTICAST_IF, relax the constraint on setting
IP_UNICAST_IF if a socket is bound to an L3 master device. In this
case allow uc_index to be set to an enslaved device if sk_bound_dev_if is
an L3 master device and is the master device for the ifindex.

In udp and raw sendmsg, allow uc_index to override the oif if
uc_index master device is oif (i.e., the oif is an L3 master and the
index is an L3 slave).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c |  6 +++++-
 net/ipv4/raw.c         | 15 ++++++++++++++-
 net/ipv4/udp.c         | 15 ++++++++++++++-
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 60fb1eb7d7d8..6cc70fa488cb 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -808,6 +808,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	{
 		struct net_device *dev = NULL;
 		int ifindex;
+		int midx;
 
 		if (optlen != sizeof(int))
 			goto e_inval;
@@ -823,10 +824,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = -EADDRNOTAVAIL;
 		if (!dev)
 			break;
+
+		midx = l3mdev_master_ifindex(dev);
 		dev_put(dev);
 
 		err = -EINVAL;
-		if (sk->sk_bound_dev_if)
+		if (sk->sk_bound_dev_if &&
+		    (!midx || midx != sk->sk_bound_dev_if))
 			break;
 
 		inet->uc_index = ifindex;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 136544b36a46..7c509697ebc7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -617,8 +617,21 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			ipc.oif = inet->mc_index;
 		if (!saddr)
 			saddr = inet->mc_addr;
-	} else if (!ipc.oif)
+	} else if (!ipc.oif) {
 		ipc.oif = inet->uc_index;
+	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		/* oif is set, packet is to local broadcast and
+		 * and uc_index is set. oif is most likely set
+		 * by sk_bound_dev_if. If uc_index != oif check if the
+		 * oif is an L3 master and uc_index is an L3 slave.
+		 * If so, we want to allow the send using the uc_index.
+		 */
+		if (ipc.oif != inet->uc_index &&
+		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+							      inet->uc_index)) {
+			ipc.oif = inet->uc_index;
+		}
+	}
 
 	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
 			   RT_SCOPE_UNIVERSE,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 853321555a4e..3f018f34cf56 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -977,8 +977,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 		connected = 0;
-	} else if (!ipc.oif)
+	} else if (!ipc.oif) {
 		ipc.oif = inet->uc_index;
+	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		/* oif is set, packet is to local broadcast and
+		 * uc_index is set. oif is most likely set
+		 * by sk_bound_dev_if. If uc_index != oif check if the
+		 * oif is an L3 master and uc_index is an L3 slave.
+		 * If so, we want to allow the send using the uc_index.
+		 */
+		if (ipc.oif != inet->uc_index &&
+		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+							      inet->uc_index)) {
+			ipc.oif = inet->uc_index;
+		}
+	}
 
 	if (connected)
 		rt = (struct rtable *)sk_dst_check(sk, 0);
-- 
cgit v1.2.3


From 7d5977394515ad3a636361ce23890863be6e0f70 Mon Sep 17 00:00:00 2001
From: Robert Schwebel <r.schwebel@pengutronix.de>
Date: Wed, 24 Jan 2018 11:19:11 +0100
Subject: can: migrate documentation to restructured text

The kernel documentation is now restructured text. Convert the SocketCAN
documentation and include it in the toplevel kernel documentation.

This patch doesn't do any content change.

All references to can.txt in the code are converted to can.rst.

Signed-off-by: Robert Schwebel <r.schwebel@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 Documentation/networking/00-INDEX  |    2 -
 Documentation/networking/can.rst   | 1437 ++++++++++++++++++++++++++++++++++++
 Documentation/networking/can.txt   | 1308 --------------------------------
 Documentation/networking/index.rst |    1 +
 MAINTAINERS                        |    2 +-
 drivers/net/can/dev.c              |    2 +-
 drivers/net/can/vcan.c             |    2 +-
 net/can/Kconfig                    |    2 +-
 8 files changed, 1442 insertions(+), 1314 deletions(-)
 create mode 100644 Documentation/networking/can.rst
 delete mode 100644 Documentation/networking/can.txt

(limited to 'net')

diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index f5d642c01dd3..2b89d91b376f 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -36,8 +36,6 @@ bonding.txt
 	- Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux.
 bridge.txt
 	- where to get user space programs for ethernet bridging with Linux.
-can.txt
-	- documentation on CAN protocol family.
 cdc_mbim.txt
 	- 3G/LTE USB modem (Mobile Broadband Interface Model)
 checksum-offloads.txt
diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
new file mode 100644
index 000000000000..d23c51abf8c6
--- /dev/null
+++ b/Documentation/networking/can.rst
@@ -0,0 +1,1437 @@
+===================================
+SocketCAN - Controller Area Network
+===================================
+
+Overview / What is SocketCAN
+============================
+
+The socketcan package is an implementation of CAN protocols
+(Controller Area Network) for Linux.  CAN is a networking technology
+which has widespread use in automation, embedded devices, and
+automotive fields.  While there have been other CAN implementations
+for Linux based on character devices, SocketCAN uses the Berkeley
+socket API, the Linux network stack and implements the CAN device
+drivers as network interfaces.  The CAN socket API has been designed
+as similar as possible to the TCP/IP protocols to allow programmers,
+familiar with network programming, to easily learn how to use CAN
+sockets.
+
+
+.. _socketcan-motivation:
+
+Motivation / Why Using the Socket API
+=====================================
+
+There have been CAN implementations for Linux before SocketCAN so the
+question arises, why we have started another project.  Most existing
+implementations come as a device driver for some CAN hardware, they
+are based on character devices and provide comparatively little
+functionality.  Usually, there is only a hardware-specific device
+driver which provides a character device interface to send and
+receive raw CAN frames, directly to/from the controller hardware.
+Queueing of frames and higher-level transport protocols like ISO-TP
+have to be implemented in user space applications.  Also, most
+character-device implementations support only one single process to
+open the device at a time, similar to a serial interface.  Exchanging
+the CAN controller requires employment of another device driver and
+often the need for adaption of large parts of the application to the
+new driver's API.
+
+SocketCAN was designed to overcome all of these limitations.  A new
+protocol family has been implemented which provides a socket interface
+to user space applications and which builds upon the Linux network
+layer, enabling use of all the provided queueing functionality.  A device
+driver for CAN controller hardware registers itself with the Linux
+network layer as a network device, so that CAN frames from the
+controller can be passed up to the network layer and on to the CAN
+protocol family module and also vice-versa.  Also, the protocol family
+module provides an API for transport protocol modules to register, so
+that any number of transport protocols can be loaded or unloaded
+dynamically.  In fact, the can core module alone does not provide any
+protocol and cannot be used without loading at least one additional
+protocol module.  Multiple sockets can be opened at the same time,
+on different or the same protocol module and they can listen/send
+frames on different or the same CAN IDs.  Several sockets listening on
+the same interface for frames with the same CAN ID are all passed the
+same received matching CAN frames.  An application wishing to
+communicate using a specific transport protocol, e.g. ISO-TP, just
+selects that protocol when opening the socket, and then can read and
+write application data byte streams, without having to deal with
+CAN-IDs, frames, etc.
+
+Similar functionality visible from user-space could be provided by a
+character device, too, but this would lead to a technically inelegant
+solution for a couple of reasons:
+
+* **Intricate usage:**  Instead of passing a protocol argument to
+  socket(2) and using bind(2) to select a CAN interface and CAN ID, an
+  application would have to do all these operations using ioctl(2)s.
+
+* **Code duplication:**  A character device cannot make use of the Linux
+  network queueing code, so all that code would have to be duplicated
+  for CAN networking.
+
+* **Abstraction:**  In most existing character-device implementations, the
+  hardware-specific device driver for a CAN controller directly
+  provides the character device for the application to work with.
+  This is at least very unusual in Unix systems for both, char and
+  block devices.  For example you don't have a character device for a
+  certain UART of a serial interface, a certain sound chip in your
+  computer, a SCSI or IDE controller providing access to your hard
+  disk or tape streamer device.  Instead, you have abstraction layers
+  which provide a unified character or block device interface to the
+  application on the one hand, and an interface for hardware-specific
+  device drivers on the other hand.  These abstractions are provided
+  by subsystems like the tty layer, the audio subsystem or the SCSI
+  and IDE subsystems for the devices mentioned above.
+
+  The easiest way to implement a CAN device driver is as a character
+  device without such a (complete) abstraction layer, as is done by most
+  existing drivers.  The right way, however, would be to add such a
+  layer with all the functionality like registering for certain CAN
+  IDs, supporting several open file descriptors and (de)multiplexing
+  CAN frames between them, (sophisticated) queueing of CAN frames, and
+  providing an API for device drivers to register with.  However, then
+  it would be no more difficult, or maybe even easier, to use the
+  networking framework provided by the Linux kernel, and this is what
+  SocketCAN does.
+
+The use of the networking framework of the Linux kernel is just the
+natural and most appropriate way to implement CAN for Linux.
+
+
+.. _socketcan-concept:
+
+SocketCAN Concept
+=================
+
+As described in :ref:`socketcan-motivation` the main goal of SocketCAN is to
+provide a socket interface to user space applications which builds
+upon the Linux network layer. In contrast to the commonly known
+TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!)
+medium that has no MAC-layer addressing like ethernet. The CAN-identifier
+(can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs
+have to be chosen uniquely on the bus. When designing a CAN-ECU
+network the CAN-IDs are mapped to be sent by a specific ECU.
+For this reason a CAN-ID can be treated best as a kind of source address.
+
+
+.. _socketcan-receive-lists:
+
+Receive Lists
+-------------
+
+The network transparent access of multiple applications leads to the
+problem that different applications may be interested in the same
+CAN-IDs from the same CAN network interface. The SocketCAN core
+module - which implements the protocol family CAN - provides several
+highly efficient receive lists for this reason. If e.g. a user space
+application opens a CAN RAW socket, the raw protocol module itself
+requests the (range of) CAN-IDs from the SocketCAN core that are
+requested by the user. The subscription and unsubscription of
+CAN-IDs can be done for specific CAN interfaces or for all(!) known
+CAN interfaces with the can_rx_(un)register() functions provided to
+CAN protocol modules by the SocketCAN core (see :ref:`socketcan-core-module`).
+To optimize the CPU usage at runtime the receive lists are split up
+into several specific lists per device that match the requested
+filter complexity for a given use-case.
+
+
+.. _socketcan-local-loopback1:
+
+Local Loopback of Sent Frames
+-----------------------------
+
+As known from other networking concepts the data exchanging
+applications may run on the same or different nodes without any
+change (except for the according addressing information):
+
+.. code::
+
+	 ___   ___   ___                   _______   ___
+	| _ | | _ | | _ |                 | _   _ | | _ |
+	||A|| ||B|| ||C||                 ||A| |B|| ||C||
+	|___| |___| |___|                 |_______| |___|
+	  |     |     |                       |       |
+	-----------------(1)- CAN bus -(2)---------------
+
+To ensure that application A receives the same information in the
+example (2) as it would receive in example (1) there is need for
+some kind of local loopback of the sent CAN frames on the appropriate
+node.
+
+The Linux network devices (by default) just can handle the
+transmission and reception of media dependent frames. Due to the
+arbitration on the CAN bus the transmission of a low prio CAN-ID
+may be delayed by the reception of a high prio CAN frame. To
+reflect the correct [*]_ traffic on the node the loopback of the sent
+data has to be performed right after a successful transmission. If
+the CAN network interface is not capable of performing the loopback for
+some reason the SocketCAN core can do this task as a fallback solution.
+See :ref:`socketcan-local-loopback2` for details (recommended).
+
+The loopback functionality is enabled by default to reflect standard
+networking behaviour for CAN applications. Due to some requests from
+the RT-SocketCAN group the loopback optionally may be disabled for each
+separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`.
+
+.. [*] you really like to have this when you're running analyser
+       tools like 'candump' or 'cansniffer' on the (same) node.
+
+
+.. _socketcan-network-problem-notifications:
+
+Network Problem Notifications
+-----------------------------
+
+The use of the CAN bus may lead to several problems on the physical
+and media access control layer. Detecting and logging of these lower
+layer problems is a vital requirement for CAN users to identify
+hardware issues on the physical transceiver layer as well as
+arbitration problems and error frames caused by the different
+ECUs. The occurrence of detected errors is important for diagnosis
+and has to be logged together with the exact timestamp. For this
+reason the CAN interface driver can generate so called Error Message
+Frames that can optionally be passed to the user application in the
+same way as other CAN frames. Whenever an error on the physical layer
+or the MAC layer is detected (e.g. by the CAN controller) the driver
+creates an appropriate error message frame. Error message frames can
+be requested by the user application using the common CAN filter
+mechanisms. Inside this filter definition the (interested) type of
+errors may be selected. The reception of error messages is disabled
+by default. The format of the CAN error message frame is briefly
+described in the Linux header file "include/uapi/linux/can/error.h".
+
+
+How to use SocketCAN
+====================
+
+Like TCP/IP, you first need to open a socket for communicating over a
+CAN network. Since SocketCAN implements a new protocol family, you
+need to pass PF_CAN as the first argument to the socket(2) system
+call. Currently, there are two CAN protocols to choose from, the raw
+socket protocol and the broadcast manager (BCM). So to open a socket,
+you would write::
+
+    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+and::
+
+    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
+
+respectively.  After the successful creation of the socket, you would
+normally use the bind(2) system call to bind the socket to a CAN
+interface (which is different from TCP/IP due to different addressing
+- see :ref:`socketcan-concept`). After binding (CAN_RAW) or connecting (CAN_BCM)
+the socket, you can read(2) and write(2) from/to the socket or use
+send(2), sendto(2), sendmsg(2) and the recv* counterpart operations
+on the socket as usual. There are also CAN specific socket options
+described below.
+
+The basic CAN frame structure and the sockaddr structure are defined
+in include/linux/can.h:
+
+.. code-block:: C
+
+    struct can_frame {
+            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+            __u8    can_dlc; /* frame payload length in byte (0 .. 8) */
+            __u8    __pad;   /* padding */
+            __u8    __res0;  /* reserved / padding */
+            __u8    __res1;  /* reserved / padding */
+            __u8    data[8] __attribute__((aligned(8)));
+    };
+
+The alignment of the (linear) payload data[] to a 64bit boundary
+allows the user to define their own structs and unions to easily access
+the CAN payload. There is no given byteorder on the CAN bus by
+default. A read(2) system call on a CAN_RAW socket transfers a
+struct can_frame to the user space.
+
+The sockaddr_can structure has an interface index like the
+PF_PACKET socket, that also binds to a specific interface:
+
+.. code-block:: C
+
+    struct sockaddr_can {
+            sa_family_t can_family;
+            int         can_ifindex;
+            union {
+                    /* transport protocol class address info (e.g. ISOTP) */
+                    struct { canid_t rx_id, tx_id; } tp;
+
+                    /* reserved for future CAN protocols address information */
+            } can_addr;
+    };
+
+To determine the interface index an appropriate ioctl() has to
+be used (example for CAN_RAW sockets without error checking):
+
+.. code-block:: C
+
+    int s;
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+
+    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+    strcpy(ifr.ifr_name, "can0" );
+    ioctl(s, SIOCGIFINDEX, &ifr);
+
+    addr.can_family = AF_CAN;
+    addr.can_ifindex = ifr.ifr_ifindex;
+
+    bind(s, (struct sockaddr *)&addr, sizeof(addr));
+
+    (..)
+
+To bind a socket to all(!) CAN interfaces the interface index must
+be 0 (zero). In this case the socket receives CAN frames from every
+enabled CAN interface. To determine the originating CAN interface
+the system call recvfrom(2) may be used instead of read(2). To send
+on a socket that is bound to 'any' interface sendto(2) is needed to
+specify the outgoing interface.
+
+Reading CAN frames from a bound CAN_RAW socket (see above) consists
+of reading a struct can_frame:
+
+.. code-block:: C
+
+    struct can_frame frame;
+
+    nbytes = read(s, &frame, sizeof(struct can_frame));
+
+    if (nbytes < 0) {
+            perror("can raw socket read");
+            return 1;
+    }
+
+    /* paranoid check ... */
+    if (nbytes < sizeof(struct can_frame)) {
+            fprintf(stderr, "read: incomplete CAN frame\n");
+            return 1;
+    }
+
+    /* do something with the received CAN frame */
+
+Writing CAN frames can be done similarly, with the write(2) system call::
+
+    nbytes = write(s, &frame, sizeof(struct can_frame));
+
+When the CAN interface is bound to 'any' existing CAN interface
+(addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the
+information about the originating CAN interface is needed:
+
+.. code-block:: C
+
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+    socklen_t len = sizeof(addr);
+    struct can_frame frame;
+
+    nbytes = recvfrom(s, &frame, sizeof(struct can_frame),
+                      0, (struct sockaddr*)&addr, &len);
+
+    /* get interface name of the received CAN frame */
+    ifr.ifr_ifindex = addr.can_ifindex;
+    ioctl(s, SIOCGIFNAME, &ifr);
+    printf("Received a CAN frame from interface %s", ifr.ifr_name);
+
+To write CAN frames on sockets bound to 'any' CAN interface the
+outgoing interface has to be specified explicitly:
+
+.. code-block:: C
+
+    strcpy(ifr.ifr_name, "can0");
+    ioctl(s, SIOCGIFINDEX, &ifr);
+    addr.can_ifindex = ifr.ifr_ifindex;
+    addr.can_family  = AF_CAN;
+
+    nbytes = sendto(s, &frame, sizeof(struct can_frame),
+                    0, (struct sockaddr*)&addr, sizeof(addr));
+
+An accurate timestamp can be obtained with an ioctl(2) call after reading
+a message from the socket:
+
+.. code-block:: C
+
+    struct timeval tv;
+    ioctl(s, SIOCGSTAMP, &tv);
+
+The timestamp has a resolution of one microsecond and is set automatically
+at the reception of a CAN frame.
+
+Remark about CAN FD (flexible data rate) support:
+
+Generally the handling of CAN FD is very similar to the formerly described
+examples. The new CAN FD capable CAN controllers support two different
+bitrates for the arbitration phase and the payload phase of the CAN FD frame
+and up to 64 bytes of payload. This extended payload length breaks all the
+kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight
+bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g.
+the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that
+switches the socket into a mode that allows the handling of CAN FD frames
+and (legacy) CAN frames simultaneously (see :ref:`socketcan-rawfd`).
+
+The struct canfd_frame is defined in include/linux/can.h:
+
+.. code-block:: C
+
+    struct canfd_frame {
+            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+            __u8    len;     /* frame payload length in byte (0 .. 64) */
+            __u8    flags;   /* additional flags for CAN FD */
+            __u8    __res0;  /* reserved / padding */
+            __u8    __res1;  /* reserved / padding */
+            __u8    data[64] __attribute__((aligned(8)));
+    };
+
+The struct canfd_frame and the existing struct can_frame have the can_id,
+the payload length and the payload data at the same offset inside their
+structures. This allows the different structures to be handled very similarly.
+When the content of a struct can_frame is copied into a struct canfd_frame
+all structure elements can be used as-is - only the data[] becomes extended.
+
+When introducing the struct canfd_frame it turned out that the data length
+code (DLC) of the struct can_frame was used as a length information as the
+length and the DLC have a 1:1 mapping in the range of 0 .. 8. To preserve
+the easy handling of the length information the canfd_frame.len element
+contains a plain length value from 0 .. 64. So both canfd_frame.len and
+can_frame.can_dlc are equal and contain a length information and no DLC.
+For details about the distinction of CAN and CAN FD capable devices and
+the mapping to the bus-relevant data length code (DLC), see :ref:`socketcan-can-fd-driver`.
+
+The lengths of the two CAN(FD) frame structures define the maximum transfer
+unit (MTU) of the CAN(FD) network interface and skbuff data length. Two
+definitions are specified for CAN specific MTUs in include/linux/can.h:
+
+.. code-block:: C
+
+  #define CAN_MTU   (sizeof(struct can_frame))   == 16  => 'legacy' CAN frame
+  #define CANFD_MTU (sizeof(struct canfd_frame)) == 72  => CAN FD frame
+
+
+.. _socketcan-raw-sockets:
+
+RAW Protocol Sockets with can_filters (SOCK_RAW)
+------------------------------------------------
+
+Using CAN_RAW sockets is extensively comparable to the commonly
+known access to CAN character devices. To meet the new possibilities
+provided by the multi user SocketCAN approach, some reasonable
+defaults are set at RAW socket binding time:
+
+- The filters are set to exactly one filter receiving everything
+- The socket only receives valid data frames (=> no error message frames)
+- The loopback of sent CAN frames is enabled (see :ref:`socketcan-local-loopback2`)
+- The socket does not receive its own sent frames (in loopback mode)
+
+These default settings may be changed before or after binding the socket.
+To use the referenced definitions of the socket options for CAN_RAW
+sockets, include <linux/can/raw.h>.
+
+
+.. _socketcan-rawfilter:
+
+RAW socket option CAN_RAW_FILTER
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The reception of CAN frames using CAN_RAW sockets can be controlled
+by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
+
+The CAN filter structure is defined in include/linux/can.h:
+
+.. code-block:: C
+
+    struct can_filter {
+            canid_t can_id;
+            canid_t can_mask;
+    };
+
+A filter matches, when:
+
+.. code-block:: C
+
+    <received_can_id> & mask == can_id & mask
+
+which is analogous to known CAN controllers hardware filter semantics.
+The filter can be inverted in this semantic, when the CAN_INV_FILTER
+bit is set in can_id element of the can_filter structure. In
+contrast to CAN controller hardware filters the user may set 0 .. n
+receive filters for each open socket separately:
+
+.. code-block:: C
+
+    struct can_filter rfilter[2];
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = CAN_SFF_MASK;
+    rfilter[1].can_id   = 0x200;
+    rfilter[1].can_mask = 0x700;
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
+
+To disable the reception of CAN frames on the selected CAN_RAW socket:
+
+.. code-block:: C
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0);
+
+To set the filters to zero filters is quite obsolete as to not read
+data causes the raw socket to discard the received CAN frames. But
+having this 'send only' use-case we may remove the receive list in the
+Kernel to save a little (really a very little!) CPU usage.
+
+CAN Filter Usage Optimisation
+.............................
+
+The CAN filters are processed in per-device filter lists at CAN frame
+reception time. To reduce the number of checks that need to be performed
+while walking through the filter lists the CAN core provides an optimized
+filter handling when the filter subscription focusses on a single CAN ID.
+
+For the possible 2048 SFF CAN identifiers the identifier is used as an index
+to access the corresponding subscription list without any further checks.
+For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as
+hash function to retrieve the EFF table index.
+
+To benefit from the optimized filters for single CAN identifiers the
+CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.mask together
+with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. A set CAN_EFF_FLAG bit in the
+can_filter.mask makes clear that it matters whether a SFF or EFF CAN ID is
+subscribed. E.g. in the example from above:
+
+.. code-block:: C
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = CAN_SFF_MASK;
+
+both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass.
+
+To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the
+filter has to be defined in this way to benefit from the optimized filters:
+
+.. code-block:: C
+
+    struct can_filter rfilter[2];
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK);
+    rfilter[1].can_id   = 0x12345678 | CAN_EFF_FLAG;
+    rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK);
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
+
+
+RAW Socket Option CAN_RAW_ERR_FILTER
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As described in :ref:`socketcan-network-problem-notifications` the CAN interface driver can generate so
+called Error Message Frames that can optionally be passed to the user
+application in the same way as other CAN frames. The possible
+errors are divided into different error classes that may be filtered
+using the appropriate error mask. To register for every possible
+error condition CAN_ERR_MASK can be used as value for the error mask.
+The values for the error mask are defined in linux/can/error.h:
+
+.. code-block:: C
+
+    can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF );
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
+               &err_mask, sizeof(err_mask));
+
+
+RAW Socket Option CAN_RAW_LOOPBACK
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To meet multi user needs the local loopback is enabled by default
+(see :ref:`socketcan-local-loopback1` for details). But in some embedded use-cases
+(e.g. when only one application uses the CAN bus) this loopback
+functionality can be disabled (separately for each socket):
+
+.. code-block:: C
+
+    int loopback = 0; /* 0 = disabled, 1 = enabled (default) */
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback));
+
+
+RAW socket option CAN_RAW_RECV_OWN_MSGS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When the local loopback is enabled, all the sent CAN frames are
+looped back to the open CAN sockets that registered for the CAN
+frames' CAN-ID on this given interface to meet the multi user
+needs. The reception of the CAN frames on the same socket that was
+sending the CAN frame is assumed to be unwanted and therefore
+disabled by default. This default behaviour may be changed on
+demand:
+
+.. code-block:: C
+
+    int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
+               &recv_own_msgs, sizeof(recv_own_msgs));
+
+
+.. _socketcan-rawfd:
+
+RAW Socket Option CAN_RAW_FD_FRAMES
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+CAN FD support in CAN_RAW sockets can be enabled with a new socket option
+CAN_RAW_FD_FRAMES which is off by default. When the new socket option is
+not supported by the CAN_RAW socket (e.g. on older kernels), switching the
+CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT.
+
+Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames
+and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames
+when reading from the socket:
+
+.. code-block:: C
+
+    CAN_RAW_FD_FRAMES enabled:  CAN_MTU and CANFD_MTU are allowed
+    CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default)
+
+Example:
+
+.. code-block:: C
+
+    [ remember: CANFD_MTU == sizeof(struct canfd_frame) ]
+
+    struct canfd_frame cfd;
+
+    nbytes = read(s, &cfd, CANFD_MTU);
+
+    if (nbytes == CANFD_MTU) {
+            printf("got CAN FD frame with length %d\n", cfd.len);
+            /* cfd.flags contains valid data */
+    } else if (nbytes == CAN_MTU) {
+            printf("got legacy CAN frame with length %d\n", cfd.len);
+            /* cfd.flags is undefined */
+    } else {
+            fprintf(stderr, "read: invalid CAN(FD) frame\n");
+            return 1;
+    }
+
+    /* the content can be handled independently from the received MTU size */
+
+    printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len);
+    for (i = 0; i < cfd.len; i++)
+            printf("%02X ", cfd.data[i]);
+
+When reading with size CANFD_MTU only returns CAN_MTU bytes that have
+been received from the socket a legacy CAN frame has been read into the
+provided CAN FD structure. Note that the canfd_frame.flags data field is
+not specified in the struct can_frame and therefore it is only valid in
+CANFD_MTU sized CAN FD frames.
+
+Implementation hint for new CAN applications:
+
+To build a CAN FD aware application use struct canfd_frame as basic CAN
+data structure for CAN_RAW based applications. When the application is
+executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES
+socket option returns an error: No problem. You'll get legacy CAN frames
+or CAN FD frames and can process them the same way.
+
+When sending to CAN devices make sure that the device is capable of handling
+CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU.
+The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
+
+
+RAW socket option CAN_RAW_JOIN_FILTERS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN_RAW socket can set multiple CAN identifier specific filters that
+lead to multiple filters in the af_can.c filter processing. These filters
+are independent from each other which leads to logical OR'ed filters when
+applied (see :ref:`socketcan-rawfilter`).
+
+This socket option joins the given CAN filters in the way that only CAN
+frames are passed to user space that matched *all* given CAN filters. The
+semantic for the applied filters is therefore changed to a logical AND.
+
+This is useful especially when the filterset is a combination of filters
+where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or
+CAN ID ranges from the incoming traffic.
+
+
+RAW Socket Returned Message Flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using the recvmsg() call, msg->msg_flags may contain the following flags:
+
+MSG_DONTROUTE:
+	set when the received frame was created on the local host.
+
+MSG_CONFIRM:
+	set when the frame was sent via the socket it is received on.
+	This flag can be interpreted as a 'transmission confirmation' when the
+	CAN driver supports the echo of frames on driver level, see
+	:ref:`socketcan-local-loopback1` and :ref:`socketcan-local-loopback2`.
+	In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set.
+
+
+Broadcast Manager Protocol Sockets (SOCK_DGRAM)
+-----------------------------------------------
+
+The Broadcast Manager protocol provides a command based configuration
+interface to filter and send (e.g. cyclic) CAN messages in kernel space.
+
+Receive filters can be used to down sample frequent messages; detect events
+such as message contents changes, packet length changes, and do time-out
+monitoring of received messages.
+
+Periodic transmission tasks of CAN frames or a sequence of CAN frames can be
+created and modified at runtime; both the message content and the two
+possible transmit intervals can be altered.
+
+A BCM socket is not intended for sending individual CAN frames using the
+struct can_frame as known from the CAN_RAW socket. Instead a special BCM
+configuration message is defined. The basic BCM configuration message used
+to communicate with the broadcast manager and the available operations are
+defined in the linux/can/bcm.h include. The BCM message consists of a
+message header with a command ('opcode') followed by zero or more CAN frames.
+The broadcast manager sends responses to user space in the same form:
+
+.. code-block:: C
+
+    struct bcm_msg_head {
+            __u32 opcode;                   /* command */
+            __u32 flags;                    /* special flags */
+            __u32 count;                    /* run 'count' times with ival1 */
+            struct timeval ival1, ival2;    /* count and subsequent interval */
+            canid_t can_id;                 /* unique can_id for task */
+            __u32 nframes;                  /* number of can_frames following */
+            struct can_frame frames[0];
+    };
+
+The aligned payload 'frames' uses the same basic CAN frame structure defined
+at the beginning of :ref:`socketcan-rawfd` and in the include/linux/can.h include. All
+messages to the broadcast manager from user space have this structure.
+
+Note a CAN_BCM socket must be connected instead of bound after socket
+creation (example without error checking):
+
+.. code-block:: C
+
+    int s;
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+
+    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
+
+    strcpy(ifr.ifr_name, "can0");
+    ioctl(s, SIOCGIFINDEX, &ifr);
+
+    addr.can_family = AF_CAN;
+    addr.can_ifindex = ifr.ifr_ifindex;
+
+    connect(s, (struct sockaddr *)&addr, sizeof(addr));
+
+    (..)
+
+The broadcast manager socket is able to handle any number of in flight
+transmissions or receive filters concurrently. The different RX/TX jobs are
+distinguished by the unique can_id in each BCM message. However additional
+CAN_BCM sockets are recommended to communicate on multiple CAN interfaces.
+When the broadcast manager socket is bound to 'any' CAN interface (=> the
+interface index is set to zero) the configured receive filters apply to any
+CAN interface unless the sendto() syscall is used to overrule the 'any' CAN
+interface index. When using recvfrom() instead of read() to retrieve BCM
+socket messages the originating CAN interface is provided in can_ifindex.
+
+
+Broadcast Manager Operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The opcode defines the operation for the broadcast manager to carry out,
+or details the broadcast manager's response to several events, including
+user requests.
+
+Transmit Operations (user space to broadcast manager):
+
+TX_SETUP:
+	Create (cyclic) transmission task.
+
+TX_DELETE:
+	Remove (cyclic) transmission task, requires only can_id.
+
+TX_READ:
+	Read properties of (cyclic) transmission task for can_id.
+
+TX_SEND:
+	Send one CAN frame.
+
+Transmit Responses (broadcast manager to user space):
+
+TX_STATUS:
+	Reply to TX_READ request (transmission task configuration).
+
+TX_EXPIRED:
+	Notification when counter finishes sending at initial interval
+	'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP.
+
+Receive Operations (user space to broadcast manager):
+
+RX_SETUP:
+	Create RX content filter subscription.
+
+RX_DELETE:
+	Remove RX content filter subscription, requires only can_id.
+
+RX_READ:
+	Read properties of RX content filter subscription for can_id.
+
+Receive Responses (broadcast manager to user space):
+
+RX_STATUS:
+	Reply to RX_READ request (filter task configuration).
+
+RX_TIMEOUT:
+	Cyclic message is detected to be absent (timer ival1 expired).
+
+RX_CHANGED:
+	BCM message with updated CAN frame (detected content change).
+	Sent on first message received or on receipt of revised CAN messages.
+
+
+Broadcast Manager Message Flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When sending a message to the broadcast manager the 'flags' element may
+contain the following flag definitions which influence the behaviour:
+
+SETTIMER:
+	Set the values of ival1, ival2 and count
+
+STARTTIMER:
+	Start the timer with the actual values of ival1, ival2
+	and count. Starting the timer simultaneously emits a CAN frame.
+
+TX_COUNTEVT:
+	Create the message TX_EXPIRED when count expires
+
+TX_ANNOUNCE:
+	A change of data by the process is emitted immediately.
+
+TX_CP_CAN_ID:
+	Copies the can_id from the message header to each
+	subsequent frame in frames. This is intended as usage simplification. For
+	TX tasks the unique can_id from the message header may differ from the
+	can_id(s) stored for transmission in the subsequent struct can_frame(s).
+
+RX_FILTER_ID:
+	Filter by can_id alone, no frames required (nframes=0).
+
+RX_CHECK_DLC:
+	A change of the DLC leads to an RX_CHANGED.
+
+RX_NO_AUTOTIMER:
+	Prevent automatically starting the timeout monitor.
+
+RX_ANNOUNCE_RESUME:
+	If passed at RX_SETUP and a receive timeout occurred, a
+	RX_CHANGED message will be generated when the (cyclic) receive restarts.
+
+TX_RESET_MULTI_IDX:
+	Reset the index for the multiple frame transmission.
+
+RX_RTR_FRAME:
+	Send reply for RTR-request (placed in op->frames[0]).
+
+
+Broadcast Manager Transmission Timers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Periodic transmission configurations may use up to two interval timers.
+In this case the BCM sends a number of messages ('count') at an interval
+'ival1', then continuing to send at another given interval 'ival2'. When
+only one timer is needed 'count' is set to zero and only 'ival2' is used.
+When the SETTIMER and STARTTIMER flags are set the timers are activated.
+The timer values can be altered at runtime when only SETTIMER is set.
+
+
+Broadcast Manager message sequence transmission
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic
+TX task configuration. The number of CAN frames is provided in the 'nframes'
+element of the BCM message head. The defined number of CAN frames are added
+as array to the TX_SETUP BCM configuration message:
+
+.. code-block:: C
+
+    /* create a struct to set up a sequence of four CAN frames */
+    struct {
+            struct bcm_msg_head msg_head;
+            struct can_frame frame[4];
+    } mytxmsg;
+
+    (..)
+    mytxmsg.msg_head.nframes = 4;
+    (..)
+
+    write(s, &mytxmsg, sizeof(mytxmsg));
+
+With every transmission the index in the array of CAN frames is increased
+and set to zero at index overflow.
+
+
+Broadcast Manager Receive Filter Timers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP.
+When the SETTIMER flag is set the timers are enabled:
+
+ival1:
+	Send RX_TIMEOUT when a received message is not received again within
+	the given time. When STARTTIMER is set at RX_SETUP the timeout detection
+	is activated directly - even without a former CAN frame reception.
+
+ival2:
+	Throttle the received message rate down to the value of ival2. This
+	is useful to reduce messages for the application when the signal inside the
+	CAN frame is stateless as state changes within the ival2 period may get
+	lost.
+
+Broadcast Manager Multiplex Message Receive Filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To filter for content changes in multiplex message sequences an array of more
+than one CAN frame can be passed in a RX_SETUP configuration message. The
+data bytes of the first CAN frame contain the mask of relevant bits that
+have to match in the subsequent CAN frames with the received CAN frame.
+If one of the subsequent CAN frames matches, the bits in that frame's data
+mark the relevant content to be compared with the previously received content.
+Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN
+filters) can be added as array to the RX_SETUP BCM configuration message:
+
+.. code-block:: C
+
+    /* usually used to clear CAN frame data[] - beware of endian problems! */
+    #define U64_DATA(p) (*(unsigned long long*)(p)->data)
+
+    struct {
+            struct bcm_msg_head msg_head;
+            struct can_frame frame[5];
+    } msg;
+
+    msg.msg_head.opcode  = RX_SETUP;
+    msg.msg_head.can_id  = 0x42;
+    msg.msg_head.flags   = 0;
+    msg.msg_head.nframes = 5;
+    U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */
+    U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */
+    U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */
+    U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */
+    U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */
+
+    write(s, &msg, sizeof(msg));
+
+
+Broadcast Manager CAN FD Support
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The programming API of the CAN_BCM depends on struct can_frame which is
+given as array directly behind the bcm_msg_head structure. To follow this
+schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head
+flags indicates that the concatenated CAN frame structures behind the
+bcm_msg_head are defined as struct canfd_frame:
+
+.. code-block:: C
+
+    struct {
+            struct bcm_msg_head msg_head;
+            struct canfd_frame frame[5];
+    } msg;
+
+    msg.msg_head.opcode  = RX_SETUP;
+    msg.msg_head.can_id  = 0x42;
+    msg.msg_head.flags   = CAN_FD_FRAME;
+    msg.msg_head.nframes = 5;
+    (..)
+
+When using CAN FD frames for multiplex filtering the MUX mask is still
+expected in the first 64 bit of the struct canfd_frame data section.
+
+
+Connected Transport Protocols (SOCK_SEQPACKET)
+----------------------------------------------
+
+(to be written)
+
+
+Unconnected Transport Protocols (SOCK_DGRAM)
+--------------------------------------------
+
+(to be written)
+
+
+.. _socketcan-core-module:
+
+SocketCAN Core Module
+=====================
+
+The SocketCAN core module implements the protocol family
+PF_CAN. CAN protocol modules are loaded by the core module at
+runtime. The core module provides an interface for CAN protocol
+modules to subscribe needed CAN IDs (see :ref:`socketcan-receive-lists`).
+
+
+can.ko Module Params
+--------------------
+
+- **stats_timer**:
+  To calculate the SocketCAN core statistics
+  (e.g. current/maximum frames per second) this 1 second timer is
+  invoked at can.ko module start time by default. This timer can be
+  disabled by using stats_timer=0 on the module commandline.
+
+- **debug**:
+  (removed since SocketCAN SVN r546)
+
+
+procfs content
+--------------
+
+As described in :ref:`socketcan-receive-lists` the SocketCAN core uses several filter
+lists to deliver received CAN frames to CAN protocol modules. These
+receive lists, their filters and the count of filter matches can be
+checked in the appropriate receive list. All entries contain the
+device and a protocol module identifier::
+
+    foo@bar:~$ cat /proc/net/can/rcvlist_all
+
+    receive list 'rx_all':
+      (vcan3: no entry)
+      (vcan2: no entry)
+      (vcan1: no entry)
+      device   can_id   can_mask  function  userdata   matches  ident
+       vcan0     000    00000000  f88e6370  f6c6f400         0  raw
+      (any: no entry)
+
+Above, an application requests any CAN traffic from vcan0. Available lists::
+
+    rcvlist_all - list for unfiltered entries (no filter operations)
+    rcvlist_eff - list for single extended frame (EFF) entries
+    rcvlist_err - list for error message frames masks
+    rcvlist_fil - list for mask/value filters
+    rcvlist_inv - list for mask/value filters (inverse semantic)
+    rcvlist_sff - list for single standard frame (SFF) entries
+
+Additional procfs files in /proc/net/can::
+
+    stats       - SocketCAN core statistics (rx/tx frames, match ratios, ...)
+    reset_stats - manual statistic reset
+    version     - prints the SocketCAN core version and the ABI version
+
+
+Writing Own CAN Protocol Modules
+--------------------------------
+
+To implement a new protocol in the protocol family PF_CAN a new
+protocol has to be defined in include/linux/can.h .
+The prototypes and definitions to use the SocketCAN core can be
+accessed by including include/linux/can/core.h .
+In addition to functions that register the CAN protocol and the
+CAN device notifier chain there are functions to subscribe CAN
+frames received by CAN interfaces and to send CAN frames::
+
+    can_rx_register   - subscribe CAN frames from a specific interface
+    can_rx_unregister - unsubscribe CAN frames from a specific interface
+    can_send          - transmit a CAN frame (optional with local loopback)
+
+For details see the kerneldoc documentation in net/can/af_can.c or
+the source code of net/can/raw.c or net/can/bcm.c .
+
+
+CAN Network Drivers
+===================
+
+Writing a CAN network device driver is much easier than writing a
+CAN character device driver. Similar to other known network device
+drivers you mainly have to deal with:
+
+- TX: Put the CAN frame from the socket buffer to the CAN controller.
+- RX: Put the CAN frame from the CAN controller to the socket buffer.
+
+See e.g. Documentation/networking/netdevices.txt . The differences
+for writing a CAN network device driver are described below:
+
+
+General Settings
+----------------
+
+.. code-block:: C
+
+    dev->type  = ARPHRD_CAN; /* the netdevice hardware type */
+    dev->flags = IFF_NOARP;  /* CAN has no arp */
+
+    dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */
+
+    or alternative, when the controller supports CAN with flexible data rate:
+    dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */
+
+The struct can_frame or struct canfd_frame is the payload of each socket
+buffer (skbuff) in the protocol family PF_CAN.
+
+
+.. _socketcan-local-loopback2:
+
+Local Loopback of Sent Frames
+-----------------------------
+
+As described in :ref:`socketcan-local-loopback1` the CAN network device driver should
+support a local loopback functionality similar to the local echo
+e.g. of tty devices. In this case the driver flag IFF_ECHO has to be
+set to prevent the PF_CAN core from locally echoing sent frames
+(aka loopback) as fallback solution::
+
+    dev->flags = (IFF_NOARP | IFF_ECHO);
+
+
+CAN Controller Hardware Filters
+-------------------------------
+
+To reduce the interrupt load on deep embedded systems some CAN
+controllers support the filtering of CAN IDs or ranges of CAN IDs.
+These hardware filter capabilities vary from controller to
+controller and have to be identified as not feasible in a multi-user
+networking approach. The use of the very controller specific
+hardware filters could make sense in a very dedicated use-case, as a
+filter on driver level would affect all users in the multi-user
+system. The highly efficient filter sets inside the PF_CAN core allow
+to set different multiple filters for each socket separately.
+Therefore the use of hardware filters goes to the category 'handmade
+tuning on deep embedded systems'. The author is running a MPC603e
+@133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
+load without any problems ...
+
+
+The Virtual CAN Driver (vcan)
+-----------------------------
+
+Similar to the network loopback devices, vcan offers a virtual local
+CAN interface. A fully qualified address on CAN consists of
+
+- a unique CAN Identifier (CAN ID)
+- the CAN bus this CAN ID is transmitted on (e.g. can0)
+
+so in common use cases more than one virtual CAN interface is needed.
+
+The virtual CAN interfaces allow the transmission and reception of CAN
+frames without real CAN controller hardware. Virtual CAN network
+devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
+When compiled as a module the virtual CAN driver module is called vcan.ko
+
+Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
+netlink interface to create vcan network devices. The creation and
+removal of vcan network devices can be managed with the ip(8) tool::
+
+  - Create a virtual CAN network interface:
+       $ ip link add type vcan
+
+  - Create a virtual CAN network interface with a specific name 'vcan42':
+       $ ip link add dev vcan42 type vcan
+
+  - Remove a (virtual CAN) network interface 'vcan42':
+       $ ip link del vcan42
+
+
+The CAN Network Device Driver Interface
+---------------------------------------
+
+The CAN network device driver interface provides a generic interface
+to setup, configure and monitor CAN network devices. The user can then
+configure the CAN device, like setting the bit-timing parameters, via
+the netlink interface using the program "ip" from the "IPROUTE2"
+utility suite. The following chapter describes briefly how to use it.
+Furthermore, the interface uses a common data structure and exports a
+set of common functions, which all real CAN network device drivers
+should use. Please have a look at the SJA1000 or MSCAN driver to
+understand how to use them. The name of the module is can-dev.ko.
+
+
+Netlink interface to set/get devices properties
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN device must be configured via netlink interface. The supported
+netlink message types are defined and briefly described in
+"include/linux/can/netlink.h". CAN link support for the program "ip"
+of the IPROUTE2 utility suite is available and it can be used as shown
+below:
+
+Setting CAN device properties::
+
+    $ ip link set can0 type can help
+    Usage: ip link set DEVICE type can
+        [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
+        [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
+          phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
+
+        [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] |
+        [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1
+          dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ]
+
+        [ loopback { on | off } ]
+        [ listen-only { on | off } ]
+        [ triple-sampling { on | off } ]
+        [ one-shot { on | off } ]
+        [ berr-reporting { on | off } ]
+        [ fd { on | off } ]
+        [ fd-non-iso { on | off } ]
+        [ presume-ack { on | off } ]
+
+        [ restart-ms TIME-MS ]
+        [ restart ]
+
+        Where: BITRATE       := { 1..1000000 }
+               SAMPLE-POINT  := { 0.000..0.999 }
+               TQ            := { NUMBER }
+               PROP-SEG      := { 1..8 }
+               PHASE-SEG1    := { 1..8 }
+               PHASE-SEG2    := { 1..8 }
+               SJW           := { 1..4 }
+               RESTART-MS    := { 0 | NUMBER }
+
+Display CAN device details and statistics::
+
+    $ ip -details -statistics link show can0
+    2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
+      link/can
+      can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
+      bitrate 125000 sample-point 0.875
+      tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
+      sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
+      clock 8000000
+      re-started bus-errors arbit-lost error-warn error-pass bus-off
+      41         17457      0          41         42         41
+      RX: bytes  packets  errors  dropped overrun mcast
+      140859     17608    17457   0       0       0
+      TX: bytes  packets  errors  dropped carrier collsns
+      861        112      0       41      0       0
+
+More info to the above output:
+
+"<TRIPLE-SAMPLING>"
+	Shows the list of selected CAN controller modes: LOOPBACK,
+	LISTEN-ONLY, or TRIPLE-SAMPLING.
+
+"state ERROR-ACTIVE"
+	The current state of the CAN controller: "ERROR-ACTIVE",
+	"ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
+
+"restart-ms 100"
+	Automatic restart delay time. If set to a non-zero value, a
+	restart of the CAN controller will be triggered automatically
+	in case of a bus-off condition after the specified delay time
+	in milliseconds. By default it's off.
+
+"bitrate 125000 sample-point 0.875"
+	Shows the real bit-rate in bits/sec and the sample-point in the
+	range 0.000..0.999. If the calculation of bit-timing parameters
+	is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
+	bit-timing can be defined by setting the "bitrate" argument.
+	Optionally the "sample-point" can be specified. By default it's
+	0.000 assuming CIA-recommended sample-points.
+
+"tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
+	Shows the time quanta in ns, propagation segment, phase buffer
+	segment 1 and 2 and the synchronisation jump width in units of
+	tq. They allow to define the CAN bit-timing in a hardware
+	independent format as proposed by the Bosch CAN 2.0 spec (see
+	chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
+
+"sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 clock 8000000"
+	Shows the bit-timing constants of the CAN controller, here the
+	"sja1000". The minimum and maximum values of the time segment 1
+	and 2, the synchronisation jump width in units of tq, the
+	bitrate pre-scaler and the CAN system clock frequency in Hz.
+	These constants could be used for user-defined (non-standard)
+	bit-timing calculation algorithms in user-space.
+
+"re-started bus-errors arbit-lost error-warn error-pass bus-off"
+	Shows the number of restarts, bus and arbitration lost errors,
+	and the state changes to the error-warning, error-passive and
+	bus-off state. RX overrun errors are listed in the "overrun"
+	field of the standard network statistics.
+
+Setting the CAN Bit-Timing
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN bit-timing parameters can always be defined in a hardware
+independent format as proposed in the Bosch CAN 2.0 specification
+specifying the arguments "tq", "prop-seg", "phase-seg1", "phase-seg2"
+and "sjw"::
+
+    $ ip link set canX type can tq 125 prop-seg 6 \
+				phase-seg1 7 phase-seg2 2 sjw 1
+
+If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
+recommended CAN bit-timing parameters will be calculated if the bit-
+rate is specified with the argument "bitrate"::
+
+    $ ip link set canX type can bitrate 125000
+
+Note that this works fine for the most common CAN controllers with
+standard bit-rates but may *fail* for exotic bit-rates or CAN system
+clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
+space and allows user-space tools to solely determine and set the
+bit-timing parameters. The CAN controller specific bit-timing
+constants can be used for that purpose. They are listed by the
+following command::
+
+    $ ip -details link show can0
+    ...
+      sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
+
+
+Starting and Stopping the CAN Network Device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A CAN network device is started or stopped as usual with the command
+"ifconfig canX up/down" or "ip link set canX up/down". Be aware that
+you *must* define proper bit-timing parameters for real CAN devices
+before you can start it to avoid error-prone default settings::
+
+    $ ip link set canX up type can bitrate 125000
+
+A device may enter the "bus-off" state if too many errors occurred on
+the CAN bus. Then no more messages are received or sent. An automatic
+bus-off recovery can be enabled by setting the "restart-ms" to a
+non-zero value, e.g.::
+
+    $ ip link set canX type can restart-ms 100
+
+Alternatively, the application may realize the "bus-off" condition
+by monitoring CAN error message frames and do a restart when
+appropriate with the command::
+
+    $ ip link set canX type can restart
+
+Note that a restart will also create a CAN error message frame (see
+also :ref:`socketcan-network-problem-notifications`).
+
+
+.. _socketcan-can-fd-driver:
+
+CAN FD (Flexible Data Rate) Driver Support
+------------------------------------------
+
+CAN FD capable CAN controllers support two different bitrates for the
+arbitration phase and the payload phase of the CAN FD frame. Therefore a
+second bit timing has to be specified in order to enable the CAN FD bitrate.
+
+Additionally CAN FD capable CAN controllers support up to 64 bytes of
+payload. The representation of this length in can_frame.can_dlc and
+canfd_frame.len for userspace applications and inside the Linux network
+layer is a plain value from 0 .. 64 instead of the CAN 'data length code'.
+The data length code was a 1:1 mapping to the payload length in the legacy
+CAN frames anyway. The payload length to the bus-relevant DLC mapping is
+only performed inside the CAN drivers, preferably with the helper
+functions can_dlc2len() and can_len2dlc().
+
+The CAN netdevice driver capabilities can be distinguished by the network
+device's maximum transfer unit (MTU)::
+
+  MTU = 16 (CAN_MTU)   => sizeof(struct can_frame)   => 'legacy' CAN device
+  MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device
+
+The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
+N.B. CAN FD capable devices can also handle and send legacy CAN frames.
+
+When configuring CAN FD capable CAN controllers an additional 'data' bitrate
+has to be set. This bitrate for the data phase of the CAN FD frame has to be
+at least the bitrate which was configured for the arbitration phase. This
+second bitrate is specified analogously to the first bitrate but the bitrate
+setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate,
+dsample-point, dsjw or dtq and similar settings. When a data bitrate is set
+within the configuration process the controller option "fd on" can be
+specified to enable the CAN FD mode in the CAN controller. This controller
+option also switches the device MTU to 72 (CANFD_MTU).
+
+The first CAN FD specification presented as whitepaper at the International
+CAN Conference 2012 needed to be improved for data integrity reasons.
+Therefore two CAN FD implementations have to be distinguished today:
+
+- ISO compliant:     The ISO 11898-1:2015 CAN FD implementation (default)
+- non-ISO compliant: The CAN FD implementation following the 2012 whitepaper
+
+Finally there are three types of CAN FD controllers:
+
+1. ISO compliant (fixed)
+2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c)
+3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD)
+
+The current ISO/non-ISO mode is announced by the CAN controller driver via
+netlink and displayed by the 'ip' tool (controller option FD-NON-ISO).
+The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for
+switchable CAN FD controllers only.
+
+Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate::
+
+    $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \
+                                   dbitrate 4000000 dsample-point 0.8 fd on
+    $ ip -details link show can0
+    5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \
+             mode DEFAULT group default qlen 10
+    link/can  promiscuity 0
+    can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
+          bitrate 500000 sample-point 0.750
+          tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1
+          pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \
+          brp-inc 1
+          dbitrate 4000000 dsample-point 0.800
+          dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1
+          pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \
+          dbrp-inc 1
+          clock 80000000
+
+Example when 'fd-non-iso on' is added on this switchable CAN FD adapter::
+
+   can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
+
+
+Supported CAN Hardware
+----------------------
+
+Please check the "Kconfig" file in "drivers/net/can" to get a current
+list of the supported CAN hardware. On the SocketCAN project website
+(see :ref:`socketcan-resources`) there might be further drivers available, also for
+older kernel versions.
+
+
+.. _socketcan-resources:
+
+SocketCAN Resources
+===================
+
+The Linux CAN / SocketCAN project resources (project site / mailing list)
+are referenced in the MAINTAINERS file in the Linux source tree.
+Search for CAN NETWORK [LAYERS|DRIVERS].
+
+Credits
+=======
+
+- Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
+- Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
+- Jan Kiszka (RT-SocketCAN core, Socket-API reconciliation)
+- Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews, CAN device driver interface, MSCAN driver)
+- Robert Schwebel (design reviews, PTXdist integration)
+- Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
+- Benedikt Spranger (reviews)
+- Thomas Gleixner (LKML reviews, coding style, posting hints)
+- Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
+- Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
+- Klaus Hitschler (PEAK driver integration)
+- Uwe Koppe (CAN netdevices with PF_PACKET approach)
+- Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
+- Pavel Pisa (Bit-timing calculation)
+- Sascha Hauer (SJA1000 platform driver)
+- Sebastian Haas (SJA1000 EMS PCI driver)
+- Markus Plessing (SJA1000 EMS PCI driver)
+- Per Dalen (SJA1000 Kvaser PCI driver)
+- Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
deleted file mode 100644
index aa15b9ee2e70..000000000000
--- a/Documentation/networking/can.txt
+++ /dev/null
@@ -1,1308 +0,0 @@
-============================================================================
-
-can.txt
-
-Readme file for the Controller Area Network Protocol Family (aka SocketCAN)
-
-This file contains
-
-  1 Overview / What is SocketCAN
-
-  2 Motivation / Why using the socket API
-
-  3 SocketCAN concept
-    3.1 receive lists
-    3.2 local loopback of sent frames
-    3.3 network problem notifications
-
-  4 How to use SocketCAN
-    4.1 RAW protocol sockets with can_filters (SOCK_RAW)
-      4.1.1 RAW socket option CAN_RAW_FILTER
-      4.1.2 RAW socket option CAN_RAW_ERR_FILTER
-      4.1.3 RAW socket option CAN_RAW_LOOPBACK
-      4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
-      4.1.5 RAW socket option CAN_RAW_FD_FRAMES
-      4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS
-      4.1.7 RAW socket returned message flags
-    4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
-      4.2.1 Broadcast Manager operations
-      4.2.2 Broadcast Manager message flags
-      4.2.3 Broadcast Manager transmission timers
-      4.2.4 Broadcast Manager message sequence transmission
-      4.2.5 Broadcast Manager receive filter timers
-      4.2.6 Broadcast Manager multiplex message receive filter
-      4.2.7 Broadcast Manager CAN FD support
-    4.3 connected transport protocols (SOCK_SEQPACKET)
-    4.4 unconnected transport protocols (SOCK_DGRAM)
-
-  5 SocketCAN core module
-    5.1 can.ko module params
-    5.2 procfs content
-    5.3 writing own CAN protocol modules
-
-  6 CAN network drivers
-    6.1 general settings
-    6.2 local loopback of sent frames
-    6.3 CAN controller hardware filters
-    6.4 The virtual CAN driver (vcan)
-    6.5 The CAN network device driver interface
-      6.5.1 Netlink interface to set/get devices properties
-      6.5.2 Setting the CAN bit-timing
-      6.5.3 Starting and stopping the CAN network device
-    6.6 CAN FD (flexible data rate) driver support
-    6.7 supported CAN hardware
-
-  7 SocketCAN resources
-
-  8 Credits
-
-============================================================================
-
-1. Overview / What is SocketCAN
---------------------------------
-
-The socketcan package is an implementation of CAN protocols
-(Controller Area Network) for Linux.  CAN is a networking technology
-which has widespread use in automation, embedded devices, and
-automotive fields.  While there have been other CAN implementations
-for Linux based on character devices, SocketCAN uses the Berkeley
-socket API, the Linux network stack and implements the CAN device
-drivers as network interfaces.  The CAN socket API has been designed
-as similar as possible to the TCP/IP protocols to allow programmers,
-familiar with network programming, to easily learn how to use CAN
-sockets.
-
-2. Motivation / Why using the socket API
-----------------------------------------
-
-There have been CAN implementations for Linux before SocketCAN so the
-question arises, why we have started another project.  Most existing
-implementations come as a device driver for some CAN hardware, they
-are based on character devices and provide comparatively little
-functionality.  Usually, there is only a hardware-specific device
-driver which provides a character device interface to send and
-receive raw CAN frames, directly to/from the controller hardware.
-Queueing of frames and higher-level transport protocols like ISO-TP
-have to be implemented in user space applications.  Also, most
-character-device implementations support only one single process to
-open the device at a time, similar to a serial interface.  Exchanging
-the CAN controller requires employment of another device driver and
-often the need for adaption of large parts of the application to the
-new driver's API.
-
-SocketCAN was designed to overcome all of these limitations.  A new
-protocol family has been implemented which provides a socket interface
-to user space applications and which builds upon the Linux network
-layer, enabling use all of the provided queueing functionality.  A device
-driver for CAN controller hardware registers itself with the Linux
-network layer as a network device, so that CAN frames from the
-controller can be passed up to the network layer and on to the CAN
-protocol family module and also vice-versa.  Also, the protocol family
-module provides an API for transport protocol modules to register, so
-that any number of transport protocols can be loaded or unloaded
-dynamically.  In fact, the can core module alone does not provide any
-protocol and cannot be used without loading at least one additional
-protocol module.  Multiple sockets can be opened at the same time,
-on different or the same protocol module and they can listen/send
-frames on different or the same CAN IDs.  Several sockets listening on
-the same interface for frames with the same CAN ID are all passed the
-same received matching CAN frames.  An application wishing to
-communicate using a specific transport protocol, e.g. ISO-TP, just
-selects that protocol when opening the socket, and then can read and
-write application data byte streams, without having to deal with
-CAN-IDs, frames, etc.
-
-Similar functionality visible from user-space could be provided by a
-character device, too, but this would lead to a technically inelegant
-solution for a couple of reasons:
-
-* Intricate usage.  Instead of passing a protocol argument to
-  socket(2) and using bind(2) to select a CAN interface and CAN ID, an
-  application would have to do all these operations using ioctl(2)s.
-
-* Code duplication.  A character device cannot make use of the Linux
-  network queueing code, so all that code would have to be duplicated
-  for CAN networking.
-
-* Abstraction.  In most existing character-device implementations, the
-  hardware-specific device driver for a CAN controller directly
-  provides the character device for the application to work with.
-  This is at least very unusual in Unix systems for both, char and
-  block devices.  For example you don't have a character device for a
-  certain UART of a serial interface, a certain sound chip in your
-  computer, a SCSI or IDE controller providing access to your hard
-  disk or tape streamer device.  Instead, you have abstraction layers
-  which provide a unified character or block device interface to the
-  application on the one hand, and a interface for hardware-specific
-  device drivers on the other hand.  These abstractions are provided
-  by subsystems like the tty layer, the audio subsystem or the SCSI
-  and IDE subsystems for the devices mentioned above.
-
-  The easiest way to implement a CAN device driver is as a character
-  device without such a (complete) abstraction layer, as is done by most
-  existing drivers.  The right way, however, would be to add such a
-  layer with all the functionality like registering for certain CAN
-  IDs, supporting several open file descriptors and (de)multiplexing
-  CAN frames between them, (sophisticated) queueing of CAN frames, and
-  providing an API for device drivers to register with.  However, then
-  it would be no more difficult, or may be even easier, to use the
-  networking framework provided by the Linux kernel, and this is what
-  SocketCAN does.
-
-  The use of the networking framework of the Linux kernel is just the
-  natural and most appropriate way to implement CAN for Linux.
-
-3. SocketCAN concept
----------------------
-
-  As described in chapter 2 it is the main goal of SocketCAN to
-  provide a socket interface to user space applications which builds
-  upon the Linux network layer. In contrast to the commonly known
-  TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!)
-  medium that has no MAC-layer addressing like ethernet. The CAN-identifier
-  (can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs
-  have to be chosen uniquely on the bus. When designing a CAN-ECU
-  network the CAN-IDs are mapped to be sent by a specific ECU.
-  For this reason a CAN-ID can be treated best as a kind of source address.
-
-  3.1 receive lists
-
-  The network transparent access of multiple applications leads to the
-  problem that different applications may be interested in the same
-  CAN-IDs from the same CAN network interface. The SocketCAN core
-  module - which implements the protocol family CAN - provides several
-  high efficient receive lists for this reason. If e.g. a user space
-  application opens a CAN RAW socket, the raw protocol module itself
-  requests the (range of) CAN-IDs from the SocketCAN core that are
-  requested by the user. The subscription and unsubscription of
-  CAN-IDs can be done for specific CAN interfaces or for all(!) known
-  CAN interfaces with the can_rx_(un)register() functions provided to
-  CAN protocol modules by the SocketCAN core (see chapter 5).
-  To optimize the CPU usage at runtime the receive lists are split up
-  into several specific lists per device that match the requested
-  filter complexity for a given use-case.
-
-  3.2 local loopback of sent frames
-
-  As known from other networking concepts the data exchanging
-  applications may run on the same or different nodes without any
-  change (except for the according addressing information):
-
-         ___   ___   ___                   _______   ___
-        | _ | | _ | | _ |                 | _   _ | | _ |
-        ||A|| ||B|| ||C||                 ||A| |B|| ||C||
-        |___| |___| |___|                 |_______| |___|
-          |     |     |                       |       |
-        -----------------(1)- CAN bus -(2)---------------
-
-  To ensure that application A receives the same information in the
-  example (2) as it would receive in example (1) there is need for
-  some kind of local loopback of the sent CAN frames on the appropriate
-  node.
-
-  The Linux network devices (by default) just can handle the
-  transmission and reception of media dependent frames. Due to the
-  arbitration on the CAN bus the transmission of a low prio CAN-ID
-  may be delayed by the reception of a high prio CAN frame. To
-  reflect the correct* traffic on the node the loopback of the sent
-  data has to be performed right after a successful transmission. If
-  the CAN network interface is not capable of performing the loopback for
-  some reason the SocketCAN core can do this task as a fallback solution.
-  See chapter 6.2 for details (recommended).
-
-  The loopback functionality is enabled by default to reflect standard
-  networking behaviour for CAN applications. Due to some requests from
-  the RT-SocketCAN group the loopback optionally may be disabled for each
-  separate socket. See sockopts from the CAN RAW sockets in chapter 4.1.
-
-  * = you really like to have this when you're running analyser tools
-      like 'candump' or 'cansniffer' on the (same) node.
-
-  3.3 network problem notifications
-
-  The use of the CAN bus may lead to several problems on the physical
-  and media access control layer. Detecting and logging of these lower
-  layer problems is a vital requirement for CAN users to identify
-  hardware issues on the physical transceiver layer as well as
-  arbitration problems and error frames caused by the different
-  ECUs. The occurrence of detected errors are important for diagnosis
-  and have to be logged together with the exact timestamp. For this
-  reason the CAN interface driver can generate so called Error Message
-  Frames that can optionally be passed to the user application in the
-  same way as other CAN frames. Whenever an error on the physical layer
-  or the MAC layer is detected (e.g. by the CAN controller) the driver
-  creates an appropriate error message frame. Error messages frames can
-  be requested by the user application using the common CAN filter
-  mechanisms. Inside this filter definition the (interested) type of
-  errors may be selected. The reception of error messages is disabled
-  by default. The format of the CAN error message frame is briefly
-  described in the Linux header file "include/uapi/linux/can/error.h".
-
-4. How to use SocketCAN
-------------------------
-
-  Like TCP/IP, you first need to open a socket for communicating over a
-  CAN network. Since SocketCAN implements a new protocol family, you
-  need to pass PF_CAN as the first argument to the socket(2) system
-  call. Currently, there are two CAN protocols to choose from, the raw
-  socket protocol and the broadcast manager (BCM). So to open a socket,
-  you would write
-
-    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
-
-  and
-
-    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
-
-  respectively.  After the successful creation of the socket, you would
-  normally use the bind(2) system call to bind the socket to a CAN
-  interface (which is different from TCP/IP due to different addressing
-  - see chapter 3). After binding (CAN_RAW) or connecting (CAN_BCM)
-  the socket, you can read(2) and write(2) from/to the socket or use
-  send(2), sendto(2), sendmsg(2) and the recv* counterpart operations
-  on the socket as usual. There are also CAN specific socket options
-  described below.
-
-  The basic CAN frame structure and the sockaddr structure are defined
-  in include/linux/can.h:
-
-    struct can_frame {
-            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
-            __u8    can_dlc; /* frame payload length in byte (0 .. 8) */
-            __u8    __pad;   /* padding */
-            __u8    __res0;  /* reserved / padding */
-            __u8    __res1;  /* reserved / padding */
-            __u8    data[8] __attribute__((aligned(8)));
-    };
-
-  The alignment of the (linear) payload data[] to a 64bit boundary
-  allows the user to define their own structs and unions to easily access
-  the CAN payload. There is no given byteorder on the CAN bus by
-  default. A read(2) system call on a CAN_RAW socket transfers a
-  struct can_frame to the user space.
-
-  The sockaddr_can structure has an interface index like the
-  PF_PACKET socket, that also binds to a specific interface:
-
-    struct sockaddr_can {
-            sa_family_t can_family;
-            int         can_ifindex;
-            union {
-                    /* transport protocol class address info (e.g. ISOTP) */
-                    struct { canid_t rx_id, tx_id; } tp;
-
-                    /* reserved for future CAN protocols address information */
-            } can_addr;
-    };
-
-  To determine the interface index an appropriate ioctl() has to
-  be used (example for CAN_RAW sockets without error checking):
-
-    int s;
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-
-    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
-
-    strcpy(ifr.ifr_name, "can0" );
-    ioctl(s, SIOCGIFINDEX, &ifr);
-
-    addr.can_family = AF_CAN;
-    addr.can_ifindex = ifr.ifr_ifindex;
-
-    bind(s, (struct sockaddr *)&addr, sizeof(addr));
-
-    (..)
-
-  To bind a socket to all(!) CAN interfaces the interface index must
-  be 0 (zero). In this case the socket receives CAN frames from every
-  enabled CAN interface. To determine the originating CAN interface
-  the system call recvfrom(2) may be used instead of read(2). To send
-  on a socket that is bound to 'any' interface sendto(2) is needed to
-  specify the outgoing interface.
-
-  Reading CAN frames from a bound CAN_RAW socket (see above) consists
-  of reading a struct can_frame:
-
-    struct can_frame frame;
-
-    nbytes = read(s, &frame, sizeof(struct can_frame));
-
-    if (nbytes < 0) {
-            perror("can raw socket read");
-            return 1;
-    }
-
-    /* paranoid check ... */
-    if (nbytes < sizeof(struct can_frame)) {
-            fprintf(stderr, "read: incomplete CAN frame\n");
-            return 1;
-    }
-
-    /* do something with the received CAN frame */
-
-  Writing CAN frames can be done similarly, with the write(2) system call:
-
-    nbytes = write(s, &frame, sizeof(struct can_frame));
-
-  When the CAN interface is bound to 'any' existing CAN interface
-  (addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the
-  information about the originating CAN interface is needed:
-
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-    socklen_t len = sizeof(addr);
-    struct can_frame frame;
-
-    nbytes = recvfrom(s, &frame, sizeof(struct can_frame),
-                      0, (struct sockaddr*)&addr, &len);
-
-    /* get interface name of the received CAN frame */
-    ifr.ifr_ifindex = addr.can_ifindex;
-    ioctl(s, SIOCGIFNAME, &ifr);
-    printf("Received a CAN frame from interface %s", ifr.ifr_name);
-
-  To write CAN frames on sockets bound to 'any' CAN interface the
-  outgoing interface has to be defined certainly.
-
-    strcpy(ifr.ifr_name, "can0");
-    ioctl(s, SIOCGIFINDEX, &ifr);
-    addr.can_ifindex = ifr.ifr_ifindex;
-    addr.can_family  = AF_CAN;
-
-    nbytes = sendto(s, &frame, sizeof(struct can_frame),
-                    0, (struct sockaddr*)&addr, sizeof(addr));
-
-  An accurate timestamp can be obtained with an ioctl(2) call after reading
-  a message from the socket:
-
-    struct timeval tv;
-    ioctl(s, SIOCGSTAMP, &tv);
-
-  The timestamp has a resolution of one microsecond and is set automatically
-  at the reception of a CAN frame.
-
-  Remark about CAN FD (flexible data rate) support:
-
-  Generally the handling of CAN FD is very similar to the formerly described
-  examples. The new CAN FD capable CAN controllers support two different
-  bitrates for the arbitration phase and the payload phase of the CAN FD frame
-  and up to 64 bytes of payload. This extended payload length breaks all the
-  kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight
-  bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g.
-  the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that
-  switches the socket into a mode that allows the handling of CAN FD frames
-  and (legacy) CAN frames simultaneously (see section 4.1.5).
-
-  The struct canfd_frame is defined in include/linux/can.h:
-
-    struct canfd_frame {
-            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
-            __u8    len;     /* frame payload length in byte (0 .. 64) */
-            __u8    flags;   /* additional flags for CAN FD */
-            __u8    __res0;  /* reserved / padding */
-            __u8    __res1;  /* reserved / padding */
-            __u8    data[64] __attribute__((aligned(8)));
-    };
-
-  The struct canfd_frame and the existing struct can_frame have the can_id,
-  the payload length and the payload data at the same offset inside their
-  structures. This allows to handle the different structures very similar.
-  When the content of a struct can_frame is copied into a struct canfd_frame
-  all structure elements can be used as-is - only the data[] becomes extended.
-
-  When introducing the struct canfd_frame it turned out that the data length
-  code (DLC) of the struct can_frame was used as a length information as the
-  length and the DLC has a 1:1 mapping in the range of 0 .. 8. To preserve
-  the easy handling of the length information the canfd_frame.len element
-  contains a plain length value from 0 .. 64. So both canfd_frame.len and
-  can_frame.can_dlc are equal and contain a length information and no DLC.
-  For details about the distinction of CAN and CAN FD capable devices and
-  the mapping to the bus-relevant data length code (DLC), see chapter 6.6.
-
-  The length of the two CAN(FD) frame structures define the maximum transfer
-  unit (MTU) of the CAN(FD) network interface and skbuff data length. Two
-  definitions are specified for CAN specific MTUs in include/linux/can.h :
-
-  #define CAN_MTU   (sizeof(struct can_frame))   == 16  => 'legacy' CAN frame
-  #define CANFD_MTU (sizeof(struct canfd_frame)) == 72  => CAN FD frame
-
-  4.1 RAW protocol sockets with can_filters (SOCK_RAW)
-
-  Using CAN_RAW sockets is extensively comparable to the commonly
-  known access to CAN character devices. To meet the new possibilities
-  provided by the multi user SocketCAN approach, some reasonable
-  defaults are set at RAW socket binding time:
-
-  - The filters are set to exactly one filter receiving everything
-  - The socket only receives valid data frames (=> no error message frames)
-  - The loopback of sent CAN frames is enabled (see chapter 3.2)
-  - The socket does not receive its own sent frames (in loopback mode)
-
-  These default settings may be changed before or after binding the socket.
-  To use the referenced definitions of the socket options for CAN_RAW
-  sockets, include <linux/can/raw.h>.
-
-  4.1.1 RAW socket option CAN_RAW_FILTER
-
-  The reception of CAN frames using CAN_RAW sockets can be controlled
-  by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
-
-  The CAN filter structure is defined in include/linux/can.h:
-
-    struct can_filter {
-            canid_t can_id;
-            canid_t can_mask;
-    };
-
-  A filter matches, when
-
-    <received_can_id> & mask == can_id & mask
-
-  which is analogous to known CAN controllers hardware filter semantics.
-  The filter can be inverted in this semantic, when the CAN_INV_FILTER
-  bit is set in can_id element of the can_filter structure. In
-  contrast to CAN controller hardware filters the user may set 0 .. n
-  receive filters for each open socket separately:
-
-    struct can_filter rfilter[2];
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = CAN_SFF_MASK;
-    rfilter[1].can_id   = 0x200;
-    rfilter[1].can_mask = 0x700;
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
-
-  To disable the reception of CAN frames on the selected CAN_RAW socket:
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0);
-
-  To set the filters to zero filters is quite obsolete as to not read
-  data causes the raw socket to discard the received CAN frames. But
-  having this 'send only' use-case we may remove the receive list in the
-  Kernel to save a little (really a very little!) CPU usage.
-
-  4.1.1.1 CAN filter usage optimisation
-
-  The CAN filters are processed in per-device filter lists at CAN frame
-  reception time. To reduce the number of checks that need to be performed
-  while walking through the filter lists the CAN core provides an optimized
-  filter handling when the filter subscription focusses on a single CAN ID.
-
-  For the possible 2048 SFF CAN identifiers the identifier is used as an index
-  to access the corresponding subscription list without any further checks.
-  For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as
-  hash function to retrieve the EFF table index.
-
-  To benefit from the optimized filters for single CAN identifiers the
-  CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.mask together
-  with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. A set CAN_EFF_FLAG bit in the
-  can_filter.mask makes clear that it matters whether a SFF or EFF CAN ID is
-  subscribed. E.g. in the example from above
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = CAN_SFF_MASK;
-
-  both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass.
-
-  To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the
-  filter has to be defined in this way to benefit from the optimized filters:
-
-    struct can_filter rfilter[2];
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK);
-    rfilter[1].can_id   = 0x12345678 | CAN_EFF_FLAG;
-    rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK);
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
-
-  4.1.2 RAW socket option CAN_RAW_ERR_FILTER
-
-  As described in chapter 3.3 the CAN interface driver can generate so
-  called Error Message Frames that can optionally be passed to the user
-  application in the same way as other CAN frames. The possible
-  errors are divided into different error classes that may be filtered
-  using the appropriate error mask. To register for every possible
-  error condition CAN_ERR_MASK can be used as value for the error mask.
-  The values for the error mask are defined in linux/can/error.h .
-
-    can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF );
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
-               &err_mask, sizeof(err_mask));
-
-  4.1.3 RAW socket option CAN_RAW_LOOPBACK
-
-  To meet multi user needs the local loopback is enabled by default
-  (see chapter 3.2 for details). But in some embedded use-cases
-  (e.g. when only one application uses the CAN bus) this loopback
-  functionality can be disabled (separately for each socket):
-
-    int loopback = 0; /* 0 = disabled, 1 = enabled (default) */
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback));
-
-  4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
-
-  When the local loopback is enabled, all the sent CAN frames are
-  looped back to the open CAN sockets that registered for the CAN
-  frames' CAN-ID on this given interface to meet the multi user
-  needs. The reception of the CAN frames on the same socket that was
-  sending the CAN frame is assumed to be unwanted and therefore
-  disabled by default. This default behaviour may be changed on
-  demand:
-
-    int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
-               &recv_own_msgs, sizeof(recv_own_msgs));
-
-  4.1.5 RAW socket option CAN_RAW_FD_FRAMES
-
-  CAN FD support in CAN_RAW sockets can be enabled with a new socket option
-  CAN_RAW_FD_FRAMES which is off by default. When the new socket option is
-  not supported by the CAN_RAW socket (e.g. on older kernels), switching the
-  CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT.
-
-  Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames
-  and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames
-  when reading from the socket.
-
-    CAN_RAW_FD_FRAMES enabled:  CAN_MTU and CANFD_MTU are allowed
-    CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default)
-
-  Example:
-    [ remember: CANFD_MTU == sizeof(struct canfd_frame) ]
-
-    struct canfd_frame cfd;
-
-    nbytes = read(s, &cfd, CANFD_MTU);
-
-    if (nbytes == CANFD_MTU) {
-            printf("got CAN FD frame with length %d\n", cfd.len);
-	    /* cfd.flags contains valid data */
-    } else if (nbytes == CAN_MTU) {
-            printf("got legacy CAN frame with length %d\n", cfd.len);
-	    /* cfd.flags is undefined */
-    } else {
-            fprintf(stderr, "read: invalid CAN(FD) frame\n");
-            return 1;
-    }
-
-    /* the content can be handled independently from the received MTU size */
-
-    printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len);
-    for (i = 0; i < cfd.len; i++)
-            printf("%02X ", cfd.data[i]);
-
-  When reading with size CANFD_MTU only returns CAN_MTU bytes that have
-  been received from the socket a legacy CAN frame has been read into the
-  provided CAN FD structure. Note that the canfd_frame.flags data field is
-  not specified in the struct can_frame and therefore it is only valid in
-  CANFD_MTU sized CAN FD frames.
-
-  Implementation hint for new CAN applications:
-
-  To build a CAN FD aware application use struct canfd_frame as basic CAN
-  data structure for CAN_RAW based applications. When the application is
-  executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES
-  socket option returns an error: No problem. You'll get legacy CAN frames
-  or CAN FD frames and can process them the same way.
-
-  When sending to CAN devices make sure that the device is capable to handle
-  CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU.
-  The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
-
-  4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS
-
-  The CAN_RAW socket can set multiple CAN identifier specific filters that
-  lead to multiple filters in the af_can.c filter processing. These filters
-  are indenpendent from each other which leads to logical OR'ed filters when
-  applied (see 4.1.1).
-
-  This socket option joines the given CAN filters in the way that only CAN
-  frames are passed to user space that matched *all* given CAN filters. The
-  semantic for the applied filters is therefore changed to a logical AND.
-
-  This is useful especially when the filterset is a combination of filters
-  where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or
-  CAN ID ranges from the incoming traffic.
-
-  4.1.7 RAW socket returned message flags
-
-  When using recvmsg() call, the msg->msg_flags may contain following flags:
-
-    MSG_DONTROUTE: set when the received frame was created on the local host.
-
-    MSG_CONFIRM: set when the frame was sent via the socket it is received on.
-      This flag can be interpreted as a 'transmission confirmation' when the
-      CAN driver supports the echo of frames on driver level, see 3.2 and 6.2.
-      In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set.
-
-  4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
-
-  The Broadcast Manager protocol provides a command based configuration
-  interface to filter and send (e.g. cyclic) CAN messages in kernel space.
-
-  Receive filters can be used to down sample frequent messages; detect events
-  such as message contents changes, packet length changes, and do time-out
-  monitoring of received messages.
-
-  Periodic transmission tasks of CAN frames or a sequence of CAN frames can be
-  created and modified at runtime; both the message content and the two
-  possible transmit intervals can be altered.
-
-  A BCM socket is not intended for sending individual CAN frames using the
-  struct can_frame as known from the CAN_RAW socket. Instead a special BCM
-  configuration message is defined. The basic BCM configuration message used
-  to communicate with the broadcast manager and the available operations are
-  defined in the linux/can/bcm.h include. The BCM message consists of a
-  message header with a command ('opcode') followed by zero or more CAN frames.
-  The broadcast manager sends responses to user space in the same form:
-
-    struct bcm_msg_head {
-            __u32 opcode;                   /* command */
-            __u32 flags;                    /* special flags */
-            __u32 count;                    /* run 'count' times with ival1 */
-            struct timeval ival1, ival2;    /* count and subsequent interval */
-            canid_t can_id;                 /* unique can_id for task */
-            __u32 nframes;                  /* number of can_frames following */
-            struct can_frame frames[0];
-    };
-
-  The aligned payload 'frames' uses the same basic CAN frame structure defined
-  at the beginning of section 4 and in the include/linux/can.h include. All
-  messages to the broadcast manager from user space have this structure.
-
-  Note a CAN_BCM socket must be connected instead of bound after socket
-  creation (example without error checking):
-
-    int s;
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-
-    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
-
-    strcpy(ifr.ifr_name, "can0");
-    ioctl(s, SIOCGIFINDEX, &ifr);
-
-    addr.can_family = AF_CAN;
-    addr.can_ifindex = ifr.ifr_ifindex;
-
-    connect(s, (struct sockaddr *)&addr, sizeof(addr));
-
-    (..)
-
-  The broadcast manager socket is able to handle any number of in flight
-  transmissions or receive filters concurrently. The different RX/TX jobs are
-  distinguished by the unique can_id in each BCM message. However additional
-  CAN_BCM sockets are recommended to communicate on multiple CAN interfaces.
-  When the broadcast manager socket is bound to 'any' CAN interface (=> the
-  interface index is set to zero) the configured receive filters apply to any
-  CAN interface unless the sendto() syscall is used to overrule the 'any' CAN
-  interface index. When using recvfrom() instead of read() to retrieve BCM
-  socket messages the originating CAN interface is provided in can_ifindex.
-
-  4.2.1 Broadcast Manager operations
-
-  The opcode defines the operation for the broadcast manager to carry out,
-  or details the broadcast managers response to several events, including
-  user requests.
-
-  Transmit Operations (user space to broadcast manager):
-
-    TX_SETUP:   Create (cyclic) transmission task.
-
-    TX_DELETE:  Remove (cyclic) transmission task, requires only can_id.
-
-    TX_READ:    Read properties of (cyclic) transmission task for can_id.
-
-    TX_SEND:    Send one CAN frame.
-
-  Transmit Responses (broadcast manager to user space):
-
-    TX_STATUS:  Reply to TX_READ request (transmission task configuration).
-
-    TX_EXPIRED: Notification when counter finishes sending at initial interval
-      'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP.
-
-  Receive Operations (user space to broadcast manager):
-
-    RX_SETUP:   Create RX content filter subscription.
-
-    RX_DELETE:  Remove RX content filter subscription, requires only can_id.
-
-    RX_READ:    Read properties of RX content filter subscription for can_id.
-
-  Receive Responses (broadcast manager to user space):
-
-    RX_STATUS:  Reply to RX_READ request (filter task configuration).
-
-    RX_TIMEOUT: Cyclic message is detected to be absent (timer ival1 expired).
-
-    RX_CHANGED: BCM message with updated CAN frame (detected content change).
-      Sent on first message received or on receipt of revised CAN messages.
-
-  4.2.2 Broadcast Manager message flags
-
-  When sending a message to the broadcast manager the 'flags' element may
-  contain the following flag definitions which influence the behaviour:
-
-    SETTIMER:           Set the values of ival1, ival2 and count
-
-    STARTTIMER:         Start the timer with the actual values of ival1, ival2
-      and count. Starting the timer leads simultaneously to emit a CAN frame.
-
-    TX_COUNTEVT:        Create the message TX_EXPIRED when count expires
-
-    TX_ANNOUNCE:        A change of data by the process is emitted immediately.
-
-    TX_CP_CAN_ID:       Copies the can_id from the message header to each
-      subsequent frame in frames. This is intended as usage simplification. For
-      TX tasks the unique can_id from the message header may differ from the
-      can_id(s) stored for transmission in the subsequent struct can_frame(s).
-
-    RX_FILTER_ID:       Filter by can_id alone, no frames required (nframes=0).
-
-    RX_CHECK_DLC:       A change of the DLC leads to an RX_CHANGED.
-
-    RX_NO_AUTOTIMER:    Prevent automatically starting the timeout monitor.
-
-    RX_ANNOUNCE_RESUME: If passed at RX_SETUP and a receive timeout occurred, a
-      RX_CHANGED message will be generated when the (cyclic) receive restarts.
-
-    TX_RESET_MULTI_IDX: Reset the index for the multiple frame transmission.
-
-    RX_RTR_FRAME:       Send reply for RTR-request (placed in op->frames[0]).
-
-  4.2.3 Broadcast Manager transmission timers
-
-  Periodic transmission configurations may use up to two interval timers.
-  In this case the BCM sends a number of messages ('count') at an interval
-  'ival1', then continuing to send at another given interval 'ival2'. When
-  only one timer is needed 'count' is set to zero and only 'ival2' is used.
-  When SET_TIMER and START_TIMER flag were set the timers are activated.
-  The timer values can be altered at runtime when only SET_TIMER is set.
-
-  4.2.4 Broadcast Manager message sequence transmission
-
-  Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic
-  TX task configuration. The number of CAN frames is provided in the 'nframes'
-  element of the BCM message head. The defined number of CAN frames are added
-  as array to the TX_SETUP BCM configuration message.
-
-    /* create a struct to set up a sequence of four CAN frames */
-    struct {
-            struct bcm_msg_head msg_head;
-            struct can_frame frame[4];
-    } mytxmsg;
-
-    (..)
-    mytxmsg.msg_head.nframes = 4;
-    (..)
-
-    write(s, &mytxmsg, sizeof(mytxmsg));
-
-  With every transmission the index in the array of CAN frames is increased
-  and set to zero at index overflow.
-
-  4.2.5 Broadcast Manager receive filter timers
-
-  The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP.
-  When the SET_TIMER flag is set the timers are enabled:
-
-  ival1: Send RX_TIMEOUT when a received message is not received again within
-    the given time. When START_TIMER is set at RX_SETUP the timeout detection
-    is activated directly - even without a former CAN frame reception.
-
-  ival2: Throttle the received message rate down to the value of ival2. This
-    is useful to reduce messages for the application when the signal inside the
-    CAN frame is stateless as state changes within the ival2 periode may get
-    lost.
-
-  4.2.6 Broadcast Manager multiplex message receive filter
-
-  To filter for content changes in multiplex message sequences an array of more
-  than one CAN frames can be passed in a RX_SETUP configuration message. The
-  data bytes of the first CAN frame contain the mask of relevant bits that
-  have to match in the subsequent CAN frames with the received CAN frame.
-  If one of the subsequent CAN frames is matching the bits in that frame data
-  mark the relevant content to be compared with the previous received content.
-  Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN
-  filters) can be added as array to the TX_SETUP BCM configuration message.
-
-    /* usually used to clear CAN frame data[] - beware of endian problems! */
-    #define U64_DATA(p) (*(unsigned long long*)(p)->data)
-
-    struct {
-            struct bcm_msg_head msg_head;
-            struct can_frame frame[5];
-    } msg;
-
-    msg.msg_head.opcode  = RX_SETUP;
-    msg.msg_head.can_id  = 0x42;
-    msg.msg_head.flags   = 0;
-    msg.msg_head.nframes = 5;
-    U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */
-    U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */
-    U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */
-    U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */
-    U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */
-
-    write(s, &msg, sizeof(msg));
-
-  4.2.7 Broadcast Manager CAN FD support
-
-  The programming API of the CAN_BCM depends on struct can_frame which is
-  given as array directly behind the bcm_msg_head structure. To follow this
-  schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head
-  flags indicates that the concatenated CAN frame structures behind the
-  bcm_msg_head are defined as struct canfd_frame.
-
-    struct {
-            struct bcm_msg_head msg_head;
-            struct canfd_frame frame[5];
-    } msg;
-
-    msg.msg_head.opcode  = RX_SETUP;
-    msg.msg_head.can_id  = 0x42;
-    msg.msg_head.flags   = CAN_FD_FRAME;
-    msg.msg_head.nframes = 5;
-    (..)
-
-  When using CAN FD frames for multiplex filtering the MUX mask is still
-  expected in the first 64 bit of the struct canfd_frame data section.
-
-  4.3 connected transport protocols (SOCK_SEQPACKET)
-  4.4 unconnected transport protocols (SOCK_DGRAM)
-
-
-5. SocketCAN core module
--------------------------
-
-  The SocketCAN core module implements the protocol family
-  PF_CAN. CAN protocol modules are loaded by the core module at
-  runtime. The core module provides an interface for CAN protocol
-  modules to subscribe needed CAN IDs (see chapter 3.1).
-
-  5.1 can.ko module params
-
-  - stats_timer: To calculate the SocketCAN core statistics
-    (e.g. current/maximum frames per second) this 1 second timer is
-    invoked at can.ko module start time by default. This timer can be
-    disabled by using stattimer=0 on the module commandline.
-
-  - debug: (removed since SocketCAN SVN r546)
-
-  5.2 procfs content
-
-  As described in chapter 3.1 the SocketCAN core uses several filter
-  lists to deliver received CAN frames to CAN protocol modules. These
-  receive lists, their filters and the count of filter matches can be
-  checked in the appropriate receive list. All entries contain the
-  device and a protocol module identifier:
-
-    foo@bar:~$ cat /proc/net/can/rcvlist_all
-
-    receive list 'rx_all':
-      (vcan3: no entry)
-      (vcan2: no entry)
-      (vcan1: no entry)
-      device   can_id   can_mask  function  userdata   matches  ident
-       vcan0     000    00000000  f88e6370  f6c6f400         0  raw
-      (any: no entry)
-
-  In this example an application requests any CAN traffic from vcan0.
-
-    rcvlist_all - list for unfiltered entries (no filter operations)
-    rcvlist_eff - list for single extended frame (EFF) entries
-    rcvlist_err - list for error message frames masks
-    rcvlist_fil - list for mask/value filters
-    rcvlist_inv - list for mask/value filters (inverse semantic)
-    rcvlist_sff - list for single standard frame (SFF) entries
-
-  Additional procfs files in /proc/net/can
-
-    stats       - SocketCAN core statistics (rx/tx frames, match ratios, ...)
-    reset_stats - manual statistic reset
-    version     - prints the SocketCAN core version and the ABI version
-
-  5.3 writing own CAN protocol modules
-
-  To implement a new protocol in the protocol family PF_CAN a new
-  protocol has to be defined in include/linux/can.h .
-  The prototypes and definitions to use the SocketCAN core can be
-  accessed by including include/linux/can/core.h .
-  In addition to functions that register the CAN protocol and the
-  CAN device notifier chain there are functions to subscribe CAN
-  frames received by CAN interfaces and to send CAN frames:
-
-    can_rx_register   - subscribe CAN frames from a specific interface
-    can_rx_unregister - unsubscribe CAN frames from a specific interface
-    can_send          - transmit a CAN frame (optional with local loopback)
-
-  For details see the kerneldoc documentation in net/can/af_can.c or
-  the source code of net/can/raw.c or net/can/bcm.c .
-
-6. CAN network drivers
-----------------------
-
-  Writing a CAN network device driver is much easier than writing a
-  CAN character device driver. Similar to other known network device
-  drivers you mainly have to deal with:
-
-  - TX: Put the CAN frame from the socket buffer to the CAN controller.
-  - RX: Put the CAN frame from the CAN controller to the socket buffer.
-
-  See e.g. at Documentation/networking/netdevices.txt . The differences
-  for writing CAN network device driver are described below:
-
-  6.1 general settings
-
-    dev->type  = ARPHRD_CAN; /* the netdevice hardware type */
-    dev->flags = IFF_NOARP;  /* CAN has no arp */
-
-    dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */
-
-    or alternative, when the controller supports CAN with flexible data rate:
-    dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */
-
-  The struct can_frame or struct canfd_frame is the payload of each socket
-  buffer (skbuff) in the protocol family PF_CAN.
-
-  6.2 local loopback of sent frames
-
-  As described in chapter 3.2 the CAN network device driver should
-  support a local loopback functionality similar to the local echo
-  e.g. of tty devices. In this case the driver flag IFF_ECHO has to be
-  set to prevent the PF_CAN core from locally echoing sent frames
-  (aka loopback) as fallback solution:
-
-    dev->flags = (IFF_NOARP | IFF_ECHO);
-
-  6.3 CAN controller hardware filters
-
-  To reduce the interrupt load on deep embedded systems some CAN
-  controllers support the filtering of CAN IDs or ranges of CAN IDs.
-  These hardware filter capabilities vary from controller to
-  controller and have to be identified as not feasible in a multi-user
-  networking approach. The use of the very controller specific
-  hardware filters could make sense in a very dedicated use-case, as a
-  filter on driver level would affect all users in the multi-user
-  system. The high efficient filter sets inside the PF_CAN core allow
-  to set different multiple filters for each socket separately.
-  Therefore the use of hardware filters goes to the category 'handmade
-  tuning on deep embedded systems'. The author is running a MPC603e
-  @133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
-  load without any problems ...
-
-  6.4 The virtual CAN driver (vcan)
-
-  Similar to the network loopback devices, vcan offers a virtual local
-  CAN interface. A full qualified address on CAN consists of
-
-  - a unique CAN Identifier (CAN ID)
-  - the CAN bus this CAN ID is transmitted on (e.g. can0)
-
-  so in common use cases more than one virtual CAN interface is needed.
-
-  The virtual CAN interfaces allow the transmission and reception of CAN
-  frames without real CAN controller hardware. Virtual CAN network
-  devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
-  When compiled as a module the virtual CAN driver module is called vcan.ko
-
-  Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
-  netlink interface to create vcan network devices. The creation and
-  removal of vcan network devices can be managed with the ip(8) tool:
-
-  - Create a virtual CAN network interface:
-       $ ip link add type vcan
-
-  - Create a virtual CAN network interface with a specific name 'vcan42':
-       $ ip link add dev vcan42 type vcan
-
-  - Remove a (virtual CAN) network interface 'vcan42':
-       $ ip link del vcan42
-
-  6.5 The CAN network device driver interface
-
-  The CAN network device driver interface provides a generic interface
-  to setup, configure and monitor CAN network devices. The user can then
-  configure the CAN device, like setting the bit-timing parameters, via
-  the netlink interface using the program "ip" from the "IPROUTE2"
-  utility suite. The following chapter describes briefly how to use it.
-  Furthermore, the interface uses a common data structure and exports a
-  set of common functions, which all real CAN network device drivers
-  should use. Please have a look to the SJA1000 or MSCAN driver to
-  understand how to use them. The name of the module is can-dev.ko.
-
-  6.5.1 Netlink interface to set/get devices properties
-
-  The CAN device must be configured via netlink interface. The supported
-  netlink message types are defined and briefly described in
-  "include/linux/can/netlink.h". CAN link support for the program "ip"
-  of the IPROUTE2 utility suite is available and it can be used as shown
-  below:
-
-  - Setting CAN device properties:
-
-    $ ip link set can0 type can help
-    Usage: ip link set DEVICE type can
-        [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
-        [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
-          phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
-
-        [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] |
-        [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1
-          dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ]
-
-        [ loopback { on | off } ]
-        [ listen-only { on | off } ]
-        [ triple-sampling { on | off } ]
-        [ one-shot { on | off } ]
-        [ berr-reporting { on | off } ]
-        [ fd { on | off } ]
-        [ fd-non-iso { on | off } ]
-        [ presume-ack { on | off } ]
-
-        [ restart-ms TIME-MS ]
-        [ restart ]
-
-        Where: BITRATE       := { 1..1000000 }
-               SAMPLE-POINT  := { 0.000..0.999 }
-               TQ            := { NUMBER }
-               PROP-SEG      := { 1..8 }
-               PHASE-SEG1    := { 1..8 }
-               PHASE-SEG2    := { 1..8 }
-               SJW           := { 1..4 }
-               RESTART-MS    := { 0 | NUMBER }
-
-  - Display CAN device details and statistics:
-
-    $ ip -details -statistics link show can0
-    2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
-      link/can
-      can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
-      bitrate 125000 sample_point 0.875
-      tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
-      sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-      clock 8000000
-      re-started bus-errors arbit-lost error-warn error-pass bus-off
-      41         17457      0          41         42         41
-      RX: bytes  packets  errors  dropped overrun mcast
-      140859     17608    17457   0       0       0
-      TX: bytes  packets  errors  dropped carrier collsns
-      861        112      0       41      0       0
-
-  More info to the above output:
-
-    "<TRIPLE-SAMPLING>"
-	Shows the list of selected CAN controller modes: LOOPBACK,
-	LISTEN-ONLY, or TRIPLE-SAMPLING.
-
-    "state ERROR-ACTIVE"
-	The current state of the CAN controller: "ERROR-ACTIVE",
-	"ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
-
-    "restart-ms 100"
-	Automatic restart delay time. If set to a non-zero value, a
-	restart of the CAN controller will be triggered automatically
-	in case of a bus-off condition after the specified delay time
-	in milliseconds. By default it's off.
-
-    "bitrate 125000 sample-point 0.875"
-	Shows the real bit-rate in bits/sec and the sample-point in the
-	range 0.000..0.999. If the calculation of bit-timing parameters
-	is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
-	bit-timing can be defined by setting the "bitrate" argument.
-	Optionally the "sample-point" can be specified. By default it's
-	0.000 assuming CIA-recommended sample-points.
-
-    "tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
-	Shows the time quanta in ns, propagation segment, phase buffer
-	segment 1 and 2 and the synchronisation jump width in units of
-	tq. They allow to define the CAN bit-timing in a hardware
-	independent format as proposed by the Bosch CAN 2.0 spec (see
-	chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
-
-    "sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-     clock 8000000"
-	Shows the bit-timing constants of the CAN controller, here the
-	"sja1000". The minimum and maximum values of the time segment 1
-	and 2, the synchronisation jump width in units of tq, the
-	bitrate pre-scaler and the CAN system clock frequency in Hz.
-	These constants could be used for user-defined (non-standard)
-	bit-timing calculation algorithms in user-space.
-
-    "re-started bus-errors arbit-lost error-warn error-pass bus-off"
-	Shows the number of restarts, bus and arbitration lost errors,
-	and the state changes to the error-warning, error-passive and
-	bus-off state. RX overrun errors are listed in the "overrun"
-	field of the standard network statistics.
-
-  6.5.2 Setting the CAN bit-timing
-
-  The CAN bit-timing parameters can always be defined in a hardware
-  independent format as proposed in the Bosch CAN 2.0 specification
-  specifying the arguments "tq", "prop_seg", "phase_seg1", "phase_seg2"
-  and "sjw":
-
-    $ ip link set canX type can tq 125 prop-seg 6 \
-				phase-seg1 7 phase-seg2 2 sjw 1
-
-  If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
-  recommended CAN bit-timing parameters will be calculated if the bit-
-  rate is specified with the argument "bitrate":
-
-    $ ip link set canX type can bitrate 125000
-
-  Note that this works fine for the most common CAN controllers with
-  standard bit-rates but may *fail* for exotic bit-rates or CAN system
-  clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
-  space and allows user-space tools to solely determine and set the
-  bit-timing parameters. The CAN controller specific bit-timing
-  constants can be used for that purpose. They are listed by the
-  following command:
-
-    $ ip -details link show can0
-    ...
-      sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-
-  6.5.3 Starting and stopping the CAN network device
-
-  A CAN network device is started or stopped as usual with the command
-  "ifconfig canX up/down" or "ip link set canX up/down". Be aware that
-  you *must* define proper bit-timing parameters for real CAN devices
-  before you can start it to avoid error-prone default settings:
-
-    $ ip link set canX up type can bitrate 125000
-
-  A device may enter the "bus-off" state if too many errors occurred on
-  the CAN bus. Then no more messages are received or sent. An automatic
-  bus-off recovery can be enabled by setting the "restart-ms" to a
-  non-zero value, e.g.:
-
-    $ ip link set canX type can restart-ms 100
-
-  Alternatively, the application may realize the "bus-off" condition
-  by monitoring CAN error message frames and do a restart when
-  appropriate with the command:
-
-    $ ip link set canX type can restart
-
-  Note that a restart will also create a CAN error message frame (see
-  also chapter 3.3).
-
-  6.6 CAN FD (flexible data rate) driver support
-
-  CAN FD capable CAN controllers support two different bitrates for the
-  arbitration phase and the payload phase of the CAN FD frame. Therefore a
-  second bit timing has to be specified in order to enable the CAN FD bitrate.
-
-  Additionally CAN FD capable CAN controllers support up to 64 bytes of
-  payload. The representation of this length in can_frame.can_dlc and
-  canfd_frame.len for userspace applications and inside the Linux network
-  layer is a plain value from 0 .. 64 instead of the CAN 'data length code'.
-  The data length code was a 1:1 mapping to the payload length in the legacy
-  CAN frames anyway. The payload length to the bus-relevant DLC mapping is
-  only performed inside the CAN drivers, preferably with the helper
-  functions can_dlc2len() and can_len2dlc().
-
-  The CAN netdevice driver capabilities can be distinguished by the network
-  devices maximum transfer unit (MTU):
-
-  MTU = 16 (CAN_MTU)   => sizeof(struct can_frame)   => 'legacy' CAN device
-  MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device
-
-  The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
-  N.B. CAN FD capable devices can also handle and send legacy CAN frames.
-
-  When configuring CAN FD capable CAN controllers an additional 'data' bitrate
-  has to be set. This bitrate for the data phase of the CAN FD frame has to be
-  at least the bitrate which was configured for the arbitration phase. This
-  second bitrate is specified analogue to the first bitrate but the bitrate
-  setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate,
-  dsample-point, dsjw or dtq and similar settings. When a data bitrate is set
-  within the configuration process the controller option "fd on" can be
-  specified to enable the CAN FD mode in the CAN controller. This controller
-  option also switches the device MTU to 72 (CANFD_MTU).
-
-  The first CAN FD specification presented as whitepaper at the International
-  CAN Conference 2012 needed to be improved for data integrity reasons.
-  Therefore two CAN FD implementations have to be distinguished today:
-
-  - ISO compliant:     The ISO 11898-1:2015 CAN FD implementation (default)
-  - non-ISO compliant: The CAN FD implementation following the 2012 whitepaper
-
-  Finally there are three types of CAN FD controllers:
-
-  1. ISO compliant (fixed)
-  2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c)
-  3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD)
-
-  The current ISO/non-ISO mode is announced by the CAN controller driver via
-  netlink and displayed by the 'ip' tool (controller option FD-NON-ISO).
-  The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for
-  switchable CAN FD controllers only.
-
-  Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate:
-
-    $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \
-                                   dbitrate 4000000 dsample-point 0.8 fd on
-    $ ip -details link show can0
-    5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \
-             mode DEFAULT group default qlen 10
-    link/can  promiscuity 0
-    can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
-          bitrate 500000 sample-point 0.750
-          tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1
-          pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \
-          brp-inc 1
-          dbitrate 4000000 dsample-point 0.800
-          dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1
-          pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \
-          dbrp-inc 1
-          clock 80000000
-
-  Example when 'fd-non-iso on' is added on this switchable CAN FD adapter:
-   can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
-
-  6.7 Supported CAN hardware
-
-  Please check the "Kconfig" file in "drivers/net/can" to get an actual
-  list of the support CAN hardware. On the SocketCAN project website
-  (see chapter 7) there might be further drivers available, also for
-  older kernel versions.
-
-7. SocketCAN resources
------------------------
-
-  The Linux CAN / SocketCAN project resources (project site / mailing list)
-  are referenced in the MAINTAINERS file in the Linux source tree.
-  Search for CAN NETWORK [LAYERS|DRIVERS].
-
-8. Credits
-----------
-
-  Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
-  Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
-  Jan Kizka (RT-SocketCAN core, Socket-API reconciliation)
-  Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews,
-                       CAN device driver interface, MSCAN driver)
-  Robert Schwebel (design reviews, PTXdist integration)
-  Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
-  Benedikt Spranger (reviews)
-  Thomas Gleixner (LKML reviews, coding style, posting hints)
-  Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
-  Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
-  Klaus Hitschler (PEAK driver integration)
-  Uwe Koppe (CAN netdevices with PF_PACKET approach)
-  Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
-  Pavel Pisa (Bit-timing calculation)
-  Sascha Hauer (SJA1000 platform driver)
-  Sebastian Haas (SJA1000 EMS PCI driver)
-  Markus Plessing (SJA1000 EMS PCI driver)
-  Per Dalen (SJA1000 Kvaser PCI driver)
-  Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 7d4b15977d61..90966c2692d8 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -7,6 +7,7 @@ Contents:
    :maxdepth: 2
 
    batman-adv
+   can
    kapi
    z8530book
    msg_zerocopy
diff --git a/MAINTAINERS b/MAINTAINERS
index 51e3a0d503dc..884ee9601707 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3206,7 +3206,7 @@ W:	https://github.com/linux-can
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git
 S:	Maintained
-F:	Documentation/networking/can.txt
+F:	Documentation/networking/can.rst
 F:	net/can/
 F:	include/linux/can/core.h
 F:	include/uapi/linux/can.h
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index cc94604b23e0..b1779566c5bb 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(can_change_state);
  * Local echo of CAN messages
  *
  * CAN network devices *should* support a local echo functionality
- * (see Documentation/networking/can.txt). To test the handling of CAN
+ * (see Documentation/networking/can.rst). To test the handling of CAN
  * interfaces that do not support the local echo both driver types are
  * implemented. In the case that the driver does not support the echo
  * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index a8cb33264ff1..c2b04f505e16 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -61,7 +61,7 @@ MODULE_ALIAS_RTNL_LINK(DRV_NAME);
 /*
  * CAN test feature:
  * Enable the echo on driver level for testing the CAN core echo modes.
- * See Documentation/networking/can.txt for details.
+ * See Documentation/networking/can.rst for details.
  */
 
 static bool echo; /* echo testing. Default: 0 (Off) */
diff --git a/net/can/Kconfig b/net/can/Kconfig
index a15c0e0d1fc7..a4399be54ff4 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -11,7 +11,7 @@ menuconfig CAN
 	  1991, mainly for automotive, but now widely used in marine
 	  (NMEA2000), industrial, and medical applications.
 	  More information on the CAN network protocol family PF_CAN
-	  is contained in <Documentation/networking/can.txt>.
+	  is contained in <Documentation/networking/can.rst>.
 
 	  If you want CAN support you should say Y here and also to the
 	  specific driver for your controller(s) below.
-- 
cgit v1.2.3


From 1edce99fa8d6119d89b3b6e92b6cfa56889b4f3d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 25 Jan 2018 16:55:07 -0800
Subject: net/ipv6: Move gateway validation into helper

Move existing code to validate the nexthop into a helper. A follow-on
patch adds support for nexthops marked with onlink, and this helper
keeps the complexity of ip6_route_info_create in check.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 85 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa4411c81e7e..bfd1a06a0700 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2469,6 +2469,54 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	return rt;
 }
 
+static int ip6_route_check_nh(struct net *net,
+			      struct fib6_config *cfg,
+			      struct net_device **_dev,
+			      struct inet6_dev **idev)
+{
+	const struct in6_addr *gw_addr = &cfg->fc_gateway;
+	struct net_device *dev = _dev ? *_dev : NULL;
+	struct rt6_info *grt = NULL;
+	int err = -EHOSTUNREACH;
+
+	if (cfg->fc_table) {
+		grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+		if (grt) {
+			if (grt->rt6i_flags & RTF_GATEWAY ||
+			    (dev && dev != grt->dst.dev)) {
+				ip6_rt_put(grt);
+				grt = NULL;
+			}
+		}
+	}
+
+	if (!grt)
+		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+
+	if (!grt)
+		goto out;
+
+	if (dev) {
+		if (dev != grt->dst.dev) {
+			ip6_rt_put(grt);
+			goto out;
+		}
+	} else {
+		*_dev = dev = grt->dst.dev;
+		*idev = grt->rt6i_idev;
+		dev_hold(dev);
+		in6_dev_hold(grt->rt6i_idev);
+	}
+
+	if (!(grt->rt6i_flags & RTF_GATEWAY))
+		err = 0;
+
+	ip6_rt_put(grt);
+
+out:
+	return err;
+}
+
 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 					      struct netlink_ext_ack *extack)
 {
@@ -2664,8 +2712,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		rt->rt6i_gateway = *gw_addr;
 
 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
-			struct rt6_info *grt = NULL;
-
 			/* IPv6 strictly inhibits using not link-local
 			   addresses as nexthop address.
 			   Otherwise, router will not able to send redirects.
@@ -2682,40 +2728,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 				goto out;
 			}
 
-			if (cfg->fc_table) {
-				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
-
-				if (grt) {
-					if (grt->rt6i_flags & RTF_GATEWAY ||
-					    (dev && dev != grt->dst.dev)) {
-						ip6_rt_put(grt);
-						grt = NULL;
-					}
-				}
-			}
-
-			if (!grt)
-				grt = rt6_lookup(net, gw_addr, NULL,
-						 cfg->fc_ifindex, 1);
-
-			err = -EHOSTUNREACH;
-			if (!grt)
-				goto out;
-			if (dev) {
-				if (dev != grt->dst.dev) {
-					ip6_rt_put(grt);
-					goto out;
-				}
-			} else {
-				dev = grt->dst.dev;
-				idev = grt->rt6i_idev;
-				dev_hold(dev);
-				in6_dev_hold(grt->rt6i_idev);
-			}
-			if (!(grt->rt6i_flags & RTF_GATEWAY))
-				err = 0;
-			ip6_rt_put(grt);
-
+			err = ip6_route_check_nh(net, cfg, &dev, &idev);
 			if (err)
 				goto out;
 		}
-- 
cgit v1.2.3


From f4797b33db813a394c4d4d2bc2fa38087fbe5545 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 25 Jan 2018 16:55:08 -0800
Subject: net/ipv6: Add flags and table id to ip6_nh_lookup_table

onlink verification needs to do a lookup in a potentially different
table than the table in fib6_config and without the RT6_LOOKUP_F_IFACE
flag. Change ip6_nh_lookup_table to take table id and flags as input
arguments. Both verifications want to ignore link state, so that flag
can stay in the lookup helper.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bfd1a06a0700..db2708658d8e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2440,7 +2440,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 
 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 					    struct fib6_config *cfg,
-					    const struct in6_addr *gw_addr)
+					    const struct in6_addr *gw_addr,
+					    u32 tbid, int flags)
 {
 	struct flowi6 fl6 = {
 		.flowi6_oif = cfg->fc_ifindex,
@@ -2449,15 +2450,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	};
 	struct fib6_table *table;
 	struct rt6_info *rt;
-	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
 
-	table = fib6_get_table(net, cfg->fc_table);
+	table = fib6_get_table(net, tbid);
 	if (!table)
 		return NULL;
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 
+	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
 
 	/* if table lookup failed, fall back to full lookup */
@@ -2480,7 +2481,10 @@ static int ip6_route_check_nh(struct net *net,
 	int err = -EHOSTUNREACH;
 
 	if (cfg->fc_table) {
-		grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+		int flags = RT6_LOOKUP_F_IFACE;
+
+		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
+					  cfg->fc_table, flags);
 		if (grt) {
 			if (grt->rt6i_flags & RTF_GATEWAY ||
 			    (dev && dev != grt->dst.dev)) {
-- 
cgit v1.2.3


From fc1e64e1092f62290d59151d16f9de0210e303c8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 25 Jan 2018 16:55:09 -0800
Subject: net/ipv6: Add support for onlink flag

Similar to IPv4, allow routes to be added with the RTNH_F_ONLINK flag.
The onlink option requires a gateway and a nexthop device. Any unicast
gateway is allowed (including IPv4 mapped addresses and unresolved
ones) as long as the gateway is not a local address and, if it
resolves, it matches the given device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index db2708658d8e..fe3966a9c999 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2470,6 +2470,31 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	return rt;
 }
 
+static int ip6_route_check_nh_onlink(struct net *net,
+				     struct fib6_config *cfg,
+				     struct net_device *dev,
+				     struct netlink_ext_ack *extack)
+{
+	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+	const struct in6_addr *gw_addr = &cfg->fc_gateway;
+	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
+	struct rt6_info *grt;
+	int err;
+
+	err = 0;
+	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
+	if (grt) {
+		if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			err = -EINVAL;
+		}
+
+		ip6_rt_put(grt);
+	}
+
+	return err;
+}
+
 static int ip6_route_check_nh(struct net *net,
 			      struct fib6_config *cfg,
 			      struct net_device **_dev,
@@ -2572,6 +2597,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;
 
+	if (cfg->fc_flags & RTNH_F_ONLINK) {
+		if (!dev) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop device required for onlink");
+			err = -ENODEV;
+			goto out;
+		}
+
+		if (!(dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			err = -ENETDOWN;
+			goto out;
+		}
+	}
+
 	err = -ENOBUFS;
 	if (cfg->fc_nlinfo.nlh &&
 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
@@ -2732,7 +2772,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 				goto out;
 			}
 
-			err = ip6_route_check_nh(net, cfg, &dev, &idev);
+			if (cfg->fc_flags & RTNH_F_ONLINK) {
+				err = ip6_route_check_nh_onlink(net, cfg, dev,
+								extack);
+			} else {
+				err = ip6_route_check_nh(net, cfg, &dev, &idev);
+			}
 			if (err)
 				goto out;
 		}
@@ -2774,6 +2819,7 @@ install_route:
 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
 	    !netif_carrier_ok(dev))
 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
 	rt->dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
@@ -3843,6 +3889,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_CLONED)
 		cfg->fc_flags |= RTF_CACHE;
 
+	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+
 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -4248,6 +4296,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
+	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
-- 
cgit v1.2.3


From da05bf2981f1035fc37d4253ccf0692faf19b8c0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 26 Jan 2018 09:28:46 +0100
Subject: net/smc: handle device, port, and QP error events

RoCE device changes cause an IB event, processed in the global event
handler for the RoCE device. Problems for a certain Queue Pair cause a QP
event, processed in the QP event handler for this QP.
Among those events are port errors and other fatal device errors. All
link groups using such a port or device must be terminated in those cases.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_ib.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 90f1a7f9085c..2a8957bd6d38 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -141,6 +141,17 @@ out:
 	return rc;
 }
 
+static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct smc_link_group *lgr, *l;
+
+	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
+			smc_lgr_terminate(lgr);
+	}
+}
+
 /* process context wrapper for might_sleep smc_ib_remember_port_attr */
 static void smc_ib_port_event_work(struct work_struct *work)
 {
@@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work)
 	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
 		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
 		clear_bit(port_idx, &smcibdev->port_event_mask);
+		if (!smc_ib_port_active(smcibdev, port_idx + 1))
+			smc_ib_port_terminate(smcibdev, port_idx + 1);
 	}
 }
 
@@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 
 	switch (ibevent->event) {
 	case IB_EVENT_PORT_ERR:
-		port_idx = ibevent->element.port_num - 1;
-		set_bit(port_idx, &smcibdev->port_event_mask);
-		schedule_work(&smcibdev->port_event_work);
-		/* fall through */
 	case IB_EVENT_DEVICE_FATAL:
-		/* tbd in follow-on patch:
-		 * abnormal close of corresponding connections
-		 */
-		break;
 	case IB_EVENT_PORT_ACTIVE:
 		port_idx = ibevent->element.port_num - 1;
 		set_bit(port_idx, &smcibdev->port_event_mask);
@@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 
 void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
 {
-	ib_dealloc_pd(lnk->roce_pd);
+	if (lnk->roce_pd)
+		ib_dealloc_pd(lnk->roce_pd);
 	lnk->roce_pd = NULL;
 }
 
@@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
 
 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 {
+	struct smc_ib_device *smcibdev =
+		(struct smc_ib_device *)ibevent->device;
+	u8 port_idx;
+
 	switch (ibevent->event) {
 	case IB_EVENT_DEVICE_FATAL:
 	case IB_EVENT_GID_CHANGE:
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_QP_ACCESS_ERR:
-		/* tbd in follow-on patch:
-		 * abnormal close of corresponding connections
-		 */
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
 		break;
 	default:
 		break;
@@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 
 void smc_ib_destroy_queue_pair(struct smc_link *lnk)
 {
-	ib_destroy_qp(lnk->roce_qp);
+	if (lnk->roce_qp)
+		ib_destroy_qp(lnk->roce_qp);
 	lnk->roce_qp = NULL;
 }
 
@@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
 {
 	if (!smcibdev->initialized)
 		return;
+	smcibdev->initialized = 0;
 	smc_wr_remove_dev(smcibdev);
 	ib_unregister_event_handler(&smcibdev->event_handler);
 	ib_destroy_cq(smcibdev->roce_cq_recv);
-- 
cgit v1.2.3


From 8dce2786a2905e5b0ce49263910b4a47af75b752 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 26 Jan 2018 09:28:47 +0100
Subject: net/smc: smc_poll improvements

Increase the socket refcount during poll wait.
Take the socket lock before checking socket state.
For a listening socket return a mask independent of state SMC_ACTIVE and
cover errors or closed state as well.
Get rid of the accept_q loop in smc_accept_poll().

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 74 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index cf0e11978b66..90c22a854f28 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1122,21 +1122,15 @@ out:
 
 static unsigned int smc_accept_poll(struct sock *parent)
 {
-	struct smc_sock *isk;
-	struct sock *sk;
-
-	lock_sock(parent);
-	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
-		sk = (struct sock *)isk;
+	struct smc_sock *isk = smc_sk(parent);
+	int mask = 0;
 
-		if (sk->sk_state == SMC_ACTIVE) {
-			release_sock(parent);
-			return POLLIN | POLLRDNORM;
-		}
-	}
-	release_sock(parent);
+	spin_lock(&isk->accept_q_lock);
+	if (!list_empty(&isk->accept_q))
+		mask = POLLIN | POLLRDNORM;
+	spin_unlock(&isk->accept_q_lock);
 
-	return 0;
+	return mask;
 }
 
 static unsigned int smc_poll(struct file *file, struct socket *sock,
@@ -1147,9 +1141,15 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 	struct smc_sock *smc;
 	int rc;
 
+	if (!sk)
+		return POLLNVAL;
+
 	smc = smc_sk(sock->sk);
+	sock_hold(sk);
+	lock_sock(sk);
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
+		release_sock(sk);
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		/* if non-blocking connect finished ... */
 		lock_sock(sk);
@@ -1161,37 +1161,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 				rc = smc_connect_rdma(smc);
 				if (rc < 0)
 					mask |= POLLERR;
-				else
-					/* success cases including fallback */
-					mask |= POLLOUT | POLLWRNORM;
+				/* success cases including fallback */
+				mask |= POLLOUT | POLLWRNORM;
 			}
 		}
-		release_sock(sk);
 	} else {
-		sock_poll_wait(file, sk_sleep(sk), wait);
-		if (sk->sk_state == SMC_LISTEN)
-			/* woken up by sk_data_ready in smc_listen_work() */
-			mask |= smc_accept_poll(sk);
+		if (sk->sk_state != SMC_CLOSED) {
+			release_sock(sk);
+			sock_poll_wait(file, sk_sleep(sk), wait);
+			lock_sock(sk);
+		}
 		if (sk->sk_err)
 			mask |= POLLERR;
-		if (atomic_read(&smc->conn.sndbuf_space) ||
-		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
-			mask |= POLLOUT | POLLWRNORM;
-		} else {
-			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		}
-		if (atomic_read(&smc->conn.bytes_to_rcv))
-			mask |= POLLIN | POLLRDNORM;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 		    (sk->sk_state == SMC_CLOSED))
 			mask |= POLLHUP;
-		if (sk->sk_shutdown & RCV_SHUTDOWN)
-			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
-		if (sk->sk_state == SMC_APPCLOSEWAIT1)
-			mask |= POLLIN;
+		if (sk->sk_state == SMC_LISTEN) {
+			/* woken up by sk_data_ready in smc_listen_work() */
+			mask = smc_accept_poll(sk);
+		} else {
+			if (atomic_read(&smc->conn.sndbuf_space) ||
+			    sk->sk_shutdown & SEND_SHUTDOWN) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {
+				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			}
+			if (atomic_read(&smc->conn.bytes_to_rcv))
+				mask |= POLLIN | POLLRDNORM;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+			if (sk->sk_state == SMC_APPCLOSEWAIT1)
+				mask |= POLLIN;
+		}
 
 	}
+	release_sock(sk);
+	sock_put(sk);
 
 	return mask;
 }
-- 
cgit v1.2.3


From 51f1de79ad8ed3555fd01ae8fd432691d397684b Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 26 Jan 2018 09:28:48 +0100
Subject: net/smc: replace sock_put worker by socket refcounting

Proper socket refcounting makes the sock_put worker obsolete.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c    | 65 +++++++++++++++++++++++++++++------------------------
 net/smc/smc.h       |  1 -
 net/smc/smc_cdc.c   | 20 +++++++++--------
 net/smc/smc_close.c | 63 ++++++++++++++++++++++++++++++---------------------
 net/smc/smc_close.h |  1 -
 net/smc/smc_core.c  |  6 ++---
 6 files changed, 88 insertions(+), 68 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 90c22a854f28..732a37ddbc21 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -115,7 +115,6 @@ static int smc_release(struct socket *sock)
 		goto out;
 
 	smc = smc_sk(sk);
-	sock_hold(sk);
 	if (sk->sk_state == SMC_LISTEN)
 		/* smc_close_non_accepted() is called and acquires
 		 * sock lock for child sockets again
@@ -124,10 +123,7 @@ static int smc_release(struct socket *sock)
 	else
 		lock_sock(sk);
 
-	if (smc->use_fallback) {
-		sk->sk_state = SMC_CLOSED;
-		sk->sk_state_change(sk);
-	} else {
+	if (!smc->use_fallback) {
 		rc = smc_close_active(smc);
 		sock_set_flag(sk, SOCK_DEAD);
 		sk->sk_shutdown |= SHUTDOWN_MASK;
@@ -136,20 +132,21 @@ static int smc_release(struct socket *sock)
 		sock_release(smc->clcsock);
 		smc->clcsock = NULL;
 	}
+	if (smc->use_fallback) {
+		sock_put(sk); /* passive closing */
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk);
+	}
 
 	/* detach socket */
 	sock_orphan(sk);
 	sock->sk = NULL;
-	if (smc->use_fallback) {
-		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
-	} else if (sk->sk_state == SMC_CLOSED) {
+	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
 		smc_conn_free(&smc->conn);
-		schedule_delayed_work(&smc->sock_put_work,
-				      SMC_CLOSE_SOCK_PUT_DELAY);
-	}
 	release_sock(sk);
 
-	sock_put(sk);
+	sk->sk_prot->unhash(sk);
+	sock_put(sk); /* final sock_put */
 out:
 	return rc;
 }
@@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
-	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 	sk->sk_prot->hash(sk);
 	sk_refcnt_debug_inc(sk);
 
@@ -399,6 +395,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	int rc = 0;
 	u8 ibport;
 
+	sock_hold(&smc->sk); /* sock put in passive closing */
+
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
 		/* peer has not signalled SMC-capability */
 		smc->use_fallback = true;
@@ -542,6 +540,8 @@ out_err_unlock:
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_conn_free(&smc->conn);
 out_err:
+	if (smc->sk.sk_state == SMC_INIT)
+		sock_put(&smc->sk); /* passive closing */
 	return rc;
 }
 
@@ -620,7 +620,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
 		new_sk->sk_prot->unhash(new_sk);
-		sock_put(new_sk);
+		sock_put(new_sk); /* final */
 		*new_smc = NULL;
 		goto out;
 	}
@@ -637,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 {
 	struct smc_sock *par = smc_sk(parent);
 
-	sock_hold(sk);
+	sock_hold(sk); /* sock_put in smc_accept_unlink () */
 	spin_lock(&par->accept_q_lock);
 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 	spin_unlock(&par->accept_q_lock);
@@ -653,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk)
 	list_del_init(&smc_sk(sk)->accept_q);
 	spin_unlock(&par->accept_q_lock);
 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
-	sock_put(sk);
+	sock_put(sk); /* sock_hold in smc_accept_enqueue */
 }
 
 /* remove a sock from the accept queue to bind it to a new socket created
@@ -671,7 +671,7 @@ struct sock *smc_accept_dequeue(struct sock *parent,
 		smc_accept_unlink(new_sk);
 		if (new_sk->sk_state == SMC_CLOSED) {
 			new_sk->sk_prot->unhash(new_sk);
-			sock_put(new_sk);
+			sock_put(new_sk); /* final */
 			continue;
 		}
 		if (new_sock)
@@ -686,14 +686,11 @@ void smc_close_non_accepted(struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
-	sock_hold(sk);
 	lock_sock(sk);
 	if (!sk->sk_lingertime)
 		/* wait for peer closing */
 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
-	if (smc->use_fallback) {
-		sk->sk_state = SMC_CLOSED;
-	} else {
+	if (!smc->use_fallback) {
 		smc_close_active(smc);
 		sock_set_flag(sk, SOCK_DEAD);
 		sk->sk_shutdown |= SHUTDOWN_MASK;
@@ -706,14 +703,15 @@ void smc_close_non_accepted(struct sock *sk)
 		sock_release(tcp);
 	}
 	if (smc->use_fallback) {
-		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
-	} else if (sk->sk_state == SMC_CLOSED) {
-		smc_conn_free(&smc->conn);
-		schedule_delayed_work(&smc->sock_put_work,
-				      SMC_CLOSE_SOCK_PUT_DELAY);
+		sock_put(sk); /* passive closing */
+		sk->sk_state = SMC_CLOSED;
+	} else {
+		if (sk->sk_state == SMC_CLOSED)
+			smc_conn_free(&smc->conn);
 	}
 	release_sock(sk);
-	sock_put(sk);
+	sk->sk_prot->unhash(sk);
+	sock_put(sk); /* final sock_put */
 }
 
 static int smc_serv_conf_first_link(struct smc_sock *smc)
@@ -937,6 +935,8 @@ out_err_unlock:
 		smc_lgr_forget(new_smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 out_err:
+	if (newsmcsk->sk_state == SMC_INIT)
+		sock_put(&new_smc->sk); /* passive closing */
 	newsmcsk->sk_state = SMC_CLOSED;
 	smc_conn_free(&new_smc->conn);
 	goto enqueue; /* queue new sock with sk_err set */
@@ -963,12 +963,15 @@ static void smc_tcp_listen_work(struct work_struct *work)
 		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
-		schedule_work(&new_smc->smc_listen_work);
+		sock_hold(&new_smc->sk); /* sock_put in passive closing */
+		if (!schedule_work(&new_smc->smc_listen_work))
+			sock_put(&new_smc->sk);
 	}
 
 out:
 	release_sock(lsk);
 	lsk->sk_data_ready(lsk); /* no more listening, wake accept */
+	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
 }
 
 static int smc_listen(struct socket *sock, int backlog)
@@ -1002,7 +1005,9 @@ static int smc_listen(struct socket *sock, int backlog)
 	sk->sk_ack_backlog = 0;
 	sk->sk_state = SMC_LISTEN;
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
-	schedule_work(&smc->tcp_listen_work);
+	sock_hold(sk); /* sock_hold in tcp_listen_worker */
+	if (!schedule_work(&smc->tcp_listen_work))
+		sock_put(sk);
 
 out:
 	release_sock(sk);
@@ -1019,6 +1024,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	int rc = 0;
 
 	lsmc = smc_sk(sk);
+	sock_hold(sk); /* sock_put below */
 	lock_sock(sk);
 
 	if (lsmc->sk.sk_state != SMC_LISTEN) {
@@ -1053,6 +1059,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 
 out:
 	release_sock(sk);
+	sock_put(sk); /* sock_hold above */
 	return rc;
 }
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 0bee9d16cf29..bfbe20234105 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -178,7 +178,6 @@ struct smc_sock {				/* smc sock container */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
-	struct delayed_work	sock_put_work;	/* final socket freeing */
 	bool			use_fallback;	/* fallback to tcp */
 	u8			wait_close_tx_prepared : 1;
 						/* shutdown wr or close
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 6e8f5fbe0f09..3cd086e5bd28 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -212,6 +212,14 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		smc->sk.sk_data_ready(&smc->sk);
 	}
 
+	/* piggy backed tx info */
+	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+	if (diff_cons && smc_tx_prepared_sends(conn)) {
+		smc_tx_sndbuf_nonempty(conn);
+		/* trigger socket release if connection closed */
+		smc_close_wake_tx_prepared(smc);
+	}
+
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
@@ -221,15 +229,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		if (smc->clcsock && smc->clcsock->sk)
 			smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
 		sock_set_flag(&smc->sk, SOCK_DONE);
-		schedule_work(&conn->close_work);
-	}
-
-	/* piggy backed tx info */
-	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
-	if (diff_cons && smc_tx_prepared_sends(conn)) {
-		smc_tx_sndbuf_nonempty(conn);
-		/* trigger socket release if connection closed */
-		smc_close_wake_tx_prepared(smc);
+		sock_hold(&smc->sk); /* sock_put in close_work */
+		if (!schedule_work(&conn->close_work))
+			sock_put(&smc->sk);
 	}
 }
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index babe05d385e7..4339852a8910 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -110,6 +110,7 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		release_sock(sk);
 		cancel_delayed_work_sync(&smc->conn.tx_work);
 		lock_sock(sk);
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
@@ -125,11 +126,13 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 		if (!txflags->peer_conn_closed) {
+			/* just SHUTDOWN_SEND done */
 			sk->sk_state = SMC_PEERABORTWAIT;
 			sock_release(smc->clcsock);
 		} else {
 			sk->sk_state = SMC_CLOSED;
 		}
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_PROCESSABORT:
 	case SMC_APPFINCLOSEWAIT:
@@ -138,6 +141,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERFINCLOSEWAIT:
+		sock_put(sk); /* passive closing */
+		break;
 	case SMC_PEERABORTWAIT:
 	case SMC_CLOSED:
 		break;
@@ -229,12 +234,14 @@ again:
 		rc = smc_close_final(conn);
 		if (rc)
 			break;
-		if (smc_cdc_rxed_any_close(conn))
+		if (smc_cdc_rxed_any_close(conn)) {
 			/* peer has closed the socket already */
 			sk->sk_state = SMC_CLOSED;
-		else
+			sock_put(sk); /* postponed passive closing */
+		} else {
 			/* peer has just issued a shutdown write */
 			sk->sk_state = SMC_PEERFINCLOSEWAIT;
+		}
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
@@ -272,27 +279,33 @@ static void smc_close_passive_abort_received(struct smc_sock *smc)
 	struct sock *sk = &smc->sk;
 
 	switch (sk->sk_state) {
+	case SMC_INIT:
 	case SMC_ACTIVE:
-	case SMC_APPFINCLOSEWAIT:
 	case SMC_APPCLOSEWAIT1:
-	case SMC_APPCLOSEWAIT2:
+		sk->sk_state = SMC_PROCESSABORT;
+		sock_put(sk); /* passive closing */
+		break;
+	case SMC_APPFINCLOSEWAIT:
 		sk->sk_state = SMC_PROCESSABORT;
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 		if (txflags->peer_done_writing &&
-		    !smc_close_sent_any_close(&smc->conn)) {
+		    !smc_close_sent_any_close(&smc->conn))
 			/* just shutdown, but not yet closed locally */
 			sk->sk_state = SMC_PROCESSABORT;
-		} else {
+		else
 			sk->sk_state = SMC_CLOSED;
-		}
+		sock_put(sk); /* passive closing */
 		break;
+	case SMC_APPCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
+		sk->sk_state = SMC_CLOSED;
+		sock_put(sk); /* passive closing */
+		break;
 	case SMC_PEERABORTWAIT:
 		sk->sk_state = SMC_CLOSED;
 		break;
-	case SMC_INIT:
 	case SMC_PROCESSABORT:
 	/* nothing to do, add tracing in future patch */
 		break;
@@ -336,13 +349,18 @@ static void smc_close_passive_work(struct work_struct *work)
 	case SMC_INIT:
 		if (atomic_read(&conn->bytes_to_rcv) ||
 		    (rxflags->peer_done_writing &&
-		     !smc_cdc_rxed_any_close(conn)))
+		     !smc_cdc_rxed_any_close(conn))) {
 			sk->sk_state = SMC_APPCLOSEWAIT1;
-		else
+		} else {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk); /* passive closing */
+		}
 		break;
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_APPCLOSEWAIT1;
+		/* postpone sock_put() for passive closing to cover
+		 * received SEND_SHUTDOWN as well
+		 */
 		break;
 	case SMC_PEERCLOSEWAIT1:
 		if (rxflags->peer_done_writing)
@@ -360,13 +378,20 @@ static void smc_close_passive_work(struct work_struct *work)
 			/* just shutdown, but not yet closed locally */
 			sk->sk_state = SMC_APPFINCLOSEWAIT;
 		}
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_PEERFINCLOSEWAIT:
-		if (smc_cdc_rxed_any_close(conn))
+		if (smc_cdc_rxed_any_close(conn)) {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk); /* passive closing */
+		}
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
+		/* postpone sock_put() for passive closing to cover
+		 * received SEND_SHUTDOWN as well
+		 */
+		break;
 	case SMC_APPFINCLOSEWAIT:
 	case SMC_PEERABORTWAIT:
 	case SMC_PROCESSABORT:
@@ -382,23 +407,11 @@ wakeup:
 	if (old_state != sk->sk_state) {
 		sk->sk_state_change(sk);
 		if ((sk->sk_state == SMC_CLOSED) &&
-		    (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
+		    (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket))
 			smc_conn_free(conn);
-			schedule_delayed_work(&smc->sock_put_work,
-					      SMC_CLOSE_SOCK_PUT_DELAY);
-		}
 	}
 	release_sock(sk);
-}
-
-void smc_close_sock_put_work(struct work_struct *work)
-{
-	struct smc_sock *smc = container_of(to_delayed_work(work),
-					    struct smc_sock,
-					    sock_put_work);
-
-	smc->sk.sk_prot->unhash(&smc->sk);
-	sock_put(&smc->sk);
+	sock_put(sk); /* sock_hold done by schedulers of close_work */
 }
 
 int smc_close_shutdown_write(struct smc_sock *smc)
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index 8c498885d758..19eb6a211c23 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -21,7 +21,6 @@
 
 void smc_close_wake_tx_prepared(struct smc_sock *smc);
 int smc_close_active(struct smc_sock *smc);
-void smc_close_sock_put_work(struct work_struct *work);
 int smc_close_shutdown_write(struct smc_sock *smc);
 void smc_close_init(struct smc_sock *smc);
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ed5b46d1fe41..2424c7100aaf 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -328,13 +328,13 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	while (node) {
 		conn = rb_entry(node, struct smc_connection, alert_node);
 		smc = container_of(conn, struct smc_sock, conn);
-		sock_hold(&smc->sk);
+		sock_hold(&smc->sk); /* sock_put in close work */
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
 		__smc_lgr_unregister_conn(conn);
 		write_unlock_bh(&lgr->conns_lock);
-		schedule_work(&conn->close_work);
+		if (!schedule_work(&conn->close_work))
+			sock_put(&smc->sk);
 		write_lock_bh(&lgr->conns_lock);
-		sock_put(&smc->sk);
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
-- 
cgit v1.2.3


From 127f497058236e5f07672e11382232f80cb7e8c4 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 26 Jan 2018 09:28:49 +0100
Subject: net/smc: release clcsock from tcp_listen_worker

Closing a listen socket may hit the warning
WARN_ON(sock_owned_by_user(sk)) of tcp_close(), if the wake up of
the smc_tcp_listen_worker has not yet finished.
This patch introduces smc_close_wait_listen_clcsock() making sure
the listening internal clcsock has been closed in smc_tcp_listen_work(),
before the listening external SMC socket finishes closing.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c    | 13 ++++++++++++-
 net/smc/smc_close.c | 33 ++++++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 732a37ddbc21..267e68379110 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -670,6 +670,10 @@ struct sock *smc_accept_dequeue(struct sock *parent,
 
 		smc_accept_unlink(new_sk);
 		if (new_sk->sk_state == SMC_CLOSED) {
+			if (isk->clcsock) {
+				sock_release(isk->clcsock);
+				isk->clcsock = NULL;
+			}
 			new_sk->sk_prot->unhash(new_sk);
 			sock_put(new_sk); /* final */
 			continue;
@@ -969,8 +973,15 @@ static void smc_tcp_listen_work(struct work_struct *work)
 	}
 
 out:
+	if (lsmc->clcsock) {
+		sock_release(lsmc->clcsock);
+		lsmc->clcsock = NULL;
+	}
 	release_sock(lsk);
-	lsk->sk_data_ready(lsk); /* no more listening, wake accept */
+	/* no more listening, wake up smc_close_wait_listen_clcsock and
+	 * accept
+	 */
+	lsk->sk_state_change(lsk);
 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
 }
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 4339852a8910..e339c0186dcf 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -19,6 +19,8 @@
 #include "smc_cdc.h"
 #include "smc_close.h"
 
+#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME	(5 * HZ)
+
 static void smc_close_cleanup_listen(struct sock *parent)
 {
 	struct sock *sk;
@@ -28,6 +30,27 @@ static void smc_close_cleanup_listen(struct sock *parent)
 		smc_close_non_accepted(sk);
 }
 
+static void smc_close_wait_listen_clcsock(struct smc_sock *smc)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+	signed long timeout;
+
+	timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME;
+	add_wait_queue(sk_sleep(sk), &wait);
+	do {
+		release_sock(sk);
+		if (smc->clcsock)
+			timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE,
+					     timeout);
+		sched_annotate_sleep();
+		lock_sock(sk);
+		if (!smc->clcsock)
+			break;
+	} while (timeout);
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+
 /* wait for sndbuf data being transmitted */
 static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
 {
@@ -114,7 +137,6 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
-		sock_release(smc->clcsock);
 		if (!smc_cdc_rxed_any_close(&smc->conn))
 			sk->sk_state = SMC_PEERABORTWAIT;
 		else
@@ -128,7 +150,6 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		if (!txflags->peer_conn_closed) {
 			/* just SHUTDOWN_SEND done */
 			sk->sk_state = SMC_PEERABORTWAIT;
-			sock_release(smc->clcsock);
 		} else {
 			sk->sk_state = SMC_CLOSED;
 		}
@@ -136,8 +157,6 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	case SMC_PROCESSABORT:
 	case SMC_APPFINCLOSEWAIT:
-		if (!txflags->peer_conn_closed)
-			sock_release(smc->clcsock);
 		sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERFINCLOSEWAIT:
@@ -177,8 +196,6 @@ again:
 	switch (sk->sk_state) {
 	case SMC_INIT:
 		sk->sk_state = SMC_CLOSED;
-		if (smc->smc_listen_work.func)
-			cancel_work_sync(&smc->smc_listen_work);
 		break;
 	case SMC_LISTEN:
 		sk->sk_state = SMC_CLOSED;
@@ -187,11 +204,9 @@ again:
 			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
 			/* wake up kernel_accept of smc_tcp_listen_worker */
 			smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+			smc_close_wait_listen_clcsock(smc);
 		}
-		release_sock(sk);
 		smc_close_cleanup_listen(sk);
-		cancel_work_sync(&smc->smc_listen_work);
-		lock_sock(sk);
 		break;
 	case SMC_ACTIVE:
 		smc_close_stream_wait(smc, timeout);
-- 
cgit v1.2.3


From a8fbf8e7ecda38d70a7983246fbadf04063f3843 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 26 Jan 2018 09:28:50 +0100
Subject: net/smc: return booleans instead of integers

Return statements in functions returning bool should use
true/false instead of 1/0.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc.h b/net/smc/smc.h
index bfbe20234105..9518986c97b1 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -252,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed)
 static inline bool using_ipsec(struct smc_sock *smc)
 {
 	return (smc->clcsock->sk->sk_policy[0] ||
-		smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+		smc->clcsock->sk->sk_policy[1]) ? true : false;
 }
 #else
 static inline bool using_ipsec(struct smc_sock *smc)
 {
-	return 0;
+	return false;
 }
 #endif
 
-- 
cgit v1.2.3


From 1d621674d923790d09cab9e2c7da7da6446a6257 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 26 Jan 2018 23:33:36 +0100
Subject: bpf: xor of a/x in cbpf can be done in 32 bit alu

Very minor optimization; saves 1 byte per program in x86_64
JIT in cBPF prologue.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index fe2c7937351f..60d8c8712652 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -401,8 +401,8 @@ do_pass:
 		/* Classic BPF expects A and X to be reset first. These need
 		 * to be guaranteed to be the first two instructions.
 		 */
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 
 		/* All programs must keep CTX in callee saved BPF_REG_CTX.
 		 * In eBPF case it's done by the compiler, here we need to
-- 
cgit v1.2.3


From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 26 Jan 2018 23:33:39 +0100
Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception

One of the ugly leftovers from the early eBPF days is that div/mod
operations based on registers have a hard-coded src_reg == 0 test
in the interpreter as well as in JIT code generators that would
return from the BPF program with exit code 0. This was basically
adopted from cBPF interpreter for historical reasons.

There are multiple reasons why this is very suboptimal and prone
to bugs. To name one: the return code mapping for such abnormal
program exit of 0 does not always match with a suitable program
type's exit code mapping. For example, '0' in tc means action 'ok'
where the packet gets passed further up the stack, which is just
undesirable for such cases (e.g. when implementing policy) and
also does not match with other program types.

While trying to work out an exception handling scheme, I also
noticed that programs crafted like the following will currently
pass the verifier:

  0: (bf) r6 = r1
  1: (85) call pc+8
  caller:
   R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  callee:
   frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1
  10: (b4) (u32) r2 = (u32) 0
  11: (b4) (u32) r3 = (u32) 1
  12: (3c) (u32) r3 /= (u32) r2
  13: (61) r0 = *(u32 *)(r1 +76)
  14: (95) exit
  returning from callee:
   frame1: R0_w=pkt(id=0,off=0,r=0,imm=0)
           R1=ctx(id=0,off=0,imm=0) R2_w=inv0
           R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
           R10=fp0,call_1
  to caller at 2:
   R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1

  from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0)
                R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  2: (bf) r1 = r6
  3: (61) r1 = *(u32 *)(r1 +80)
  4: (bf) r2 = r0
  5: (07) r2 += 8
  6: (2d) if r2 > r1 goto pc+1
   R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0)
   R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1
  7: (71) r0 = *(u8 *)(r0 +0)
  8: (b7) r0 = 1
  9: (95) exit

  from 6 to 8: safe
  processed 16 insns (limit 131072), stack depth 0+0

Basically what happens is that in the subprog we make use of a
div/mod by 0 exception and in the 'normal' subprog's exit path
we just return skb->data back to the main prog. This has the
implication that the verifier thinks we always get a pkt pointer
in R0 while we still have the implicit 'return 0' from the div
as an alternative unconditional return path earlier. Thus, R0
then contains 0, meaning back in the parent prog we get the
address range of [0x0, skb->data_end] as read and writeable.
Similar can be crafted with other pointer register types.

Since i) BPF_ABS/IND is not allowed in programs that contain
BPF to BPF calls (and generally it's also discouraged to use in
native eBPF context), ii) unknown opcodes don't return zero
anymore, iii) we don't return an exception code in dead branches,
the only last missing case affected and to fix is the div/mod
handling.

What we would really need is some infrastructure to propagate
exceptions all the way to the original prog unwinding the
current stack and returning that code to the caller of the
BPF program. In user space such exception handling for similar
runtimes is typically implemented with setjmp(3) and longjmp(3)
as one possibility which is not available in the kernel,
though (kgdb used to implement it in kernel long time ago). I
implemented a PoC exception handling mechanism into the BPF
interpreter with porting setjmp()/longjmp() into x86_64 and
adding a new internal BPF_ABRT opcode that can use a program
specific exception code for all exception cases we have (e.g.
div/mod by 0, unknown opcodes, etc). While this seems to work
in the constrained BPF environment (meaning, here, we don't
need to deal with state e.g. from memory allocations that we
would need to undo before going into exception state), it still
has various drawbacks: i) we would need to implement the
setjmp()/longjmp() for every arch supported in the kernel and
for x86_64, arm64, sparc64 JITs currently supporting calls,
ii) it has unconditional additional cost on main program
entry to store CPU register state in initial setjmp() call,
and we would need some way to pass the jmp_buf down into
___bpf_prog_run() for main prog and all subprogs, but also
storing on stack is not really nice (other option would be
per-cpu storage for this, but it also has the drawback that
we need to disable preemption for every BPF program types).
All in all this approach would add a lot of complexity.

Another poor-man's solution would be to have some sort of
additional shared register or scratch buffer to hold state
for exceptions, and test that after every call return to
chain returns and pass R0 all the way down to BPF prog caller.
This is also problematic in various ways: i) an additional
register doesn't map well into JITs, and some other scratch
space could only be on per-cpu storage, which, again has the
side-effect that this only works when we disable preemption,
or somewhere in the input context which is not available
everywhere either, and ii) this adds significant runtime
overhead by putting conditionals after each and every call,
as well as implementation complexity.

Yet another option is to teach verifier that div/mod can
return an integer, which however is also complex to implement
as verifier would need to walk such fake 'mov r0,<code>; exit;'
sequence and there would still be no guarantee for having
propagation of this further down to the BPF caller as proper
exception code. For parent prog, it is also not distinguishable
from a normal return of a constant scalar value.

The approach taken here is a completely different one with
little complexity and no additional overhead involved in
that we make use of the fact that a div/mod by 0 is undefined
behavior. Instead of bailing out, we adapt the same behavior
as on some major archs like ARMv8 [0] into eBPF as well:
X div 0 results in 0, and X mod 0 results in X. aarch64 and
aarch32 ISA do not generate any traps or otherwise aborts
of program execution for unsigned divides. I verified this
also with a test program compiled by gcc and clang, and the
behavior matches with the spec. Going forward we adapt the
eBPF verifier to emit such rewrites once div/mod by register
was seen. cBPF is not touched and will keep existing 'return 0'
semantics. Given the options, it seems the most suitable from
all of them, also since major archs have similar schemes in
place. Given this is all in the realm of undefined behavior,
we still have the option to adapt if deemed necessary and
this way we would also have the option of more flexibility
from LLVM code generation side (which is then fully visible
to verifier). Thus, this patch i) fixes the panic seen in
above program and ii) doesn't bypass the verifier observations.

  [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b]
      http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf
      1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV)
         "A division by zero results in a zero being written to
          the destination register, without any indication that
          the division by zero occurred."
      2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV)
         "For the SDIV and UDIV instructions, division by zero
          always returns a zero result."

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c     |  8 --------
 kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++--------
 net/core/filter.c     |  9 ++++++++-
 3 files changed, 38 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8301de2d1f96..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -999,14 +999,10 @@ select_insn:
 		(*(s64 *) &DST) >>= IMM;
 		CONT;
 	ALU64_MOD_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		div64_u64_rem(DST, SRC, &tmp);
 		DST = tmp;
 		CONT;
 	ALU_MOD_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
@@ -1019,13 +1015,9 @@ select_insn:
 		DST = do_div(tmp, (u32) IMM);
 		CONT;
 	ALU64_DIV_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		do_div(tmp, (u32) SRC);
 		DST = (u32) tmp;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0c5269415090..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5400,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	int i, cnt, delta = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-			/* due to JIT bugs clear upper 32-bits of src register
-			 * before div/mod operation
-			 */
-			insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
-			insn_buf[1] = *insn;
-			cnt = 2;
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+			struct bpf_insn mask_and_div[] = {
+				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+				/* Rx div 0 -> 0 */
+				BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
+				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+				*insn,
+			};
+			struct bpf_insn mask_and_mod[] = {
+				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+				/* Rx mod 0 -> Rx */
+				BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+				*insn,
+			};
+			struct bpf_insn *patchlet;
+
+			if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+			    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+				patchlet = mask_and_div + (is64 ? 1 : 0);
+				cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
+			} else {
+				patchlet = mask_and_mod + (is64 ? 1 : 0);
+				cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
+			}
+
+			new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
 			if (!new_prog)
 				return -ENOMEM;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 60d8c8712652..08ab4c65a998 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,8 +459,15 @@ do_pass:
 				break;
 
 			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
-			    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
+			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
 				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+				/* Error with exception code on div/mod by 0.
+				 * For cBPF programs, this was always return 0.
+				 */
+				*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+				*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+				*insn++ = BPF_EXIT_INSN();
+			}
 
 			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 			break;
-- 
cgit v1.2.3


From 7c4f63ba824302492985553018881455982241d6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:32 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID in do_setlink()

RTM_{NEW,SET}LINK already allow operations on other network namespaces
by identifying the target network namespace through IFLA_NET_NS_{FD,PID}
properties. This is done by looking for the corresponding properties in
do_setlink(). Extend do_setlink() to also look for the IFLA_IF_NETNSID
property. This introduces no functional changes since all callers of
do_setlink() currently block IFLA_IF_NETNSID by reporting an error before
they reach do_setlink().

This introduces the helpers:

static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct
                                               nlattr *tb[])

static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
                                             struct net *src_net,
					     struct nlattr *tb[], int cap)

to simplify permission checks and target network namespace retrieval for
RTM_* requests that already support IFLA_NET_NS_{FD,PID} but get extended
to IFLA_IF_NETNSID. To preserve backwards compatibility the helpers look
for IFLA_NET_NS_{FD,PID} properties first before checking for
IFLA_IF_NETNSID.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 97874daa1336..f7e99c25dfe4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1902,6 +1902,49 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 }
 EXPORT_SYMBOL(rtnl_link_get_net);
 
+/* Figure out which network namespace we are talking about by
+ * examining the link attributes in the following order:
+ *
+ * 1. IFLA_NET_NS_PID
+ * 2. IFLA_NET_NS_FD
+ * 3. IFLA_IF_NETNSID
+ */
+static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
+					       struct nlattr *tb[])
+{
+	struct net *net;
+
+	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
+		return rtnl_link_get_net(src_net, tb);
+
+	if (!tb[IFLA_IF_NETNSID])
+		return get_net(src_net);
+
+	net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+	if (!net)
+		return ERR_PTR(-EINVAL);
+
+	return net;
+}
+
+static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
+					     struct net *src_net,
+					     struct nlattr *tb[], int cap)
+{
+	struct net *net;
+
+	net = rtnl_link_get_net_by_nlattr(src_net, tb);
+	if (IS_ERR(net))
+		return net;
+
+	if (!netlink_ns_capable(skb, net->user_ns, cap)) {
+		put_net(net);
+		return ERR_PTR(-EPERM);
+	}
+
+	return net;
+}
+
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 {
 	if (dev) {
@@ -2164,17 +2207,14 @@ static int do_setlink(const struct sk_buff *skb,
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
-	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
-		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
+	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
+		struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
+							    tb, CAP_NET_ADMIN);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
 			goto errout;
 		}
-		if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
-			put_net(net);
-			err = -EPERM;
-			goto errout;
-		}
+
 		err = dev_change_net_namespace(dev, net, ifname);
 		put_net(net);
 		if (err)
-- 
cgit v1.2.3


From c310bfcb6e1be993629c5747accf8e1c65fbb255 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:33 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_SETLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCES is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_SETLINK with an IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_SETLINK. Userspace should then fall back to other means.

  To retain backwards compatibility the kernel will first check whether an
  IFLA_NET_NS_PID or IFLA_NET_NS_FD property has been passed. If either
  one is found it will be used to identify the target network namespace.
  This implies that users who do not care whether their running kernel
  supports IFLA_IF_NETNSID with RTM_SETLINK can pass both
  IFLA_NET_NS_{FD,PID} and IFLA_IF_NETNSID referring to the same network
  namespace.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f7e99c25dfe4..d0c02943c05a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2555,9 +2555,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
-	if (tb[IFLA_IF_NETNSID])
-		return -EOPNOTSUPP;
-
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
-- 
cgit v1.2.3


From b61ad68a9fe85d29d5363eb36860164a049723cf Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:34 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_DELLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCES is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_DELLINK with an IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_DELLINK. Userspace should then fall back to other means.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d0c02943c05a..f111557958bb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2639,36 +2639,53 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct net_device *dev;
+	struct net *tgt_net = net;
+	struct net_device *dev = NULL;
 	struct ifinfomsg *ifm;
 	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	int err;
+	int netnsid = -1;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
 	if (err < 0)
 		return err;
 
-	if (tb[IFLA_IF_NETNSID])
-		return -EOPNOTSUPP;
-
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
+	if (tb[IFLA_IF_NETNSID]) {
+		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		if (IS_ERR(tgt_net))
+			return PTR_ERR(tgt_net);
+	}
+
+	err = -EINVAL;
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
-		dev = __dev_get_by_index(net, ifm->ifi_index);
+		dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
 	else if (tb[IFLA_IFNAME])
-		dev = __dev_get_by_name(net, ifname);
+		dev = __dev_get_by_name(tgt_net, ifname);
 	else if (tb[IFLA_GROUP])
-		return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP]));
+		err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
 	else
-		return -EINVAL;
+		goto out;
 
-	if (!dev)
-		return -ENODEV;
+	if (!dev) {
+		if (tb[IFLA_IFNAME] || ifm->ifi_index > 0)
+			err = -ENODEV;
 
-	return rtnl_delete_link(dev);
+		goto out;
+	}
+
+	err = rtnl_delete_link(dev);
+
+out:
+	if (netnsid >= 0)
+		put_net(tgt_net);
+
+	return err;
 }
 
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
-- 
cgit v1.2.3


From 7ece54a60ee2ba7a386308cae73c790bd580589c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Jan 2018 23:15:27 -0800
Subject: ipv6: Fix SO_REUSEPORT UDP socket with implicit sk_ipv6only

If a sk_v6_rcv_saddr is !IPV6_ADDR_ANY and !IPV6_ADDR_MAPPED, it
implicitly implies it is an ipv6only socket.  However, in inet6_bind(),
this addr_type checking and setting sk->sk_ipv6only to 1 are only done
after sk->sk_prot->get_port(sk, snum) has been completed successfully.

This inconsistency between sk_v6_rcv_saddr and sk_ipv6only confuses
the 'get_port()'.

In particular, when binding SO_REUSEPORT UDP sockets,
udp_reuseport_add_sock(sk,...) is called.  udp_reuseport_add_sock()
checks "ipv6_only_sock(sk2) == ipv6_only_sock(sk)" before adding sk to
sk2->sk_reuseport_cb.  In this case, ipv6_only_sock(sk2) could be
1 while ipv6_only_sock(sk) is still 0 here.  The end result is,
reuseport_alloc(sk) is called instead of adding sk to the existing
sk2->sk_reuseport_cb.

It can be reproduced by binding two SO_REUSEPORT UDP sockets on an
IPv6 address (!ANY and !MAPPED).  Only one of the sockets will
receive packets.

The fix is to set the implicit sk_ipv6only before calling get_port().
The original sk_ipv6only has to be saved such that it can be restored
in case get_port() failed.  The situation is similar to the
inet_reset_saddr(sk) after get_port() has failed.

Thanks to Calvin Owens <calvinowens@fb.com> who created an easy
reproduction which leads to a fix.

Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c9441ca45399..416917719a6f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -284,6 +284,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct net *net = sock_net(sk);
 	__be32 v4addr = 0;
 	unsigned short snum;
+	bool saved_ipv6only;
 	int addr_type = 0;
 	int err = 0;
 
@@ -389,19 +390,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (!(addr_type & IPV6_ADDR_MULTICAST))
 		np->saddr = addr->sin6_addr;
 
+	saved_ipv6only = sk->sk_ipv6only;
+	if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED)
+		sk->sk_ipv6only = 1;
+
 	/* Make sure we are allowed to bind here. */
 	if ((snum || !inet->bind_address_no_port) &&
 	    sk->sk_prot->get_port(sk, snum)) {
+		sk->sk_ipv6only = saved_ipv6only;
 		inet_reset_saddr(sk);
 		err = -EADDRINUSE;
 		goto out;
 	}
 
-	if (addr_type != IPV6_ADDR_ANY) {
+	if (addr_type != IPV6_ADDR_ANY)
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
-		if (addr_type != IPV6_ADDR_MAPPED)
-			sk->sk_ipv6only = 1;
-	}
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
 	inet->inet_sport = htons(inet->inet_num);
-- 
cgit v1.2.3


From c36ac8e2307334c83e8bf81ed361f0e4959d995f Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 25 Jan 2018 15:01:38 +0100
Subject: dev: always advertise the new nsid when the netns iface changes

The user should be able to follow any interface that moves to another
netns.  There is no reason to hide physical interfaces.

CC: Jiri Benc <jbenc@redhat.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4670ccabe23a..59987eb6511a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8529,10 +8529,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net)
-		new_nsid = peernet2id_alloc(dev_net(dev), net);
-	else
-		new_nsid = peernet2id(dev_net(dev), net);
+	new_nsid = peernet2id_alloc(dev_net(dev), net);
 	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
 
 	/*
-- 
cgit v1.2.3


From 38e01b30563a5b5ade7b54e5d739d16a2b02fe82 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 25 Jan 2018 15:01:39 +0100
Subject: dev: advertise the new ifindex when the netns iface changes

The goal is to let the user follow an interface that moves to another
netns.

CC: Jiri Benc <jbenc@redhat.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h    |  5 +++--
 include/uapi/linux/if_link.h |  1 +
 net/core/dev.c               | 19 ++++++++++++-------
 net/core/rtnetlink.c         | 31 ++++++++++++++++++++-----------
 4 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 62d508b31f56..0514cc36ac34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -19,10 +19,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
 void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
-			 gfp_t flags, int *new_nsid);
+			 gfp_t flags, int *new_nsid, int new_ifindex);
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
-				       gfp_t flags, int *new_nsid);
+				       gfp_t flags, int *new_nsid,
+				       int new_ifindex);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags);
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8616131e2c61..6d9447700e18 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -163,6 +163,7 @@ enum {
 	IFLA_IF_NETNSID,
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
+	IFLA_NEW_IFINDEX,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 59987eb6511a..858501b12869 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7360,7 +7360,7 @@ static void rollback_registered_many(struct list_head *head)
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
-						     GFP_KERNEL, NULL);
+						     GFP_KERNEL, NULL, 0);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -8473,7 +8473,7 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	int err, new_nsid;
+	int err, new_nsid, new_ifindex;
 
 	ASSERT_RTNL();
 
@@ -8529,8 +8529,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+
 	new_nsid = peernet2id_alloc(dev_net(dev), net);
-	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
+	/* If there is an ifindex conflict assign a new one */
+	if (__dev_get_by_index(net, dev->ifindex))
+		new_ifindex = dev_new_index(net);
+	else
+		new_ifindex = dev->ifindex;
+
+	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
+			    new_ifindex);
 
 	/*
 	 *	Flush the unicast and multicast chains
@@ -8544,10 +8552,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
-
-	/* If there is an ifindex conflict assign a new one */
-	if (__dev_get_by_index(net, dev->ifindex))
-		dev->ifindex = dev_new_index(net);
+	dev->ifindex = new_ifindex;
 
 	/* Send a netdev-add uevent to the new namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f111557958bb..e04af7b7f448 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -988,6 +988,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_xdp_size() /* IFLA_XDP */
 	       + nla_total_size(4)  /* IFLA_EVENT */
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
+	       + nla_total_size(4)  /* IFLA_NEW_IFINDEX */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
 	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
@@ -1511,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event, int *new_nsid, int tgt_netnsid)
+			    u32 event, int *new_nsid, int new_ifindex,
+			    int tgt_netnsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1608,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	if (new_nsid &&
 	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
 		goto nla_put_failure;
+	if (new_ifindex &&
+	    nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
+		goto nla_put_failure;
+
 
 	rcu_read_lock();
 	if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
@@ -1853,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0, NULL,
+					       ext_filter_mask, 0, NULL, 0,
 					       netnsid);
 
 			if (err < 0) {
@@ -3088,7 +3094,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
-			       0, NULL, netnsid);
+			       0, NULL, 0, netnsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -3184,7 +3190,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
-				       u32 event, gfp_t flags, int *new_nsid)
+				       u32 event, gfp_t flags, int *new_nsid,
+				       int new_ifindex)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -3197,7 +3204,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, 0, 0, change, 0, 0, event,
-			       new_nsid, -1);
+			       new_nsid, new_ifindex, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -3220,14 +3227,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 			       unsigned int change, u32 event,
-			       gfp_t flags, int *new_nsid)
+			       gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	struct sk_buff *skb;
 
 	if (dev->reg_state != NETREG_REGISTERED)
 		return;
 
-	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
+				     new_ifindex);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -3235,14 +3243,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		  gfp_t flags)
 {
-	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+			   NULL, 0);
 }
 
 void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
-			 gfp_t flags, int *new_nsid)
+			 gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
-			   new_nsid);
+			   new_nsid, new_ifindex);
 }
 
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -4642,7 +4651,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_CHANGELOWERSTATE:
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
-				   GFP_KERNEL, NULL);
+				   GFP_KERNEL, NULL, 0);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 6a643ddb5624be7e0694d49f5765a8d41c1ab6d0 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 25 Jan 2018 18:26:22 -0800
Subject: net: introduce helper dev_change_tx_queue_len()

This patch promotes the local change_tx_queue_len() to a core
helper function, dev_change_tx_queue_len(), so that rtnetlink
and net-sysfs could share the code. This also prepares for the
following patch.

Note, the -EFAULT in the original code doesn't make sense,
we should propagate the errno from notifiers.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 28 ++++++++++++++++++++++++++++
 net/core/net-sysfs.c      | 25 +------------------------
 net/core/rtnetlink.c      | 18 +++++-------------
 4 files changed, 35 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cd46d3d63aa0..4c77f39ebd65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3331,6 +3331,7 @@ int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
 int dev_set_mtu(struct net_device *, int);
+int dev_change_tx_queue_len(struct net_device *, unsigned long);
 void dev_set_group(struct net_device *, int);
 int dev_set_mac_address(struct net_device *, struct sockaddr *);
 int dev_change_carrier(struct net_device *, bool new_carrier);
diff --git a/net/core/dev.c b/net/core/dev.c
index 858501b12869..520c24671bc5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7047,6 +7047,34 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL(dev_set_mtu);
 
+/**
+ *	dev_change_tx_queue_len - Change TX queue length of a netdevice
+ *	@dev: device
+ *	@new_len: new tx queue length
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+	unsigned int orig_len = dev->tx_queue_len;
+	int res;
+
+	if (new_len != (unsigned int)new_len)
+		return -ERANGE;
+
+	if (new_len != orig_len) {
+		dev->tx_queue_len = new_len;
+		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+		res = notifier_to_errno(res);
+		if (res) {
+			netdev_err(dev,
+				   "refused to change device tx_queue_len\n");
+			dev->tx_queue_len = orig_len;
+			return res;
+		}
+	}
+
+	return 0;
+}
+
 /**
  *	dev_set_group - Change group this device belongs to
  *	@dev: device
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c4a28f4667b6..60a5ad2c33ee 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -346,29 +346,6 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
 }
 NETDEVICE_SHOW_RW(flags, fmt_hex);
 
-static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
-{
-	unsigned int orig_len = dev->tx_queue_len;
-	int res;
-
-	if (new_len != (unsigned int)new_len)
-		return -ERANGE;
-
-	if (new_len != orig_len) {
-		dev->tx_queue_len = new_len;
-		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-		res = notifier_to_errno(res);
-		if (res) {
-			netdev_err(dev,
-				   "refused to change device tx_queue_len\n");
-			dev->tx_queue_len = orig_len;
-			return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
 static ssize_t tx_queue_len_store(struct device *dev,
 				  struct device_attribute *attr,
 				  const char *buf, size_t len)
@@ -376,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev,
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
-	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
+	return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len);
 }
 NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e04af7b7f448..061e27cb6c12 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2337,19 +2337,11 @@ static int do_setlink(const struct sk_buff *skb,
 
 	if (tb[IFLA_TXQLEN]) {
 		unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
-		unsigned int orig_len = dev->tx_queue_len;
-
-		if (dev->tx_queue_len ^ value) {
-			dev->tx_queue_len = value;
-			err = call_netdevice_notifiers(
-			      NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-			err = notifier_to_errno(err);
-			if (err) {
-				dev->tx_queue_len = orig_len;
-				goto errout;
-			}
-			status |= DO_SETLINK_MODIFIED;
-		}
+
+		err = dev_change_tx_queue_len(dev, value);
+		if (err)
+			goto errout;
+		status |= DO_SETLINK_MODIFIED;
 	}
 
 	if (tb[IFLA_GSO_MAX_SIZE]) {
-- 
cgit v1.2.3


From 48bfd55e7e4149a304e89c1999436cf52d094a27 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 25 Jan 2018 18:26:23 -0800
Subject: net_sched: plug in qdisc ops change_tx_queue_len

Introduce a new qdisc ops ->change_tx_queue_len() so that
each qdisc could decide how to implement this if it wants.
Previously we simply read dev->tx_queue_len, after pfifo_fast
switches to skb array, we need this API to resize the skb array
when we change dev->tx_queue_len.

To avoid handling race conditions with TX BH, we need to
deactivate all TX queues before changing the value and bring them
back after we are done; this also makes the implementation easier.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  2 ++
 net/core/dev.c            |  1 +
 net/sched/sch_generic.c   | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index eac43e8ca96d..e2ab13687fb9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -200,6 +200,7 @@ struct Qdisc_ops {
 					  struct nlattr *arg,
 					  struct netlink_ext_ack *extack);
 	void			(*attach)(struct Qdisc *sch);
+	int			(*change_tx_queue_len)(struct Qdisc *, unsigned int);
 
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
 	int			(*dump_stats)(struct Qdisc *, struct gnet_dump *);
@@ -489,6 +490,7 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *,
 void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
 void qdisc_class_hash_destroy(struct Qdisc_class_hash *);
 
+int dev_qdisc_change_tx_queue_len(struct net_device *dev);
 void dev_init_scheduler(struct net_device *dev);
 void dev_shutdown(struct net_device *dev);
 void dev_activate(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 520c24671bc5..dda9d7b9a840 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7070,6 +7070,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 			dev->tx_queue_len = orig_len;
 			return res;
 		}
+		return dev_qdisc_change_tx_queue_len(dev);
 	}
 
 	return 0;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1816bde47256..08f9fa27e06e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1178,6 +1178,39 @@ void dev_deactivate(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_deactivate);
 
+static int qdisc_change_tx_queue_len(struct net_device *dev,
+				     struct netdev_queue *dev_queue)
+{
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+	const struct Qdisc_ops *ops = qdisc->ops;
+
+	if (ops->change_tx_queue_len)
+		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
+	return 0;
+}
+
+int dev_qdisc_change_tx_queue_len(struct net_device *dev)
+{
+	bool up = dev->flags & IFF_UP;
+	unsigned int i;
+	int ret = 0;
+
+	if (up)
+		dev_deactivate(dev);
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);
+
+		/* TODO: revert changes on a partial failure */
+		if (ret)
+			break;
+	}
+
+	if (up)
+		dev_activate(dev);
+	return ret;
+}
+
 static void dev_init_scheduler_queue(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_qdisc)
-- 
cgit v1.2.3


From 7007ba630e4a6f809eab9abfc0e3a6e864e9d880 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 25 Jan 2018 18:26:24 -0800
Subject: net_sched: implement ->change_tx_queue_len() for pfifo_fast

pfifo_fast used to drop based on qdisc_dev(qdisc)->tx_queue_len,
so we have to resize skb array when we change tx_queue_len.

Other qdiscs which read tx_queue_len are fine because they
all save it to sch->limit or somewhere else in qdisc during init.
They don't have to implement this, but it would be nicer if they
did, so that users don't have to re-configure the qdisc after
changing tx_queue_len.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 08f9fa27e06e..190570f21b20 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -763,6 +763,23 @@ static void pfifo_fast_destroy(struct Qdisc *sch)
 	}
 }
 
+static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
+					  unsigned int new_len)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(sch);
+	struct skb_array *bands[PFIFO_FAST_BANDS];
+	int prio;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+
+		bands[prio] = q;
+	}
+
+	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
+					 GFP_KERNEL);
+}
+
 struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
@@ -773,6 +790,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.destroy	=	pfifo_fast_destroy,
 	.reset		=	pfifo_fast_reset,
 	.dump		=	pfifo_fast_dump,
+	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
 	.owner		=	THIS_MODULE,
 	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
 };
-- 
cgit v1.2.3


From c76fe2d98c726224a975a0d0198c3fb50406d325 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 25 Jan 2018 20:16:29 -0800
Subject: net: ipv6: send unsolicited NA after DAD

Unsolicited IPv6 neighbor advertisements should be sent after DAD
completes. Update ndisc_send_unsol_na to skip tentative, non-optimistic
addresses and have those sent by addrconf_dad_completed after DAD.

Fixes: 4a6e3c5def13c ("net: ipv6: send unsolicited NA on admin up")
Reported-by: Vivek Venkatraman <vivek@cumulusnetworks.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 30 ++++++++++++++++++++++++++----
 net/ipv6/ndisc.c    |  5 +++++
 2 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab99cb641b7c..adcaaad115f5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -186,7 +186,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 
 static void addrconf_dad_start(struct inet6_ifaddr *ifp);
 static void addrconf_dad_work(struct work_struct *w);
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
+				   bool send_na);
 static void addrconf_dad_run(struct inet6_dev *idev);
 static void addrconf_rs_timer(struct timer_list *t);
 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -3838,12 +3839,17 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	     idev->cnf.accept_dad < 1) ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
 	    ifp->flags & IFA_F_NODAD) {
+		bool send_na = false;
+
+		if (ifp->flags & IFA_F_TENTATIVE &&
+		    !(ifp->flags & IFA_F_OPTIMISTIC))
+			send_na = true;
 		bump_id = ifp->flags & IFA_F_TENTATIVE;
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock(&ifp->lock);
 		read_unlock_bh(&idev->lock);
 
-		addrconf_dad_completed(ifp, bump_id);
+		addrconf_dad_completed(ifp, bump_id, send_na);
 		return;
 	}
 
@@ -3972,16 +3978,21 @@ static void addrconf_dad_work(struct work_struct *w)
 	}
 
 	if (ifp->dad_probes == 0) {
+		bool send_na = false;
+
 		/*
 		 * DAD was successful
 		 */
 
+		if (ifp->flags & IFA_F_TENTATIVE &&
+		    !(ifp->flags & IFA_F_OPTIMISTIC))
+			send_na = true;
 		bump_id = ifp->flags & IFA_F_TENTATIVE;
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock(&ifp->lock);
 		write_unlock_bh(&idev->lock);
 
-		addrconf_dad_completed(ifp, bump_id);
+		addrconf_dad_completed(ifp, bump_id, send_na);
 
 		goto out;
 	}
@@ -4019,7 +4030,8 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
 	return true;
 }
 
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
+				   bool send_na)
 {
 	struct net_device *dev = ifp->idev->dev;
 	struct in6_addr lladdr;
@@ -4051,6 +4063,16 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
 	if (send_mld)
 		ipv6_mc_dad_complete(ifp->idev);
 
+	/* send unsolicited NA if enabled */
+	if (send_na &&
+	    (ifp->idev->cnf.ndisc_notify ||
+	     dev_net(dev)->ipv6.devconf_all->ndisc_notify)) {
+		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr,
+			      /*router=*/ !!ifp->idev->cnf.forwarding,
+			      /*solicited=*/ false, /*override=*/ true,
+			      /*inc_opt=*/ true);
+	}
+
 	if (send_rs) {
 		/*
 		 *	If a host as already performed a random delay
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b3cea200c85e..f61a5b613b52 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -566,6 +566,11 @@ static void ndisc_send_unsol_na(struct net_device *dev)
 
 	read_lock_bh(&idev->lock);
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		/* skip tentative addresses until dad completes */
+		if (ifa->flags & IFA_F_TENTATIVE &&
+		    !(ifa->flags & IFA_F_OPTIMISTIC))
+			continue;
+
 		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
 			      /*router=*/ !!idev->cnf.forwarding,
 			      /*solicited=*/ false, /*override=*/ true,
-- 
cgit v1.2.3


From 31afeb425f7fad8bcf9561aeb0b8405479f97a98 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 26 Jan 2018 11:40:17 -0800
Subject: ipv6: change route cache aging logic

In current route cache aging logic, if a route has both RTF_EXPIRE and
RTF_GATEWAY set, the route will only be removed if the neighbor cache
has no NTF_ROUTER flag. Otherwise, even if the route has expired, it
won't get deleted.
Fix this logic to always check if the route has expired first, and then
do the gateway neighbor cache check if the previous check decides not to
remove the exception entry.

Fixes: 1859bac04fb6 ("ipv6: remove from fib tree aged out RTF_CACHE dst")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fe3966a9c999..fb2d251c0500 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1585,12 +1585,19 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
 	 * expired, independently from their aging, as per RFC 8201 section 4
 	 */
-	if (!(rt->rt6i_flags & RTF_EXPIRES) &&
-	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
-		RT6_TRACE("aging clone %p\n", rt);
+	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
+		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+			RT6_TRACE("aging clone %p\n", rt);
+			rt6_remove_exception(bucket, rt6_ex);
+			return;
+		}
+	} else if (time_after(jiffies, rt->dst.expires)) {
+		RT6_TRACE("purging expired route %p\n", rt);
 		rt6_remove_exception(bucket, rt6_ex);
 		return;
-	} else if (rt->rt6i_flags & RTF_GATEWAY) {
+	}
+
+	if (rt->rt6i_flags & RTF_GATEWAY) {
 		struct neighbour *neigh;
 		__u8 neigh_flags = 0;
 
@@ -1605,11 +1612,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 			rt6_remove_exception(bucket, rt6_ex);
 			return;
 		}
-	} else if (__rt6_check_expired(rt)) {
-		RT6_TRACE("purging expired route %p\n", rt);
-		rt6_remove_exception(bucket, rt6_ex);
-		return;
 	}
+
 	gc_args->more++;
 }
 
-- 
cgit v1.2.3


From e64e469b9a2c22d41b3dd7172118760cec22d473 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 26 Jan 2018 16:10:43 -0800
Subject: ipv6: addrconf: break critical section in addrconf_verify_rtnl()

Heiner reported a lockdep splat [1]

This is caused by attempting GFP_KERNEL allocation while RCU lock is
held and BH blocked.

We believe that addrconf_verify_rtnl() could run for a long period,
so instead of using GFP_ATOMIC here as Ido suggested, we should break
the critical section and restart it after the allocation.

[1]
[86220.125562] =============================
[86220.125586] WARNING: suspicious RCU usage
[86220.125612] 4.15.0-rc7-next-20180110+ #7 Not tainted
[86220.125641] -----------------------------
[86220.125666] kernel/sched/core.c:6026 Illegal context switch in RCU-bh read-side critical section!
[86220.125711]
               other info that might help us debug this:

[86220.125755]
               rcu_scheduler_active = 2, debug_locks = 1
[86220.125792] 4 locks held by kworker/0:2/1003:
[86220.125817]  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000da8e9b73>] process_one_work+0x1de/0x680
[86220.125895]  #1:  ((addr_chk_work).work){+.+.}, at: [<00000000da8e9b73>] process_one_work+0x1de/0x680
[86220.125959]  #2:  (rtnl_mutex){+.+.}, at: [<00000000b06d9510>] rtnl_lock+0x12/0x20
[86220.126017]  #3:  (rcu_read_lock_bh){....}, at: [<00000000aef52299>] addrconf_verify_rtnl+0x1e/0x510 [ipv6]
[86220.126111]
               stack backtrace:
[86220.126142] CPU: 0 PID: 1003 Comm: kworker/0:2 Not tainted 4.15.0-rc7-next-20180110+ #7
[86220.126185] Hardware name: ZOTAC ZBOX-CI321NANO/ZBOX-CI321NANO, BIOS B246P105 06/01/2015
[86220.126250] Workqueue: ipv6_addrconf addrconf_verify_work [ipv6]
[86220.126288] Call Trace:
[86220.126312]  dump_stack+0x70/0x9e
[86220.126337]  lockdep_rcu_suspicious+0xce/0xf0
[86220.126365]  ___might_sleep+0x1d3/0x240
[86220.126390]  __might_sleep+0x45/0x80
[86220.126416]  kmem_cache_alloc_trace+0x53/0x250
[86220.126458]  ? ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.126498]  ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.126538]  ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.126580]  ? ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.126623]  addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.126664]  ? addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.126708]  addrconf_verify_work+0xe/0x20 [ipv6]
[86220.126738]  process_one_work+0x258/0x680
[86220.126765]  worker_thread+0x35/0x3f0
[86220.126790]  kthread+0x124/0x140
[86220.126813]  ? process_one_work+0x680/0x680
[86220.126839]  ? kthread_create_worker_on_cpu+0x40/0x40
[86220.126869]  ? umh_complete+0x40/0x40
[86220.126893]  ? call_usermodehelper_exec_async+0x12a/0x160
[86220.126926]  ret_from_fork+0x4b/0x60
[86220.126999] BUG: sleeping function called from invalid context at mm/slab.h:420
[86220.127041] in_atomic(): 1, irqs_disabled(): 0, pid: 1003, name: kworker/0:2
[86220.127082] 4 locks held by kworker/0:2/1003:
[86220.127107]  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000da8e9b73>] process_one_work+0x1de/0x680
[86220.127179]  #1:  ((addr_chk_work).work){+.+.}, at: [<00000000da8e9b73>] process_one_work+0x1de/0x680
[86220.127242]  #2:  (rtnl_mutex){+.+.}, at: [<00000000b06d9510>] rtnl_lock+0x12/0x20
[86220.127300]  #3:  (rcu_read_lock_bh){....}, at: [<00000000aef52299>] addrconf_verify_rtnl+0x1e/0x510 [ipv6]
[86220.127414] CPU: 0 PID: 1003 Comm: kworker/0:2 Not tainted 4.15.0-rc7-next-20180110+ #7
[86220.127463] Hardware name: ZOTAC ZBOX-CI321NANO/ZBOX-CI321NANO, BIOS B246P105 06/01/2015
[86220.127528] Workqueue: ipv6_addrconf addrconf_verify_work [ipv6]
[86220.127568] Call Trace:
[86220.127591]  dump_stack+0x70/0x9e
[86220.127616]  ___might_sleep+0x14d/0x240
[86220.127644]  __might_sleep+0x45/0x80
[86220.127672]  kmem_cache_alloc_trace+0x53/0x250
[86220.127717]  ? ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.127762]  ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.127807]  ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.127854]  ? ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.127903]  addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.127950]  ? addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.127998]  addrconf_verify_work+0xe/0x20 [ipv6]
[86220.128032]  process_one_work+0x258/0x680
[86220.128063]  worker_thread+0x35/0x3f0
[86220.128091]  kthread+0x124/0x140
[86220.128117]  ? process_one_work+0x680/0x680
[86220.128146]  ? kthread_create_worker_on_cpu+0x40/0x40
[86220.128180]  ? umh_complete+0x40/0x40
[86220.128207]  ? call_usermodehelper_exec_async+0x12a/0x160
[86220.128243]  ret_from_fork+0x4b/0x60

Fixes: f3d9832e56c4 ("ipv6: addrconf: cleanup locking in ipv6_add_addr")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index adcaaad115f5..e1846b97ee69 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4378,9 +4378,11 @@ restart:
 						spin_lock(&ifpub->lock);
 						ifpub->regen_count = 0;
 						spin_unlock(&ifpub->lock);
+						rcu_read_unlock_bh();
 						ipv6_create_tempaddr(ifpub, ifp, true);
 						in6_ifa_put(ifpub);
 						in6_ifa_put(ifp);
+						rcu_read_lock_bh();
 						goto restart;
 					}
 				} else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
-- 
cgit v1.2.3


From 40ca54e3a686f13117f3de0c443f8026dadf7c44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 27 Jan 2018 10:58:43 -0800
Subject: net_sched: gen_estimator: fix lockdep splat

syzbot reported a lockdep splat in gen_new_estimator() /
est_fetch_counters() when attempting to lock est->stats_lock.

Since est_fetch_counters() is called from BH context from timer
interrupt, we need to block BH as well when calling it from process
context.

Most qdiscs use per cpu counters and are immune to the problem,
but net/sched/act_api.c and net/netfilter/xt_RATEEST.c are using
a spinlock to protect their data. They both call gen_new_estimator()
while object is created and not yet alive, so this bug could
not trigger a deadlock, only a lockdep splat.

Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_estimator.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9834cfa21b21..0a3f88f08727 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -159,7 +159,11 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	est->intvl_log = intvl_log;
 	est->cpu_bstats = cpu_bstats;
 
+	if (stats_lock)
+		local_bh_disable();
 	est_fetch_counters(est, &b);
+	if (stats_lock)
+		local_bh_enable();
 	est->last_bytes = b.bytes;
 	est->last_packets = b.packets;
 	old = rcu_dereference_protected(*rate_est, 1);
-- 
cgit v1.2.3


From 30e948a37839c633d18f6c4cf8a212912ba9449c Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Sun, 28 Jan 2018 03:38:58 -0800
Subject: ipv4: Get the address of interface correctly.

When using ioctl to get the address of an interface, we can't
get it anymore. For example, the command is shown below.

	# ifconfig eth0

Since commit 03aef17bb79b3, devinet_ioctl() does not return
a suitable value even though the address can be found in
the kernel. Fix that now.

Fixes: 03aef17bb79b3 ("devinet_ioctl(): take copyin/copyout to caller")
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e056c0067f2c..40f001782c1b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1048,18 +1048,22 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 
 	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
+		ret = 0;
 		sin->sin_addr.s_addr = ifa->ifa_local;
 		break;
 
 	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+		ret = 0;
 		sin->sin_addr.s_addr = ifa->ifa_broadcast;
 		break;
 
 	case SIOCGIFDSTADDR:	/* Get the destination address */
+		ret = 0;
 		sin->sin_addr.s_addr = ifa->ifa_address;
 		break;
 
 	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		ret = 0;
 		sin->sin_addr.s_addr = ifa->ifa_mask;
 		break;
 
-- 
cgit v1.2.3


From 9b42d55a66d388e4dd5550107df051a9637564fc Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 26 Jan 2018 16:40:41 +0800
Subject: tcp: release sk_frag.page in tcp_disconnect

A socket can be disconnected and transformed back into a listening
socket. If sk_frag.page is not released, it will be cloned into
a new socket by sk_clone_lock, but the reference count of this page
is increased, leading to a use-after-free or double-free issue.

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8ed3a04b504..874c9317b8df 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2458,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
+	if (sk->sk_frag.page) {
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+		sk->sk_frag.offset = 0;
+	}
+
 	sk->sk_error_report(sk);
 	return err;
 }
-- 
cgit v1.2.3


From 91e6dd8284256ef62b43b78da6e7684e4f06ac2f Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Tue, 30 Jan 2018 09:48:02 +0000
Subject: ipmr: Fix ptrdiff_t print formatting

ipmr_vif_seq_show() prints the difference between two pointers with the
format string %2zd (z for size_t), however the correct format string is
%2td instead (t for ptrdiff_t).

The same bug in ip6mr_vif_seq_show() was already fixed long ago by
commit d430a227d272 ("bogus format in ip6mr").

Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a819fab45d17..b05689bbba31 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3022,7 +3022,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 		const char *name =  vif->dev ? vif->dev->name : "none";
 
 		seq_printf(seq,
-			   "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
 			   vif - mrt->vif_table,
 			   name, vif->bytes_in, vif->pkt_in,
 			   vif->bytes_out, vif->pkt_out,
-- 
cgit v1.2.3


From 5bb8ed075428b71492734af66230aa0c07fcc515 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 29 Jan 2018 18:07:20 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_NEWLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCES is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_NEWLINK with an IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_NEWLINK. Userspace should then fall back to other means.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 061e27cb6c12..204297dffd2a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2952,14 +2952,10 @@ replay:
 			name_assign_type = NET_NAME_ENUM;
 		}
 
-		dest_net = rtnl_link_get_net(net, tb);
+		dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
 		if (IS_ERR(dest_net))
 			return PTR_ERR(dest_net);
 
-		err = -EPERM;
-		if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN))
-			goto out;
-
 		if (tb[IFLA_LINK_NETNSID]) {
 			int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
 
-- 
cgit v1.2.3


From e4823fbd229bfbba368b40cdadb8f4eeb20604cc Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 30 Jan 2018 22:21:48 -0600
Subject: tcp_nv: fix potential integer overflow in tcpnv_acked

Add suffix ULL to constant 80000 in order to avoid a potential integer
overflow and give the compiler complete information about the proper
arithmetic to use. Notice that this constant is used in a context that
expects an expression of type u64.

The current cast to u64 effectively applies to the whole expression
as an argument of type u64 to be passed to div64_u64, but it does
not prevent it from being evaluated using 32-bit arithmetic instead
of 64-bit arithmetic.

Also, once the expression is properly evaluated using 64-bit arithmetic,
there is no need for the parentheses and the external cast to u64.

Addresses-Coverity-ID: 1357588 ("Unintentional integer overflow")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_nv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index ddbce73edae8..764298e52577 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -364,7 +364,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
 		 */
 		cwnd_by_slope = (u32)
 			div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
-				  (u64)(80000 * tp->mss_cache));
+				  80000ULL * tp->mss_cache);
 		max_win = cwnd_by_slope + nv_pad;
 
 		/* If cwnd > max_win, decrease cwnd
-- 
cgit v1.2.3


From 5b7789e8fa8f353ad8f2c44de2385cb161b22d32 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 30 Jan 2018 22:55:33 -0600
Subject: openvswitch: meter: Use 64-bit arithmetic instead of 32-bit

Add suffix LL to constant 1000 in order to give the compiler
complete information about the proper arithmetic to use. Notice
that this constant is used in a context that expects an expression
of type long long int (64 bits, signed).

The expression (band->burst_size + band->rate) * 1000 is currently
being evaluated using 32-bit arithmetic.

Addresses-Coverity-ID: 1461563 ("Unintentional integer overflow")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/meter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 3fbfc78991ac..04b94281a30b 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -488,7 +488,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
 		long long int max_bucket_size;
 
 		band = &meter->bands[i];
-		max_bucket_size = (band->burst_size + band->rate) * 1000;
+		max_bucket_size = (band->burst_size + band->rate) * 1000LL;
 
 		band->bucket += delta_ms * band->rate;
 		if (band->bucket > max_bucket_size)
-- 
cgit v1.2.3


From 11eab14805d0220000711cb62e604a9db5ebddca Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:20 +0100
Subject: net/sched: kconfig: Remove blank help texts

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index c03d86a7775e..f24a6ae6819a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -857,17 +857,14 @@ config NET_ACT_TUNNEL_KEY
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
-        ---help---
 
 config NET_IFE_SKBPRIO
         tristate "Support to encoding decoding skb prio on IFE action"
         depends on NET_ACT_IFE
-        ---help---
 
 config NET_IFE_SKBTCINDEX
         tristate "Support to encoding decoding skb tcindex on IFE action"
         depends on NET_ACT_IFE
-        ---help---
 
 config NET_CLS_IND
 	bool "Incoming device classification"
-- 
cgit v1.2.3


From 4adfa79fc254efb7b0eb3cd58f62c2c3f805f1ba Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 31 Jan 2018 16:29:30 +0200
Subject: ip6mr: fix stale iterator

When we dump the ip6mr mfc entries via proc, we initialize an iterator
with the table to dump but we don't clear the cache pointer which might
be initialized from a prior read on the same descriptor that ended. This
can result in lock imbalance (an unnecessary unlock) leading to other
crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
Thanks for the reliable reproducer.

Here's syzbot's trace:
 WARNING: bad unlock balance detected!
 4.15.0-rc3+ #128 Not tainted
 syzkaller971460/3195 is trying to release lock (mrt_lock) at:
 [<000000006898068d>] ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
 but there are no more locks to release!

 other info that might help us debug this:
 1 lock held by syzkaller971460/3195:
  #0:  (&p->lock){+.+.}, at: [<00000000744a6565>] seq_read+0xd5/0x13d0
 fs/seq_file.c:165

 stack backtrace:
 CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:53
  print_unlock_imbalance_bug+0x12f/0x140 kernel/locking/lockdep.c:3561
  __lock_release kernel/locking/lockdep.c:3775 [inline]
  lock_release+0x5f9/0xda0 kernel/locking/lockdep.c:4023
  __raw_read_unlock include/linux/rwlock_api_smp.h:225 [inline]
  _raw_read_unlock+0x1a/0x30 kernel/locking/spinlock.c:255
  ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
  traverse+0x3bc/0xa00 fs/seq_file.c:135
  seq_read+0x96a/0x13d0 fs/seq_file.c:189
  proc_reg_read+0xef/0x170 fs/proc/inode.c:217
  do_loop_readv_writev fs/read_write.c:673 [inline]
  do_iter_read+0x3db/0x5b0 fs/read_write.c:897
  compat_readv+0x1bf/0x270 fs/read_write.c:1140
  do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
  C_SYSC_preadv fs/read_write.c:1209 [inline]
  compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
 RIP: 0023:0xf7f73c79
 RSP: 002b:00000000e574a15c EFLAGS: 00000292 ORIG_RAX: 000000000000014d
 RAX: ffffffffffffffda RBX: 000000000000000f RCX: 0000000020a3afb0
 RDX: 0000000000000001 RSI: 0000000000000067 RDI: 0000000000000000
 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 BUG: sleeping function called from invalid context at lib/usercopy.c:25
 in_atomic(): 1, irqs_disabled(): 0, pid: 3195, name: syzkaller971460
 INFO: lockdep is turned off.
 CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:53
  ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6060
  __might_sleep+0x95/0x190 kernel/sched/core.c:6013
  __might_fault+0xab/0x1d0 mm/memory.c:4525
  _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
  copy_to_user include/linux/uaccess.h:155 [inline]
  seq_read+0xcb4/0x13d0 fs/seq_file.c:279
  proc_reg_read+0xef/0x170 fs/proc/inode.c:217
  do_loop_readv_writev fs/read_write.c:673 [inline]
  do_iter_read+0x3db/0x5b0 fs/read_write.c:897
  compat_readv+0x1bf/0x270 fs/read_write.c:1140
  do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
  C_SYSC_preadv fs/read_write.c:1209 [inline]
  compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
 RIP: 0023:0xf7f73c79
 RSP: 002b:00000000e574a15c EFLAGS: 00000292 ORIG_RAX: 000000000000014d
 RAX: ffffffffffffffda RBX: 000000000000000f RCX: 0000000020a3afb0
 RDX: 0000000000000001 RSI: 0000000000000067 RDI: 0000000000000000
 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 WARNING: CPU: 1 PID: 3195 at lib/usercopy.c:26 _copy_to_user+0xb5/0xc0
 lib/usercopy.c:26

Reported-by: syzbot <bot+eceb3204562c41a438fa1f2335e0fe4f6886d669@syzkaller.appspotmail.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 754ef84cf354..9f6cace9c817 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -494,6 +494,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 		return ERR_PTR(-ENOENT);
 
 	it->mrt = mrt;
+	it->cache = NULL;
 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
 		: SEQ_START_TOKEN;
 }
-- 
cgit v1.2.3


From a54667f6728c2714a400f3c884727da74b6d1717 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 31 Jan 2018 21:34:37 +0530
Subject: tls: Add support for encryption using async offload accelerator

Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
GCM operations. If they are enabled, crypto_aead_encrypt() returns the
error code -EINPROGRESS. In this case tls_do_encryption() needs to wait
on a completion until the response for the crypto offload request is
received.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 ++
 net/tls/tls_sw.c  | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 9185e53a743c..4913430ab807 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -36,6 +36,7 @@
 
 #include <linux/types.h>
 #include <asm/byteorder.h>
+#include <linux/crypto.h>
 #include <linux/socket.h>
 #include <linux/tcp.h>
 #include <net/tcp.h>
@@ -57,6 +58,7 @@
 
 struct tls_sw_context {
 	struct crypto_aead *aead_send;
+	struct crypto_wait async_wait;
 
 	/* Sending context */
 	char aad_space[TLS_AAD_SPACE_SIZE];
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 0a9b72fbd761..f26376e954ae 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -214,7 +214,11 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
 	aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
 			       data_len, tls_ctx->iv);
-	rc = crypto_aead_encrypt(aead_req);
+
+	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  crypto_req_done, &ctx->async_wait);
+
+	rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait);
 
 	ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size;
 	ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size;
@@ -665,6 +669,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto out;
 	}
 
+	crypto_init_wait(&sw_ctx->async_wait);
+
 	ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
 
 	crypto_info = &ctx->crypto_send;
-- 
cgit v1.2.3