From 00fd38d938db3f1ab1c486549afc450cb7e751b1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 12 Nov 2015 08:43:18 -0800
Subject: tcp: ensure proper barriers in lockless contexts

Some functions access TCP sockets without holding a lock and
might output non consistent data, depending on compiler and or
architecture.

tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...

Introduce sk_state_load() and sk_state_store() to fix the issues,
and more clearly document where this lack of locking is happening.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index bbf7c2cf15b4..7f89e4ba18d1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2226,6 +2226,31 @@ static inline bool sk_listener(const struct sock *sk)
 	return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
 }
 
+/**
+ * sk_state_load - read sk->sk_state for lockless contexts
+ * @sk: socket pointer
+ *
+ * Paired with sk_state_store(). Used in places we do not hold socket lock :
+ * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
+ */
+static inline int sk_state_load(const struct sock *sk)
+{
+	return smp_load_acquire(&sk->sk_state);
+}
+
+/**
+ * sk_state_store - update sk->sk_state
+ * @sk: socket pointer
+ * @newstate: new state
+ *
+ * Paired with sk_state_load(). Should be used in contexts where
+ * state change might impact lockless readers.
+ */
+static inline void sk_state_store(struct sock *sk, int newstate)
+{
+	smp_store_release(&sk->sk_state, newstate);
+}
+
 void sock_enable_timestamp(struct sock *sk, int flag);
 int sock_get_timestamp(struct sock *, struct timeval __user *);
 int sock_get_timestampns(struct sock *, struct timespec __user *);
-- 
cgit v1.2.3


From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 26 Nov 2015 13:55:39 +0800
Subject: net: Generalise wq_has_sleeper helper

The memory barrier in the helper wq_has_sleeper is needed by just
about every user of waitqueue_active.  This patch generalises it
by making it take a wait_queue_head_t directly.  The existing
helper is renamed to skwq_has_sleeper.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/algif_aead.c     |  4 ++--
 crypto/algif_skcipher.c |  4 ++--
 include/linux/wait.h    | 21 +++++++++++++++++++++
 include/net/sock.h      | 15 +++++----------
 net/atm/common.c        |  4 ++--
 net/core/sock.c         |  8 ++++----
 net/core/stream.c       |  2 +-
 net/dccp/output.c       |  2 +-
 net/iucv/af_iucv.c      |  2 +-
 net/rxrpc/af_rxrpc.c    |  2 +-
 net/sctp/socket.c       |  2 +-
 net/tipc/socket.c       |  4 ++--
 net/unix/af_unix.c      |  2 +-
 13 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 0aa6fdfb448a..fb99f30849d2 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -106,7 +106,7 @@ static void aead_wmem_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 							   POLLRDNORM |
 							   POLLRDBAND);
@@ -157,7 +157,7 @@ static void aead_data_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 							   POLLRDNORM |
 							   POLLRDBAND);
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index af31a0ee4057..0e6702e41472 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -238,7 +238,7 @@ static void skcipher_wmem_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 							   POLLRDNORM |
 							   POLLRDBAND);
@@ -288,7 +288,7 @@ static void skcipher_data_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 							   POLLRDNORM |
 							   POLLRDBAND);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..6aa09a875fbd 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -107,6 +107,27 @@ static inline int waitqueue_active(wait_queue_head_t *q)
 	return !list_empty(&q->task_list);
 }
 
+/**
+ * wq_has_sleeper - check if there are any waiting processes
+ * @wq: wait queue head
+ *
+ * Returns true if wq has waiting processes
+ *
+ * Please refer to the comment for waitqueue_active.
+ */
+static inline bool wq_has_sleeper(wait_queue_head_t *wq)
+{
+	/*
+	 * We need to be sure we are in sync with the
+	 * add_wait_queue modifications to the wait queue.
+	 *
+	 * This memory barrier should be paired with one on the
+	 * waiting side.
+	 */
+	smp_mb();
+	return waitqueue_active(wq);
+}
+
 extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
 extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
 extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4ba18d1..62d35afcb3ac 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -58,6 +58,7 @@
 #include <linux/memcontrol.h>
 #include <linux/static_key.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -1879,12 +1880,12 @@ static inline bool sk_has_allocations(const struct sock *sk)
 }
 
 /**
- * wq_has_sleeper - check if there are any waiting processes
+ * skwq_has_sleeper - check if there are any waiting processes
  * @wq: struct socket_wq
  *
  * Returns true if socket_wq has waiting processes
  *
- * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
+ * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
  * barrier call. They were added due to the race found within the tcp code.
  *
  * Consider following tcp code paths:
@@ -1910,15 +1911,9 @@ static inline bool sk_has_allocations(const struct sock *sk)
  * data on the socket.
  *
  */
-static inline bool wq_has_sleeper(struct socket_wq *wq)
+static inline bool skwq_has_sleeper(struct socket_wq *wq)
 {
-	/* We need to be sure we are in sync with the
-	 * add_wait_queue modifications to the wait queue.
-	 *
-	 * This memory barrier is paired in the sock_poll_wait.
-	 */
-	smp_mb();
-	return wq && waitqueue_active(&wq->wait);
+	return wq && wq_has_sleeper(&wq->wait);
 }
 
 /**
diff --git a/net/atm/common.c b/net/atm/common.c
index 49a872db7e42..6dc12305799e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -96,7 +96,7 @@ static void vcc_def_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up(&wq->wait);
 	rcu_read_unlock();
 }
@@ -117,7 +117,7 @@ static void vcc_write_space(struct sock *sk)
 
 	if (vcc_writable(sk)) {
 		wq = rcu_dereference(sk->sk_wq);
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible(&wq->wait);
 
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e4dd54bfb5a..2769bd3a4d7c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2283,7 +2283,7 @@ static void sock_def_wakeup(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_all(&wq->wait);
 	rcu_read_unlock();
 }
@@ -2294,7 +2294,7 @@ static void sock_def_error_report(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_poll(&wq->wait, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	rcu_read_unlock();
@@ -2306,7 +2306,7 @@ static void sock_def_readable(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -2324,7 +2324,7 @@ static void sock_def_write_space(struct sock *sk)
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		wq = rcu_dereference(sk->sk_wq);
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
diff --git a/net/core/stream.c b/net/core/stream.c
index d70f77a0c889..8ff9d63b4265 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk)
 
 		rcu_read_lock();
 		wq = rcu_dereference(sk->sk_wq);
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 4ce912e691d0..b66c84db0766 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -201,7 +201,7 @@ void dccp_write_space(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible(&wq->wait);
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index fcb2752419c6..4f0aa91470c6 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -303,7 +303,7 @@ static void iucv_sock_wake_msglim(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_all(&wq->wait);
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	rcu_read_unlock();
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 1f8a144a5dc2..7e2d1057d8bc 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -67,7 +67,7 @@ static void rxrpc_write_space(struct sock *sk)
 	if (rxrpc_writable(sk)) {
 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
 
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 897c01c029ca..ec10b66354b8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6978,7 +6978,7 @@ void sctp_data_ready(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 552dbaba9cf3..525acf6dd1c6 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1492,7 +1492,7 @@ static void tipc_write_space(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 	rcu_read_unlock();
@@ -1509,7 +1509,7 @@ static void tipc_data_ready(struct sock *sk)
 
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
-	if (wq_has_sleeper(wq))
+	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	rcu_read_unlock();
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 955ec152cb71..efb706e1d1c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -339,7 +339,7 @@ static void unix_write_space(struct sock *sk)
 	rcu_read_lock();
 	if (unix_writable(sk)) {
 		wq = rcu_dereference(sk->sk_wq);
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
 				POLLOUT | POLLWRNORM | POLLWRBAND);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-- 
cgit v1.2.3


From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Nov 2015 20:03:10 -0800
Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA

This patch is a cleanup to make following patch easier to
review.

Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
from (struct socket)->flags to a (struct socket_wq)->flags
to benefit from RCU protection in sock_wake_async()

To ease backports, we rename both constants.

Two new helpers, sk_set_bit(int nr, struct sock *sk)
and sk_clear_bit(int net, struct sock *sk) are added so that
following patch can change their implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/algif_aead.c          |  4 ++--
 crypto/algif_skcipher.c      |  6 +++---
 drivers/net/macvtap.c        |  4 ++--
 drivers/net/tun.c            |  4 ++--
 fs/dlm/lowcomms.c            |  4 ++--
 include/linux/net.h          |  6 +++---
 include/net/sock.h           | 10 ++++++++++
 net/bluetooth/af_bluetooth.c |  6 +++---
 net/caif/caif_socket.c       |  4 ++--
 net/core/datagram.c          |  2 +-
 net/core/sock.c              |  8 ++++----
 net/core/stream.c            |  4 ++--
 net/dccp/proto.c             |  3 +--
 net/decnet/af_decnet.c       |  8 ++++----
 net/ipv4/tcp.c               |  7 +++----
 net/iucv/af_iucv.c           |  2 +-
 net/nfc/llcp_sock.c          |  2 +-
 net/rxrpc/ar-output.c        |  2 +-
 net/sctp/socket.c            |  2 +-
 net/socket.c                 |  4 ++--
 net/sunrpc/xprtsock.c        | 14 +++++++-------
 net/unix/af_unix.c           |  6 +++---
 22 files changed, 60 insertions(+), 52 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 0aa6fdfb448a..6d4d4569447e 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -125,7 +125,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags)
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;
 
-	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	for (;;) {
 		if (signal_pending(current))
@@ -139,7 +139,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags)
 	}
 	finish_wait(sk_sleep(sk), &wait);
 
-	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	return err;
 }
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index af31a0ee4057..ca9efe17db1a 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -212,7 +212,7 @@ static int skcipher_wait_for_wmem(struct sock *sk, unsigned flags)
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;
 
-	set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	for (;;) {
 		if (signal_pending(current))
@@ -258,7 +258,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags)
 		return -EAGAIN;
 	}
 
-	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	for (;;) {
 		if (signal_pending(current))
@@ -272,7 +272,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags)
 	}
 	finish_wait(sk_sleep(sk), &wait);
 
-	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	return err;
 }
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 54036ae0a388..0fc521941c71 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -498,7 +498,7 @@ static void macvtap_sock_write_space(struct sock *sk)
 	wait_queue_head_t *wqueue;
 
 	if (!sock_writeable(sk) ||
-	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
 	wqueue = sk_sleep(sk);
@@ -585,7 +585,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
 		mask |= POLLIN | POLLRDNORM;
 
 	if (sock_writeable(&q->sk) ||
-	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
 	     sock_writeable(&q->sk)))
 		mask |= POLLOUT | POLLWRNORM;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b1878faea397..f0db770e8b2f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1040,7 +1040,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 		mask |= POLLIN | POLLRDNORM;
 
 	if (sock_writeable(sk) ||
-	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
 	     sock_writeable(sk)))
 		mask |= POLLOUT | POLLWRNORM;
 
@@ -1488,7 +1488,7 @@ static void tun_sock_write_space(struct sock *sk)
 	if (!sock_writeable(sk))
 		return;
 
-	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
 	wqueue = sk_sleep(sk);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 87e9d796cf7d..3a37bd3f9637 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -421,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk)
 
 	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
 		con->sock->sk->sk_write_pending--;
-		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+		clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
 	}
 
 	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
@@ -1448,7 +1448,7 @@ static void send_to_sock(struct connection *con)
 					      msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
 				if (ret == -EAGAIN &&
-				    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+				    test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
 				    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
 					/* Notify TCP that we're limited by the
 					 * application window size.
diff --git a/include/linux/net.h b/include/linux/net.h
index 70ac5e28e6b7..f514e4dd5521 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -34,8 +34,8 @@ struct inode;
 struct file;
 struct net;
 
-#define SOCK_ASYNC_NOSPACE	0
-#define SOCK_ASYNC_WAITDATA	1
+#define SOCKWQ_ASYNC_NOSPACE	0
+#define SOCKWQ_ASYNC_WAITDATA	1
 #define SOCK_NOSPACE		2
 #define SOCK_PASSCRED		3
 #define SOCK_PASSSEC		4
@@ -96,7 +96,7 @@ struct socket_wq {
  *  struct socket - general BSD socket
  *  @state: socket state (%SS_CONNECTED, etc)
  *  @type: socket type (%SOCK_STREAM, etc)
- *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
+ *  @flags: socket flags (%SOCK_NOSPACE, etc)
  *  @ops: protocol specific socket operations
  *  @file: File back pointer for gc
  *  @sk: internal networking protocol agnostic socket representation
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4ba18d1..c155d09d8af4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk)
 	return amt;
 }
 
+static inline void sk_set_bit(int nr, struct sock *sk)
+{
+	set_bit(nr, &sk->sk_socket->flags);
+}
+
+static inline void sk_clear_bit(int nr, struct sock *sk)
+{
+	clear_bit(nr, &sk->sk_socket->flags);
+}
+
 static inline void sk_wake_async(struct sock *sk, int how, int band)
 {
 	if (sock_flag(sk, SOCK_FASYNC))
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index a3bffd1ec2b4..70306cc9d814 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo)
 		if (signal_pending(current) || !timeo)
 			break;
 
-		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		release_sock(sk);
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
-		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	}
 
 	__set_current_state(TASK_RUNNING);
@@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock,
 	if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	return mask;
 }
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index cc858919108e..aa209b1066c9 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -323,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
 			!timeo)
 			break;
 
-		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		release_sock(sk);
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
@@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
 		if (sock_flag(sk, SOCK_DEAD))
 			break;
 
-		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	}
 
 	finish_wait(sk_sleep(sk), &wait);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 617088aee21d..d62af69ad844 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	if (sock_writeable(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	return mask;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e4dd54bfb5a..9d79569935a3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1815,7 +1815,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 {
 	DEFINE_WAIT(wait);
 
-	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 	for (;;) {
 		if (!timeo)
 			break;
@@ -1861,7 +1861,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
 			break;
 
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		err = -EAGAIN;
 		if (!timeo)
@@ -2048,9 +2048,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
-	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	finish_wait(sk_sleep(sk), &wait);
 	return rc;
 }
diff --git a/net/core/stream.c b/net/core/stream.c
index d70f77a0c889..43309428644d 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
 
 	while (1) {
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
@@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		}
 		if (signal_pending(current))
 			goto do_interrupted;
-		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 		if (sk_stream_memory_free(sk) && !vm_wait)
 			break;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index b5cf13a28009..41e65804ddf5 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
 			if (sk_stream_is_writeable(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
-				set_bit(SOCK_ASYNC_NOSPACE,
-					&sk->sk_socket->flags);
+				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
 				/* Race breaker. If space is freed after
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 675cf94e04f8..eebf5ac8ce18 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1747,9 +1747,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		}
 
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
-		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		finish_wait(sk_sleep(sk), &wait);
 	}
 
@@ -2004,10 +2004,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 			}
 
 			prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-			set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+			sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 			sk_wait_event(sk, &timeo,
 				      !dn_queue_too_long(scp, queue, flags));
-			clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+			sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 			finish_wait(sk_sleep(sk), &wait);
 			continue;
 		}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c1728771cf89..c82cca18c90f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -517,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 			if (sk_stream_is_writeable(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
-				set_bit(SOCK_ASYNC_NOSPACE,
-					&sk->sk_socket->flags);
+				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
 				/* Race breaker. If space is freed after
@@ -906,7 +905,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			goto out_err;
 	}
 
-	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
 	copied = 0;
@@ -1134,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	}
 
 	/* This should be in poll */
-	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index fcb2752419c6..435608c4306d 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1483,7 +1483,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
 	if (sock_writeable(sk) && iucv_below_msglim(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	return mask;
 }
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b7de0da46acd..ecf0a0196f18 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock,
 	if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED)
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	pr_debug("mask 0x%x\n", mask);
 
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index a40d3afe93b7..14c4e12c47b0 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
 	/* this should be in poll */
-	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		return -EPIPE;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 897c01c029ca..2353985d689c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6458,7 +6458,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sctp_writeable(sk)) {
 		mask |= POLLOUT | POLLWRNORM;
 	} else {
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 		/*
 		 * Since the socket is not locked, the buffer
 		 * might be made available after the writeable check and
diff --git a/net/socket.c b/net/socket.c
index dd2c247c99e3..16be908205fc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1072,11 +1072,11 @@ int sock_wake_async(struct socket *sock, int how, int band)
 	}
 	switch (how) {
 	case SOCK_WAKE_WAITD:
-		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
+		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags))
 			break;
 		goto call_kill;
 	case SOCK_WAKE_SPACE:
-		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
+		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags))
 			break;
 		/* fall through */
 	case SOCK_WAKE_IO:
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1d1a70498910..2ffaf6a79499 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -398,7 +398,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+	clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags);
 	if (base != 0) {
 		addr = NULL;
 		addrlen = 0;
@@ -442,7 +442,7 @@ static void xs_nospace_callback(struct rpc_task *task)
 	struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
 
 	transport->inet->sk_write_pending--;
-	clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+	clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 }
 
 /**
@@ -467,7 +467,7 @@ static int xs_nospace(struct rpc_task *task)
 
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
-		if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
+		if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) {
 			/*
 			 * Notify TCP that we're limited by the application
 			 * window size
@@ -478,7 +478,7 @@ static int xs_nospace(struct rpc_task *task)
 			xprt_wait_for_buffer_space(task, xs_nospace_callback);
 		}
 	} else {
-		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 		ret = -ENOTCONN;
 	}
 
@@ -626,7 +626,7 @@ process_status:
 	case -EPERM:
 		/* When the server has died, an ICMP port unreachable message
 		 * prompts ECONNREFUSED. */
-		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 	}
 
 	return status;
@@ -715,7 +715,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	case -EADDRINUSE:
 	case -ENOBUFS:
 	case -EPIPE:
-		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 	}
 
 	return status;
@@ -1618,7 +1618,7 @@ static void xs_write_space(struct sock *sk)
 
 	if (unlikely(!(xprt = xprt_from_sock(sk))))
 		return;
-	if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
+	if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0)
 		return;
 
 	xprt_write_space(xprt);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6ced74690eee..45aebd966978 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2191,7 +2191,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 		    !timeo)
 			break;
 
-		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		unix_state_unlock(sk);
 		timeo = freezable_schedule_timeout(timeo);
 		unix_state_lock(sk);
@@ -2199,7 +2199,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 		if (sock_flag(sk, SOCK_DEAD))
 			break;
 
-		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	}
 
 	finish_wait(sk_sleep(sk), &wait);
@@ -2683,7 +2683,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	if (writable)
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
 	return mask;
 }
-- 
cgit v1.2.3


From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Nov 2015 20:03:11 -0800
Subject: net: fix sock_wake_async() rcu protection

Dmitry provided a syzkaller (http://github.com/google/syzkaller)
triggering a fault in sock_wake_async() when async IO is requested.

Said program stressed af_unix sockets, but the issue is generic
and should be addressed in core networking stack.

The problem is that by the time sock_wake_async() is called,
we should not access the @flags field of 'struct socket',
as the inode containing this socket might be freed without
further notice, and without RCU grace period.

We already maintain an RCU protected structure, "struct socket_wq"
so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it
is the safe route.

It also reduces number of cache lines needing dirtying, so might
provide a performance improvement anyway.

In followup patches, we might move remaining flags (SOCK_NOSPACE,
SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket'
being mostly read and let it being shared between cpus.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  7 ++++++-
 include/net/sock.h  | 23 ++++++++++++++++-------
 net/core/stream.c   |  2 +-
 net/sctp/socket.c   | 24 ++++++++++++++----------
 net/socket.c        | 21 +++++++--------------
 5 files changed, 44 insertions(+), 33 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/net.h b/include/linux/net.h
index f514e4dd5521..0b4ac7da583a 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -34,6 +34,10 @@ struct inode;
 struct file;
 struct net;
 
+/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located
+ * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
+ * Eventually all flags will be in sk->sk_wq_flags.
+ */
 #define SOCKWQ_ASYNC_NOSPACE	0
 #define SOCKWQ_ASYNC_WAITDATA	1
 #define SOCK_NOSPACE		2
@@ -89,6 +93,7 @@ struct socket_wq {
 	/* Note: wait MUST be first field of socket_wq */
 	wait_queue_head_t	wait;
 	struct fasync_struct	*fasync_list;
+	unsigned long		flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */
 	struct rcu_head		rcu;
 } ____cacheline_aligned_in_smp;
 
@@ -202,7 +207,7 @@ enum {
 	SOCK_WAKE_URG,
 };
 
-int sock_wake_async(struct socket *sk, int how, int band);
+int sock_wake_async(struct socket_wq *sk_wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
diff --git a/include/net/sock.h b/include/net/sock.h
index c155d09d8af4..0434138c5f95 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -384,8 +384,10 @@ struct sock {
 	int			sk_rcvbuf;
 
 	struct sk_filter __rcu	*sk_filter;
-	struct socket_wq __rcu	*sk_wq;
-
+	union {
+		struct socket_wq __rcu	*sk_wq;
+		struct socket_wq	*sk_wq_raw;
+	};
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
@@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk)
 	return amt;
 }
 
+/* Note:
+ *  We use sk->sk_wq_raw, from contexts knowing this
+ *  pointer is not NULL and cannot disappear/change.
+ */
 static inline void sk_set_bit(int nr, struct sock *sk)
 {
-	set_bit(nr, &sk->sk_socket->flags);
+	set_bit(nr, &sk->sk_wq_raw->flags);
 }
 
 static inline void sk_clear_bit(int nr, struct sock *sk)
 {
-	clear_bit(nr, &sk->sk_socket->flags);
+	clear_bit(nr, &sk->sk_wq_raw->flags);
 }
 
-static inline void sk_wake_async(struct sock *sk, int how, int band)
+static inline void sk_wake_async(const struct sock *sk, int how, int band)
 {
-	if (sock_flag(sk, SOCK_FASYNC))
-		sock_wake_async(sk->sk_socket, how, band);
+	if (sock_flag(sk, SOCK_FASYNC)) {
+		rcu_read_lock();
+		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+		rcu_read_unlock();
+	}
 }
 
 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
diff --git a/net/core/stream.c b/net/core/stream.c
index 43309428644d..b96f7a79e544 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
-			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		rcu_read_unlock();
 	}
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2353985d689c..5e35ef34008b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6801,26 +6801,30 @@ no_packet:
 static void __sctp_write_space(struct sctp_association *asoc)
 {
 	struct sock *sk = asoc->base.sk;
-	struct socket *sock = sk->sk_socket;
 
-	if ((sctp_wspace(asoc) > 0) && sock) {
-		if (waitqueue_active(&asoc->wait))
-			wake_up_interruptible(&asoc->wait);
+	if (sctp_wspace(asoc) <= 0)
+		return;
+
+	if (waitqueue_active(&asoc->wait))
+		wake_up_interruptible(&asoc->wait);
 
-		if (sctp_writeable(sk)) {
-			wait_queue_head_t *wq = sk_sleep(sk);
+	if (sctp_writeable(sk)) {
+		struct socket_wq *wq;
 
-			if (wq && waitqueue_active(wq))
-				wake_up_interruptible(wq);
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq) {
+			if (waitqueue_active(&wq->wait))
+				wake_up_interruptible(&wq->wait);
 
 			/* Note that we try to include the Async I/O support
 			 * here by modeling from the current TCP/UDP code.
 			 * We have not tested with it yet.
 			 */
 			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
-				sock_wake_async(sock,
-						SOCK_WAKE_SPACE, POLL_OUT);
+				sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		}
+		rcu_read_unlock();
 	}
 }
 
diff --git a/net/socket.c b/net/socket.c
index 16be908205fc..456fadb3d819 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
 	return 0;
 }
 
-/* This function may be called only under socket lock or callback_lock or rcu_lock */
+/* This function may be called only under rcu_lock */
 
-int sock_wake_async(struct socket *sock, int how, int band)
+int sock_wake_async(struct socket_wq *wq, int how, int band)
 {
-	struct socket_wq *wq;
-
-	if (!sock)
-		return -1;
-	rcu_read_lock();
-	wq = rcu_dereference(sock->wq);
-	if (!wq || !wq->fasync_list) {
-		rcu_read_unlock();
+	if (!wq || !wq->fasync_list)
 		return -1;
-	}
+
 	switch (how) {
 	case SOCK_WAKE_WAITD:
-		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags))
+		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
 			break;
 		goto call_kill;
 	case SOCK_WAKE_SPACE:
-		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags))
+		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
 			break;
 		/* fall through */
 	case SOCK_WAKE_IO:
@@ -1086,7 +1079,7 @@ call_kill:
 	case SOCK_WAKE_URG:
 		kill_fasync(&wq->fasync_list, SIGURG, band);
 	}
-	rcu_read_unlock();
+
 	return 0;
 }
 EXPORT_SYMBOL(sock_wake_async);
-- 
cgit v1.2.3


From 7450aaf61f0ae2ee6cc6491138d11df2c25e7609 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Nov 2015 08:57:28 -0800
Subject: tcp: suppress too verbose messages in tcp_send_ack()

If tcp_send_ack() can not allocate skb, we properly handle this
and setup a timer to try later.

Use __GFP_NOWARN to avoid polluting syslog in the case host is
under memory pressure, so that pertinent messages are not lost under
a flood of useless information.

sk_gfp_atomic() can use its gfp_mask argument (all callers currently
were using GFP_ATOMIC before this patch)

We rename sk_gfp_atomic() to sk_gfp_mask() to clearly express this
function now takes into account its second argument (gfp_mask)

Note that when tcp_transmit_skb() is called with clone_it set to false,
we do not attempt memory allocations, so can pass a 0 gfp_mask, which
most compilers can emit faster than a non zero or constant value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    |  4 ++--
 net/ipv4/tcp_output.c | 14 ++++++++------
 net/ipv6/tcp_ipv6.c   |  6 +++---
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 62d35afcb3ac..9065f8b7e646 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -775,9 +775,9 @@ static inline int sk_memalloc_socks(void)
 
 #endif
 
-static inline gfp_t sk_gfp_atomic(const struct sock *sk, gfp_t gfp_mask)
+static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
 {
-	return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
+	return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
 }
 
 static inline void sk_acceptq_removed(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cb7ca569052c..a800cee88035 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2296,7 +2296,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 		return;
 
 	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
-			   sk_gfp_atomic(sk, GFP_ATOMIC)))
+			   sk_gfp_mask(sk, GFP_ATOMIC)))
 		tcp_check_probe_timer(sk);
 }
 
@@ -3352,8 +3352,9 @@ void tcp_send_ack(struct sock *sk)
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
 	 */
-	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
-	if (!buff) {
+	buff = alloc_skb(MAX_TCP_HEADER,
+			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+	if (unlikely(!buff)) {
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3375,7 +3376,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	skb_mstamp_get(&buff->skb_mstamp);
-	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+	tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
 }
 EXPORT_SYMBOL_GPL(tcp_send_ack);
 
@@ -3396,7 +3397,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 	struct sk_buff *skb;
 
 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
-	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+	skb = alloc_skb(MAX_TCP_HEADER,
+			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
 	if (!skb)
 		return -1;
 
@@ -3409,7 +3411,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
 	skb_mstamp_get(&skb->skb_mstamp);
 	NET_INC_STATS(sock_net(sk), mib);
-	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
 }
 
 void tcp_send_window_probe(struct sock *sk)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c5429a636f1a..41bcd59a2ac7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1130,7 +1130,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		 */
 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
 			       AF_INET6, key->key, key->keylen,
-			       sk_gfp_atomic(sk, GFP_ATOMIC));
+			       sk_gfp_mask(sk, GFP_ATOMIC));
 	}
 #endif
 
@@ -1146,7 +1146,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		/* Clone pktoptions received with SYN, if we own the req */
 		if (ireq->pktopts) {
 			newnp->pktoptions = skb_clone(ireq->pktopts,
-						      sk_gfp_atomic(sk, GFP_ATOMIC));
+						      sk_gfp_mask(sk, GFP_ATOMIC));
 			consume_skb(ireq->pktopts);
 			ireq->pktopts = NULL;
 			if (newnp->pktoptions)
@@ -1212,7 +1212,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 					       --ANK (980728)
 	 */
 	if (np->rxopt.all)
-		opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
+		opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 		struct dst_entry *dst = sk->sk_rx_dst;
-- 
cgit v1.2.3


From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Dec 2015 21:53:57 -0800
Subject: ipv6: kill sk_dst_lock

While testing the np->opt RCU conversion, I found that UDP/IPv6 was
using a mixture of xchg() and sk_dst_lock to protect concurrent changes
to sk->sk_dst_cache, leading to possible corruptions and crashes.

ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest
way to fix the mess is to remove sk_dst_lock completely, as we did for
IPv4.

__ip6_dst_store() and ip6_dst_store() share same implementation.

sk_setup_caps() being called with socket lock being held or not,
we have to use sk_dst_set() instead of __sk_dst_set()

Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);"
in ip6_dst_store() before the sk_setup_caps(sk, dst) call.

This is because ip6_dst_store() can be called from process context,
without any lock held.

As soon as the dst is installed in sk->sk_dst_cache, dst can be freed
from another cpu doing a concurrent ip6_dst_store()

Doing the dst dereference before doing the install is needed to make
sure no use after free would trigger.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h          | 17 ++++-------------
 include/net/sock.h               |  3 +--
 net/core/sock.c                  |  4 +---
 net/dccp/ipv6.c                  |  4 ++--
 net/ipv6/af_inet6.c              |  2 +-
 net/ipv6/icmp.c                  | 14 --------------
 net/ipv6/inet6_connection_sock.c | 10 +---------
 net/ipv6/tcp_ipv6.c              |  4 ++--
 8 files changed, 12 insertions(+), 46 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2bfb2ad2fab1..877f682989b8 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -133,27 +133,18 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 /*
  *	Store a destination cache entry in a socket
  */
-static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				   const struct in6_addr *daddr,
-				   const struct in6_addr *saddr)
+static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct rt6_info *rt = (struct rt6_info *) dst;
 
+	np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst);
 	sk_setup_caps(sk, dst);
 	np->daddr_cache = daddr;
 #ifdef CONFIG_IPV6_SUBTREES
 	np->saddr_cache = saddr;
 #endif
-	np->dst_cookie = rt6_get_cookie(rt);
-}
-
-static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				 struct in6_addr *daddr, struct in6_addr *saddr)
-{
-	spin_lock(&sk->sk_dst_lock);
-	__ip6_dst_store(sk, dst, daddr, saddr);
-	spin_unlock(&sk->sk_dst_lock);
 }
 
 static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
diff --git a/include/net/sock.h b/include/net/sock.h
index 0434138c5f95..52d27ee924f4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -254,7 +254,6 @@ struct cg_proto;
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early demux
   *	@sk_dst_cache: destination cache
-  *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
@@ -393,7 +392,7 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	spinlock_t		sk_dst_lock;
+	/* Note: 32bit hole on 64bit arches */
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
diff --git a/net/core/sock.c b/net/core/sock.c
index 9d79569935a3..e31dfcee1729 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1530,7 +1530,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
 
-		spin_lock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1607,7 +1606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
 	u32 max_segs = 1;
 
-	__sk_dst_set(sk, dst);
+	sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -2388,7 +2387,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	} else
 		sk->sk_wq	=	NULL;
 
-	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e7e0b9bc2a43..9c6d0508e63a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -459,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 	 * comment in that function for the gory details. -acme
 	 */
 
-	__ip6_dst_store(newsk, dst, NULL, NULL);
+	ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
 	newdp6 = (struct dccp6_sock *)newsk;
@@ -883,7 +883,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	np->saddr = *saddr;
 	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
-	__ip6_dst_store(sk, dst, NULL, NULL);
+	ip6_dst_store(sk, dst, NULL, NULL);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (opt)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 38d66ddfb937..8ec0df75f1c4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -673,7 +673,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 			return PTR_ERR(dst);
 		}
 
-		__ip6_dst_store(sk, dst, NULL, NULL);
+		ip6_dst_store(sk, dst, NULL, NULL);
 	}
 
 	return 0;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 36c5a98b0472..0a37ddc7af51 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
-/*
- * Special lock-class for __icmpv6_sk:
- */
-static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
-
 static int __net_init icmpv6_sk_init(struct net *net)
 {
 	struct sock *sk;
@@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net)
 
 		net->ipv6.icmp_sk[i] = sk;
 
-		/*
-		 * Split off their lock-class, because sk->sk_dst_lock
-		 * gets used from softirqs, which is safe for
-		 * __icmpv6_sk (because those never get directly used
-		 * via userspace syscalls), but unsafe for normal sockets.
-		 */
-		lockdep_set_class(&sk->sk_dst_lock,
-				  &icmpv6_socket_sk_dst_lock_key);
-
 		/* Enough space for 2 64K ICMP packets, including
 		 * sk_buff struct overhead.
 		 */
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3ff5208772bb..a7ca2cde2ecb 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -110,14 +110,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 }
 EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
 
-static inline
-void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
-			   const struct in6_addr *daddr,
-			   const struct in6_addr *saddr)
-{
-	__ip6_dst_store(sk, dst, daddr, saddr);
-}
-
 static inline
 struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 {
@@ -153,7 +145,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
 		dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 
 		if (!IS_ERR(dst))
-			__inet6_csk_dst_store(sk, dst, NULL, NULL);
+			ip6_dst_store(sk, dst, NULL, NULL);
 	}
 	return dst;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6a50bb4a0dae..e7aab561b7b4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -257,7 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 	sk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(sk, dst, NULL, NULL);
+	ip6_dst_store(sk, dst, NULL, NULL);
 
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
@@ -1060,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	 */
 
 	newsk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(newsk, dst, NULL, NULL);
+	ip6_dst_store(newsk, dst, NULL, NULL);
 	inet6_sk_rx_dst_set(newsk, skb);
 
 	newtcp6sk = (struct tcp6_sock *)newsk;
-- 
cgit v1.2.3


From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 4 Dec 2015 15:14:04 -0200
Subject: sctp: update the netstamp_needed counter when copying sockets

Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy
related to disabling sock timestamp.

When SCTP accepts an association or peel one off, it copies sock flags
but forgot to call net_enable_timestamp() if a packet timestamping flag
was copied, leading to extra calls to net_disable_timestamp() whenever
such clones were closed.

The fix is to call net_enable_timestamp() whenever we copy a sock with
that flag on, like tcp does.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 ++
 net/core/sock.c    | 2 --
 net/sctp/socket.c  | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 52d27ee924f4..b1d475b5db68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -740,6 +740,8 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 };
 
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
 {
 	nsk->sk_flags = osk->sk_flags;
diff --git a/net/core/sock.c b/net/core/sock.c
index e31dfcee1729..d01c8f42dbb2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk)
 	}
 }
 
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-
 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
 	if (sk->sk_flags & flags) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 03c8256063ec..4c9282bdd067 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7199,6 +7199,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_ttl = 1;
 	newinet->mc_index = 0;
 	newinet->mc_list = NULL;
+
+	if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+		net_enable_timestamp();
 }
 
 static inline void sctp_copy_descendant(struct sock *sk_to,
-- 
cgit v1.2.3


From 297dbde19cf6a0ccb6fd4396c6220a5912ed61e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 7 Dec 2015 17:38:51 -0500
Subject: netprio_cgroup: limit the maximum css->id to USHRT_MAX

netprio builds per-netdev contiguous priomap array which is indexed by
css->id.  The array is allocated using kzalloc() effectively limiting
the maximum ID supported to some thousand range.  This patch caps the
maximum supported css->id to USHRT_MAX which should be way above what
is actually useable.

This allows reducing sock->sk_cgrp_prioidx to u16 from u32.  The freed
up part will be used to overload the cgroup related fields.
sock->sk_cgrp_prioidx's position is swapped with sk_mark so that the
two cgroup related fields are adjacent.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h        | 10 +++++-----
 net/core/netprio_cgroup.c |  9 +++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 6f58b84fc742..a95bcf7d6efa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -288,7 +288,6 @@ struct cg_proto;
   *	@sk_ack_backlog: current listen backlog
   *	@sk_max_ack_backlog: listen backlog set in listen()
   *	@sk_priority: %SO_PRIORITY setting
-  *	@sk_cgrp_prioidx: socket group's priority map index
   *	@sk_type: socket type (%SOCK_STREAM, etc)
   *	@sk_protocol: which protocol this socket belongs in this network family
   *	@sk_peer_pid: &struct pid for this socket's peer
@@ -309,6 +308,7 @@ struct cg_proto;
   *	@sk_send_head: front of stuff to transmit
   *	@sk_security: used by security modules
   *	@sk_mark: generic packet mark
+  *	@sk_cgrp_prioidx: socket group's priority map index
   *	@sk_classid: this socket's cgroup classid
   *	@sk_cgrp: this socket's cgroup-specific proto data
   *	@sk_write_pending: a write to stream socket waits to start
@@ -425,9 +425,7 @@ struct sock {
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
 	__u32			sk_priority;
-#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
-	__u32			sk_cgrp_prioidx;
-#endif
+	__u32			sk_mark;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
@@ -445,7 +443,9 @@ struct sock {
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
-	__u32			sk_mark;
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+	u16			sk_cgrp_prioidx;
+#endif
 #ifdef CONFIG_CGROUP_NET_CLASSID
 	u32			sk_classid;
 #endif
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index cbd0a199bf52..2b9159b7a28a 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -27,6 +27,12 @@
 
 #include <linux/fdtable.h>
 
+/*
+ * netprio allocates per-net_device priomap array which is indexed by
+ * css->id.  Limiting css ID to 16bits doesn't lose anything.
+ */
+#define NETPRIO_ID_MAX		USHRT_MAX
+
 #define PRIOMAP_MIN_SZ		128
 
 /*
@@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
 	struct net_device *dev;
 	int ret = 0;
 
+	if (css->id > NETPRIO_ID_MAX)
+		return -ENOSPC;
+
 	if (!parent_css)
 		return 0;
 
-- 
cgit v1.2.3


From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 7 Dec 2015 17:38:52 -0500
Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct

Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data.
->sk_cgroup_prioidx and ->sk_classid are moved into it.  The struct
and its accessors are defined in cgroup-defs.h.  This is to prepare
for overloading the fields with a cgroup pointer.

This patch mostly performs equivalent conversions but the followings
are noteworthy.

* Equality test before updating classid is removed from
  sock_update_classid().  This shouldn't make any noticeable
  difference and a similar test will be implemented on the helper side
  later.

* sock_update_netprioidx() now takes struct sock_cgroup_data and can
  be moved to netprio_cgroup.h without causing include dependency
  loop.  Moved.

* The dummy version of sock_update_netprioidx() converted to a static
  inline function while at it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cgroup-defs.h  | 36 ++++++++++++++++++++++++++++++++++++
 include/net/cls_cgroup.h     | 11 +++++------
 include/net/netprio_cgroup.h | 16 +++++++++++++---
 include/net/sock.h           | 11 +++--------
 net/Kconfig                  |  6 ++++++
 net/core/dev.c               |  3 ++-
 net/core/netclassid_cgroup.c |  4 ++--
 net/core/netprio_cgroup.c    |  3 ++-
 net/core/scm.c               |  4 ++--
 net/core/sock.c              | 15 ++-------------
 net/netfilter/nft_meta.c     |  2 +-
 net/netfilter/xt_cgroup.c    |  3 ++-
 12 files changed, 76 insertions(+), 38 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 504d8591b6d3..ed128fed0335 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
 
 #endif	/* CONFIG_CGROUPS */
 
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+struct sock_cgroup_data {
+	u16	prioidx;
+	u32	classid;
+};
+
+static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
+{
+	return skcd->prioidx;
+}
+
+static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
+{
+	return skcd->classid;
+}
+
+static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
+					   u16 prioidx)
+{
+	skcd->prioidx = prioidx;
+}
+
+static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
+					   u32 classid)
+{
+	skcd->classid = classid;
+}
+
+#else	/* CONFIG_SOCK_CGROUP_DATA */
+
+struct sock_cgroup_data {
+};
+
+#endif	/* CONFIG_SOCK_CGROUP_DATA */
+
 #endif	/* _LINUX_CGROUP_DEFS_H */
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index ccd6d8bffa4d..c0a92e2c286d 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -41,13 +41,12 @@ static inline u32 task_cls_classid(struct task_struct *p)
 	return classid;
 }
 
-static inline void sock_update_classid(struct sock *sk)
+static inline void sock_update_classid(struct sock_cgroup_data *skcd)
 {
 	u32 classid;
 
 	classid = task_cls_classid(current);
-	if (classid != sk->sk_classid)
-		sk->sk_classid = classid;
+	sock_cgroup_set_classid(skcd, classid);
 }
 
 static inline u32 task_get_classid(const struct sk_buff *skb)
@@ -64,17 +63,17 @@ static inline u32 task_get_classid(const struct sk_buff *skb)
 	 * softirqs always disables bh.
 	 */
 	if (in_serving_softirq()) {
-		/* If there is an sk_classid we'll use that. */
+		/* If there is an sock_cgroup_classid we'll use that. */
 		if (!skb->sk)
 			return 0;
 
-		classid = skb->sk->sk_classid;
+		classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data);
 	}
 
 	return classid;
 }
 #else /* !CONFIG_CGROUP_NET_CLASSID */
-static inline void sock_update_classid(struct sock *sk)
+static inline void sock_update_classid(struct sock_cgroup_data *skcd)
 {
 }
 
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index f2a9597ff53c..604190596cde 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -25,8 +25,6 @@ struct netprio_map {
 	u32 priomap[];
 };
 
-void sock_update_netprioidx(struct sock *sk);
-
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
@@ -38,13 +36,25 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	rcu_read_unlock();
 	return idx;
 }
+
+static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
+{
+	if (in_interrupt())
+		return;
+
+	sock_cgroup_set_prioidx(skcd, task_netprioidx(current));
+}
+
 #else /* !CONFIG_CGROUP_NET_PRIO */
+
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	return 0;
 }
 
-#define sock_update_netprioidx(sk)
+static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
+{
+}
 
 #endif /* CONFIG_CGROUP_NET_PRIO */
 #endif  /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index a95bcf7d6efa..0ca22b014de1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -59,6 +59,7 @@
 #include <linux/static_key.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/cgroup-defs.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -308,8 +309,7 @@ struct cg_proto;
   *	@sk_send_head: front of stuff to transmit
   *	@sk_security: used by security modules
   *	@sk_mark: generic packet mark
-  *	@sk_cgrp_prioidx: socket group's priority map index
-  *	@sk_classid: this socket's cgroup classid
+  *	@sk_cgrp_data: cgroup data for this cgroup
   *	@sk_cgrp: this socket's cgroup-specific proto data
   *	@sk_write_pending: a write to stream socket waits to start
   *	@sk_state_change: callback to indicate change in the state of the sock
@@ -443,12 +443,7 @@ struct sock {
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
-#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
-	u16			sk_cgrp_prioidx;
-#endif
-#ifdef CONFIG_CGROUP_NET_CLASSID
-	u32			sk_classid;
-#endif
+	struct sock_cgroup_data	sk_cgrp_data;
 	struct cg_proto		*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
diff --git a/net/Kconfig b/net/Kconfig
index 127da94ae25e..11f8c22af34d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -250,9 +250,14 @@ config XPS
 	depends on SMP
 	default y
 
+config SOCK_CGROUP_DATA
+	bool
+	default n
+
 config CGROUP_NET_PRIO
 	bool "Network priority cgroup"
 	depends on CGROUPS
+	select SOCK_CGROUP_DATA
 	---help---
 	  Cgroup subsystem for use in assigning processes to network priorities on
 	  a per-interface basis.
@@ -260,6 +265,7 @@ config CGROUP_NET_PRIO
 config CGROUP_NET_CLASSID
 	bool "Network classid cgroup"
 	depends on CGROUPS
+	select SOCK_CGROUP_DATA
 	---help---
 	  Cgroup subsystem for use as general purpose socket classid marker that is
 	  being used in cls_cgroup and for netfilter matching.
diff --git a/net/core/dev.c b/net/core/dev.c
index e5c395473eba..8f705fcedb94 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2929,7 +2929,8 @@ static void skb_update_prio(struct sk_buff *skb)
 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 	if (!skb->priority && skb->sk && map) {
-		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+		unsigned int prioidx =
+			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
 
 		if (prioidx < map->priomap_len)
 			skb->priority = map->priomap[prioidx];
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 2e4df84c34a1..e60ded46b3ac 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -62,8 +62,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
 	struct socket *sock = sock_from_file(file, &err);
 
 	if (sock)
-		sock->sk->sk_classid = (u32)(unsigned long)v;
-
+		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
+					(unsigned long)v);
 	return 0;
 }
 
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2b9159b7a28a..de42aa7f6c77 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -223,7 +223,8 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
 	int err;
 	struct socket *sock = sock_from_file(file, &err);
 	if (sock)
-		sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+					(unsigned long)v);
 	return 0;
 }
 
diff --git a/net/core/scm.c b/net/core/scm.c
index 8a1741b14302..14596fb37172 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 		/* Bump the usage count and install the file. */
 		sock = sock_from_file(fp[i], &err);
 		if (sock) {
-			sock_update_netprioidx(sock->sk);
-			sock_update_classid(sock->sk);
+			sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+			sock_update_classid(&sock->sk->sk_cgrp_data);
 		}
 		fd_install(new_fd, get_file(fp[i]));
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 7965ef487375..947741dc43fa 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1393,17 +1393,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 	module_put(owner);
 }
 
-#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
-void sock_update_netprioidx(struct sock *sk)
-{
-	if (in_interrupt())
-		return;
-
-	sk->sk_cgrp_prioidx = task_netprioidx(current);
-}
-EXPORT_SYMBOL_GPL(sock_update_netprioidx);
-#endif
-
 /**
  *	sk_alloc - All socket objects are allocated here
  *	@net: the applicable net namespace
@@ -1432,8 +1421,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_net_set(sk, net);
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
-		sock_update_classid(sk);
-		sock_update_netprioidx(sk);
+		sock_update_classid(&sk->sk_cgrp_data);
+		sock_update_netprioidx(&sk->sk_cgrp_data);
 	}
 
 	return sk;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 9dfaf4d55ee0..1915cab7f32d 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -174,7 +174,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		sk = skb_to_full_sk(skb);
 		if (!sk || !sk_fullsock(sk))
 			goto err;
-		*dest = sk->sk_classid;
+		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
 	default:
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a1d126f29463..54eaeb45ce99 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -42,7 +42,8 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (skb->sk == NULL || !sk_fullsock(skb->sk))
 		return false;
 
-	return (info->id == skb->sk->sk_classid) ^ info->invert;
+	return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
+		info->invert;
 }
 
 static struct xt_match cgroup_mt_reg __read_mostly = {
-- 
cgit v1.2.3


From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Dec 2015 07:22:02 -0800
Subject: xfrm: add rcu protection to sk->sk_policy[]

XFRM can deal with SYNACK messages, sent while listener socket
is not locked. We add proper rcu protection to __xfrm_sk_clone_policy()
and xfrm_sk_policy_lookup()

This might serve as the first step to remove xfrm.xfrm_policy_lock
use in fast path.

Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     |  2 +-
 include/net/xfrm.h     | 24 +++++++++++++++---------
 net/core/sock.c        |  2 +-
 net/xfrm/xfrm_policy.c | 37 +++++++++++++++++++++++++------------
 4 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index b1d475b5db68..eaef41433d7a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -388,7 +388,7 @@ struct sock {
 		struct socket_wq	*sk_wq_raw;
 	};
 #ifdef CONFIG_XFRM
-	struct xfrm_policy	*sk_policy[2];
+	struct xfrm_policy __rcu *sk_policy[2];
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8bae1ef647cd..d6f6e5006ee9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1142,12 +1142,14 @@ static inline int xfrm6_route_forward(struct sk_buff *skb)
 	return xfrm_route_forward(skb, AF_INET6);
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk);
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);
 
-static inline int xfrm_sk_clone_policy(struct sock *sk)
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	if (unlikely(sk->sk_policy[0] || sk->sk_policy[1]))
-		return __xfrm_sk_clone_policy(sk);
+	sk->sk_policy[0] = NULL;
+	sk->sk_policy[1] = NULL;
+	if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
+		return __xfrm_sk_clone_policy(sk, osk);
 	return 0;
 }
 
@@ -1155,12 +1157,16 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir);
 
 static inline void xfrm_sk_free_policy(struct sock *sk)
 {
-	if (unlikely(sk->sk_policy[0] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX);
+	struct xfrm_policy *pol;
+
+	pol = rcu_dereference_protected(sk->sk_policy[0], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX);
 		sk->sk_policy[0] = NULL;
 	}
-	if (unlikely(sk->sk_policy[1] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1);
+	pol = rcu_dereference_protected(sk->sk_policy[1], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX+1);
 		sk->sk_policy[1] = NULL;
 	}
 }
@@ -1170,7 +1176,7 @@ void xfrm_garbage_collect(struct net *net);
 #else
 
 static inline void xfrm_sk_free_policy(struct sock *sk) {}
-static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; }
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
 static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }  
 static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } 
 static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
diff --git a/net/core/sock.c b/net/core/sock.c
index d01c8f42dbb2..765be835b06c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			 */
 			is_charged = sk_filter_charge(newsk, filter);
 
-		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
 			/* It is still raw copy of parent, so invalidate
 			 * destructor and make plain sk_free() */
 			newsk->sk_destruct = NULL;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f57a5712cedd..948fa5560de5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1221,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	struct xfrm_policy *pol;
 	struct net *net = sock_net(sk);
 
+	rcu_read_lock();
 	read_lock_bh(&net->xfrm.xfrm_policy_lock);
-	if ((pol = sk->sk_policy[dir]) != NULL) {
+	pol = rcu_dereference(sk->sk_policy[dir]);
+	if (pol != NULL) {
 		bool match = xfrm_selector_match(&pol->selector, fl,
 						 sk->sk_family);
 		int err = 0;
@@ -1246,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	}
 out:
 	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	rcu_read_unlock();
 	return pol;
 }
 
@@ -1314,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 #endif
 
 	write_lock_bh(&net->xfrm.xfrm_policy_lock);
-	old_pol = sk->sk_policy[dir];
-	sk->sk_policy[dir] = pol;
+	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
 		pol->curlft.add_time = get_seconds();
 		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
 		xfrm_sk_policy_link(pol, dir);
 	}
+	rcu_assign_pointer(sk->sk_policy[dir], pol);
 	if (old_pol) {
 		if (pol)
 			xfrm_policy_requeue(old_pol, pol);
@@ -1368,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 	return newp;
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	struct xfrm_policy *p0 = sk->sk_policy[0],
-			   *p1 = sk->sk_policy[1];
+	const struct xfrm_policy *p;
+	struct xfrm_policy *np;
+	int i, ret = 0;
 
-	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
-	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
-		return -ENOMEM;
-	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
-		return -ENOMEM;
-	return 0;
+	rcu_read_lock();
+	for (i = 0; i < 2; i++) {
+		p = rcu_dereference(osk->sk_policy[i]);
+		if (p) {
+			np = clone_policy(p, i);
+			if (unlikely(!np)) {
+				ret = -ENOMEM;
+				break;
+			}
+			rcu_assign_pointer(sk->sk_policy[i], np);
+		}
+	}
+	rcu_read_unlock();
+	return ret;
 }
 
 static int
-- 
cgit v1.2.3


From 79462ad02e861803b3840cc782248c7359451cd9 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 14 Dec 2015 22:03:39 +0100
Subject: net: add validation for the socket syscall protocol argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

郭永刚 reported that one could simply crash the kernel as root by
using a simple program:

	int socket_fd;
	struct sockaddr_in addr;
	addr.sin_port = 0;
	addr.sin_addr.s_addr = INADDR_ANY;
	addr.sin_family = 10;

	socket_fd = socket(10,3,0x40000000);
	connect(socket_fd , &addr,16);

AF_INET, AF_INET6 sockets actually only support 8-bit protocol
identifiers. inet_sock's skc_protocol field thus is sized accordingly,
thus larger protocol identifiers simply cut off the higher bits and
store a zero in the protocol fields.

This could lead to e.g. NULL function pointer because as a result of
the cut off inet_num is zero and we call down to inet_autobind, which
is NULL for raw sockets.

kernel: Call Trace:
kernel:  [<ffffffff816db90e>] ? inet_autobind+0x2e/0x70
kernel:  [<ffffffff816db9a4>] inet_dgram_connect+0x54/0x80
kernel:  [<ffffffff81645069>] SYSC_connect+0xd9/0x110
kernel:  [<ffffffff810ac51b>] ? ptrace_notify+0x5b/0x80
kernel:  [<ffffffff810236d8>] ? syscall_trace_enter_phase2+0x108/0x200
kernel:  [<ffffffff81645e0e>] SyS_connect+0xe/0x10
kernel:  [<ffffffff81779515>] tracesys_phase2+0x84/0x89

I found no particular commit which introduced this problem.

CVE: CVE-2015-8543
Cc: Cong Wang <cwang@twopensource.com>
Reported-by: 郭永刚 <guoyonggang@360.cn>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     | 1 +
 net/ax25/af_ax25.c     | 3 +++
 net/decnet/af_decnet.c | 3 +++
 net/ipv4/af_inet.c     | 3 +++
 net/ipv6/af_inet6.c    | 3 +++
 net/irda/af_irda.c     | 3 +++
 6 files changed, 16 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index eaef41433d7a..c4205e0a3a2d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -403,6 +403,7 @@ struct sock {
 				sk_no_check_rx : 1,
 				sk_userlocks : 4,
 				sk_protocol  : 8,
+#define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ae3a47f9d1d5..fbd0acf80b13 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
 	struct sock *sk;
 	ax25_cb *ax25;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (!net_eq(net, &init_net))
 		return -EAFNOSUPPORT;
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index eebf5ac8ce18..13d6b1a6e0fc 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (!net_eq(net, &init_net))
 		return -EAFNOSUPPORT;
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 11c4ca13ec3b..5c5db6636704 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8ec0df75f1c4..9f5137cd604e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
 	/* Look for the requested type/protocol pair. */
 lookup_protocol:
 	err = -ESOCKTNOSUPPORT;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index e6aa48b5395c..923abd6b3064 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
 	struct sock *sk;
 	struct irda_sock *self;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (net != &init_net)
 		return -EAFNOSUPPORT;
 
-- 
cgit v1.2.3


From 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Dec 2015 14:08:53 -0800
Subject: net: fix IP early demux races

David Wilder reported crashes caused by dst reuse.

<quote David>
  I am seeing a crash on a distro V4.2.3 kernel caused by a double
  release of a dst_entry.  In ipv4_dst_destroy() the call to
  list_empty() finds a poisoned next pointer, indicating the dst_entry
  has already been removed from the list and freed. The crash occurs
  18 to 24 hours into a run of a network stress exerciser.
</quote>

Thanks to his detailed report and analysis, we were able to understand
the core issue.

IP early demux can associate a dst to skb, after a lookup in TCP/UDP
sockets.

When socket cache is not properly set, we want to store into
sk->sk_dst_cache the dst for future IP early demux lookups,
by acquiring a stable refcount on the dst.

Problem is this acquisition is simply using an atomic_inc(),
which works well, unless the dst was queued for destruction from
dst_release() noticing dst refcount went to zero, if DST_NOCACHE
was set on dst.

We need to make sure current refcount is not zero before incrementing
it, or risk double free as David reported.

This patch, being a stable candidate, adds two new helpers, and use
them only from IP early demux problematic paths.

It might be possible to merge in net-next skb_dst_force() and
skb_dst_force_safe(), but I prefer having the smallest patch for stable
kernels : Maybe some skb_dst_force() callers do not expect skb->dst
can suddenly be cleared.

Can probably be backported back to linux-3.6 kernels

Reported-by: David J. Wilder <dwilder@us.ibm.com>
Tested-by: David J. Wilder <dwilder@us.ibm.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h   | 33 +++++++++++++++++++++++++++++++++
 include/net/sock.h  |  2 +-
 net/ipv4/tcp_ipv4.c |  5 ++---
 net/ipv6/tcp_ipv6.c |  3 +--
 4 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/dst.h b/include/net/dst.h
index 1279f9b09791..c7329dcd90cc 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -322,6 +322,39 @@ static inline void skb_dst_force(struct sk_buff *skb)
 	}
 }
 
+/**
+ * dst_hold_safe - Take a reference on a dst if possible
+ * @dst: pointer to dst entry
+ *
+ * This helper returns false if it could not safely
+ * take a reference on a dst.
+ */
+static inline bool dst_hold_safe(struct dst_entry *dst)
+{
+	if (dst->flags & DST_NOCACHE)
+		return atomic_inc_not_zero(&dst->__refcnt);
+	dst_hold(dst);
+	return true;
+}
+
+/**
+ * skb_dst_force_safe - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ */
+static inline void skb_dst_force_safe(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		struct dst_entry *dst = skb_dst(skb);
+
+		if (!dst_hold_safe(dst))
+			dst = NULL;
+
+		skb->_skb_refdst = (unsigned long)dst;
+	}
+}
+
 
 /**
  *	__skb_tunnel_rx - prepare skb for rx reinsert
diff --git a/include/net/sock.h b/include/net/sock.h
index c4205e0a3a2d..28790fe18206 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -817,7 +817,7 @@ void sk_stream_write_space(struct sock *sk);
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	/* dont let skb dst not refcounted, we are going to leave rcu lock */
-	skb_dst_force(skb);
+	skb_dst_force_safe(skb);
 
 	if (!sk->sk_backlog.tail)
 		sk->sk_backlog.head = skb;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index db003438aaf5..d8841a2f1569 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1493,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 	if (likely(sk->sk_rx_dst))
 		skb_dst_drop(skb);
 	else
-		skb_dst_force(skb);
+		skb_dst_force_safe(skb);
 
 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
 	tp->ucopy.memory += skb->truesize;
@@ -1721,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 
-	if (dst) {
-		dst_hold(dst);
+	if (dst && dst_hold_safe(dst)) {
 		sk->sk_rx_dst = dst;
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e7aab561b7b4..6b8a8a9091fa 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 
-	if (dst) {
+	if (dst && dst_hold_safe(dst)) {
 		const struct rt6_info *rt = (const struct rt6_info *)dst;
 
-		dst_hold(dst);
 		sk->sk_rx_dst = dst;
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 		inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
-- 
cgit v1.2.3


From 9a49850d0af7b9fd14d091dfe61ef6cb369f86b9 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 14 Dec 2015 11:19:45 -0800
Subject: tcp: Fix conditions to determine checksum offload

In tcp_send_sendpage and tcp_sendmsg we check the route capabilities to
determine if checksum offload can be performed. This check currently
does not take the IP protocol into account for devices that advertise
only one of NETIF_F_IPV6_CSUM or NETIF_F_IP_CSUM. This patch adds a
function to check capabilities for checksum offload with a socket
called sk_check_csum_caps. This function checks for specific IPv4 or
IPv6 offload support based on the family of the socket.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 9 +++++++++
 net/ipv4/tcp.c     | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 0ca22b014de1..ab0269f4b2cc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1791,6 +1791,15 @@ static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
 	sk->sk_route_caps &= ~flags;
 }
 
+static inline bool sk_check_csum_caps(struct sock *sk)
+{
+	return (sk->sk_route_caps & NETIF_F_HW_CSUM) ||
+	       (sk->sk_family == PF_INET &&
+		(sk->sk_route_caps & NETIF_F_IP_CSUM)) ||
+	       (sk->sk_family == PF_INET6 &&
+		(sk->sk_route_caps & NETIF_F_IPV6_CSUM));
+}
+
 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
 					   struct iov_iter *from, char *to,
 					   int copy, int offset)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cf7ef7be79f0..92b3e61b847d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1018,7 +1018,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 	ssize_t res;
 
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
-	    !(sk->sk_route_caps & NETIF_F_CSUM_MASK))
+	    !sk_check_csum_caps(sk))
 		return sock_no_sendpage(sk->sk_socket, page, offset, size,
 					flags);
 
@@ -1175,7 +1175,7 @@ new_segment:
 			/*
 			 * Check whether we can use HW checksum.
 			 */
-			if (sk->sk_route_caps & NETIF_F_CSUM_MASK)
+			if (sk_check_csum_caps(sk))
 				skb->ip_summed = CHECKSUM_PARTIAL;
 
 			skb_entail(sk, skb);
-- 
cgit v1.2.3


From 64be0aed59ad519d6f2160868734f7e278290ac1 Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Wed, 16 Dec 2015 12:30:03 +0900
Subject: net: diag: Add the ability to destroy a socket.

This patch adds a SOCK_DESTROY operation, a destroy function
pointer to sock_diag_handler, and a diag_destroy function
pointer.  It does not include any implementation code.

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sock_diag.h      |  2 ++
 include/net/sock.h             |  1 +
 include/uapi/linux/sock_diag.h |  1 +
 net/core/sock_diag.c           | 23 ++++++++++++++++++++---
 4 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index fddebc617469..4018b48f2b3b 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -15,6 +15,7 @@ struct sock_diag_handler {
 	__u8 family;
 	int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
 	int (*get_info)(struct sk_buff *skb, struct sock *sk);
+	int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh);
 };
 
 int sock_diag_register(const struct sock_diag_handler *h);
@@ -68,4 +69,5 @@ bool sock_diag_has_destroy_listeners(const struct sock *sk)
 }
 void sock_diag_broadcast_destroy(struct sock *sk);
 
+int sock_diag_destroy(struct sock *sk, int err);
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index ab0269f4b2cc..6e6e8a25d997 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1060,6 +1060,7 @@ struct proto {
 	void			(*destroy_cgroup)(struct mem_cgroup *memcg);
 	struct cg_proto		*(*proto_cgroup)(struct mem_cgroup *memcg);
 #endif
+	int			(*diag_destroy)(struct sock *sk, int err);
 };
 
 int proto_register(struct proto *prot, int alloc_slab);
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index 49230d36f9ce..bae2d80034d4 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 #define SOCK_DIAG_BY_FAMILY 20
+#define SOCK_DESTROY 21
 
 struct sock_diag_req {
 	__u8	sdiag_family;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 0c1d58d43f67..a996ce8c8fb2 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
 }
 EXPORT_SYMBOL_GPL(sock_diag_unregister);
 
-static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	int err;
 	struct sock_diag_req *req = nlmsg_data(nlh);
@@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	hndl = sock_diag_handlers[req->sdiag_family];
 	if (hndl == NULL)
 		err = -ENOENT;
-	else
+	else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
 		err = hndl->dump(skb, nlh);
+	else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
+		err = hndl->destroy(skb, nlh);
+	else
+		err = -EOPNOTSUPP;
 	mutex_unlock(&sock_diag_table_mutex);
 
 	return err;
@@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		return ret;
 	case SOCK_DIAG_BY_FAMILY:
-		return __sock_diag_rcv_msg(skb, nlh);
+	case SOCK_DESTROY:
+		return __sock_diag_cmd(skb, nlh);
 	default:
 		return -EINVAL;
 	}
@@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group)
 	return 0;
 }
 
+int sock_diag_destroy(struct sock *sk, int err)
+{
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!sk->sk_prot->diag_destroy)
+		return -EOPNOTSUPP;
+
+	return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
 static int __net_init diag_net_init(struct net *net)
 {
 	struct netlink_kernel_cfg cfg = {
-- 
cgit v1.2.3


From 7bbadd2d1009575dad675afc16650ebb5aa10612 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 14 Dec 2015 23:30:43 +0100
Subject: net: fix warnings in 'make htmldocs' by moving macro definition out
 of field declaration

Docbook does not like the definition of macros inside a field declaration
and adds a warning. Move the definition out.

Fixes: 79462ad02e86180 ("net: add validation for the socket syscall protocol argument")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 28790fe18206..14d3c0734007 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -403,8 +403,8 @@ struct sock {
 				sk_no_check_rx : 1,
 				sk_userlocks : 4,
 				sk_protocol  : 8,
-#define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
+#define SK_PROTOCOL_MAX U8_MAX
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
-- 
cgit v1.2.3


From ef456144da8ef507c8cf504284b6042e9201a05c Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 4 Jan 2016 17:41:45 -0500
Subject: soreuseport: define reuseport groups

struct sock_reuseport is an optional shared structure referenced by each
socket belonging to a reuseport group.  When a socket is bound to an
address/port not yet in use and the reuseport flag has been set, the
structure will be allocated and attached to the newly bound socket.
When subsequent calls to bind are made for the same address/port, the
shared structure will be updated to include the new socket and the
newly bound socket will reference the group structure.

Usually, when an incoming packet was destined for a reuseport group,
all sockets in the same group needed to be considered before a
dispatching decision was made.  With this structure, an appropriate
socket can be found after looking up just one socket in the group.

This shared structure will also allow for more complicated decisions to
be made when selecting a socket (eg a BPF filter).

This work is based off a similar implementation written by
Ying Cai <ycai@google.com> for implementing policy-based reuseport
selection.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h           |   2 +
 include/net/sock_reuseport.h |  20 +++++
 net/core/Makefile            |   2 +-
 net/core/sock_reuseport.c    | 173 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 include/net/sock_reuseport.h
 create mode 100644 net/core/sock_reuseport.c

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 3794cdde837a..e830c1006935 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -318,6 +318,7 @@ struct cg_proto;
   *	@sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
   *	@sk_backlog_rcv: callback to process the backlog
   *	@sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
+  *	@sk_reuseport_cb: reuseport group container
  */
 struct sock {
 	/*
@@ -453,6 +454,7 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void                    (*sk_destruct)(struct sock *sk);
+	struct sock_reuseport __rcu	*sk_reuseport_cb;
 };
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
new file mode 100644
index 000000000000..67d1eb8fd7af
--- /dev/null
+++ b/include/net/sock_reuseport.h
@@ -0,0 +1,20 @@
+#ifndef _SOCK_REUSEPORT_H
+#define _SOCK_REUSEPORT_H
+
+#include <linux/types.h>
+#include <net/sock.h>
+
+struct sock_reuseport {
+	struct rcu_head		rcu;
+
+	u16			max_socks;	/* length of socks */
+	u16			num_socks;	/* elements in socks */
+	struct sock		*socks[0];	/* array of sock pointers */
+};
+
+extern int reuseport_alloc(struct sock *sk);
+extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2);
+extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash);
+
+#endif  /* _SOCK_REUSEPORT_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 086b01fbe1bd..0b835de04de3 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
 obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
-			sock_diag.o dev_ioctl.o tso.o
+			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
 
 obj-$(CONFIG_XFRM) += flow.o
 obj-y += net-sysfs.o
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000000..963c8d5f3027
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,173 @@
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port.  This allows a decision to be made after finding
+ * the first socket.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+static DEFINE_SPINLOCK(reuseport_lock);
+
+static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
+{
+	size_t size = sizeof(struct sock_reuseport) +
+		      sizeof(struct sock *) * max_socks;
+	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+	if (!reuse)
+		return NULL;
+
+	reuse->max_socks = max_socks;
+
+	return reuse;
+}
+
+int reuseport_alloc(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+
+	/* bh lock used since this function call may precede hlist lock in
+	 * soft irq of receive path or setsockopt from process context
+	 */
+	spin_lock_bh(&reuseport_lock);
+	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+					    lockdep_is_held(&reuseport_lock)),
+		  "multiple allocations for the same socket");
+	reuse = __reuseport_alloc(INIT_SOCKS);
+	if (!reuse) {
+		spin_unlock_bh(&reuseport_lock);
+		return -ENOMEM;
+	}
+
+	reuse->socks[0] = sk;
+	reuse->num_socks = 1;
+	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+	struct sock_reuseport *more_reuse;
+	u32 more_socks_size, i;
+
+	more_socks_size = reuse->max_socks * 2U;
+	if (more_socks_size > U16_MAX)
+		return NULL;
+
+	more_reuse = __reuseport_alloc(more_socks_size);
+	if (!more_reuse)
+		return NULL;
+
+	more_reuse->max_socks = more_socks_size;
+	more_reuse->num_socks = reuse->num_socks;
+
+	memcpy(more_reuse->socks, reuse->socks,
+	       reuse->num_socks * sizeof(struct sock *));
+
+	for (i = 0; i < reuse->num_socks; ++i)
+		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+				   more_reuse);
+
+	kfree_rcu(reuse, rcu);
+	return more_reuse;
+}
+
+/**
+ *  reuseport_add_sock - Add a socket to the reuseport group of another.
+ *  @sk:  New socket to add to the group.
+ *  @sk2: Socket belonging to the existing reuseport group.
+ *  May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
+{
+	struct sock_reuseport *reuse;
+
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock)),
+	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+					    lockdep_is_held(&reuseport_lock)),
+		  "socket already in reuseport group");
+
+	if (reuse->num_socks == reuse->max_socks) {
+		reuse = reuseport_grow(reuse);
+		if (!reuse) {
+			spin_unlock_bh(&reuseport_lock);
+			return -ENOMEM;
+		}
+	}
+
+	reuse->socks[reuse->num_socks] = sk;
+	/* paired with smp_rmb() in reuseport_select_sock() */
+	smp_wmb();
+	reuse->num_socks++;
+	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+void reuseport_detach_sock(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+	int i;
+
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+	for (i = 0; i < reuse->num_socks; i++) {
+		if (reuse->socks[i] == sk) {
+			reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+			reuse->num_socks--;
+			if (reuse->num_socks == 0)
+				kfree_rcu(reuse, rcu);
+			break;
+		}
+	}
+	spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+/**
+ *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ *  @sk: First socket in the group.
+ *  @hash: Use this hash to select.
+ *  Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
+{
+	struct sock_reuseport *reuse;
+	struct sock *sk2 = NULL;
+	u16 socks;
+
+	rcu_read_lock();
+	reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+	/* if memory allocation failed or add call is not yet complete */
+	if (!reuse)
+		goto out;
+
+	socks = READ_ONCE(reuse->num_socks);
+	if (likely(socks)) {
+		/* paired with smp_wmb() in reuseport_add_sock() */
+		smp_rmb();
+
+		sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+	}
+
+out:
+	rcu_read_unlock();
+	return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
-- 
cgit v1.2.3


From 8c2c2358b236530bc2c79b4c2a447cbdbc3d96d7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:20:59 -0800
Subject: net: tcp_memcontrol: properly detect ancestor socket pressure

When charging socket memory, the code currently checks only the local
page counter for excess to determine whether the memcg is under socket
pressure.  But even if the local counter is fine, one of the ancestors
could have breached its limit, which should also force this child to
enter socket pressure.  This currently doesn't happen.

Fix this by using page_counter_try_charge() first.  If that fails, it
means that either the local counter or one of the ancestors are in
excess of their limit, and the child should enter socket pressure.

Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/sock.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index e830c1006935..9ef3d7c984b4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1193,11 +1193,13 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 					      unsigned long amt,
 					      int *parent_status)
 {
-	page_counter_charge(&prot->memory_allocated, amt);
+	struct page_counter *counter;
+
+	if (page_counter_try_charge(&prot->memory_allocated, amt, &counter))
+		return;
 
-	if (page_counter_read(&prot->memory_allocated) >
-	    prot->memory_allocated.limit)
-		*parent_status = OVER_LIMIT;
+	page_counter_charge(&prot->memory_allocated, amt);
+	*parent_status = OVER_LIMIT;
 }
 
 static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
-- 
cgit v1.2.3


From 931f3f4beb031cd483c1c8ab159ef1f8bdbe8888 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:02 -0800
Subject: net: tcp_memcontrol: remove bogus hierarchy pressure propagation

When a cgroup currently breaches its socket memory limit, it enters
memory pressure mode for itself and its *ancestors*.  This throttles
transmission in unrelated sibling and cousin subtrees that have nothing
to do with the breached limit.

On the contrary, breaching a limit should make that group and its
*children* enter memory pressure mode.  But this happens already, albeit
lazily: if an ancestor limit is breached, siblings will enter memory
pressure on their own once the next packet arrives for them.

So no additional hierarchy code is needed.  Remove the bogus stuff.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/sock.h | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 9ef3d7c984b4..d3b035c7362b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1155,14 +1155,8 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
 	if (*memory_pressure)
 		*memory_pressure = 0;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-		struct cg_proto *cg_proto = sk->sk_cgrp;
-		struct proto *prot = sk->sk_prot;
-
-		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			cg_proto->memory_pressure = 0;
-	}
-
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		sk->sk_cgrp->memory_pressure = 0;
 }
 
 static inline void sk_enter_memory_pressure(struct sock *sk)
@@ -1170,13 +1164,8 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 	if (!sk->sk_prot->enter_memory_pressure)
 		return;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-		struct cg_proto *cg_proto = sk->sk_cgrp;
-		struct proto *prot = sk->sk_prot;
-
-		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			cg_proto->memory_pressure = 1;
-	}
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		sk->sk_cgrp->memory_pressure = 1;
 
 	sk->sk_prot->enter_memory_pressure(sk);
 }
-- 
cgit v1.2.3


From af95d7df4059cfeab7e7c244f3564214aada7dad Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:08 -0800
Subject: net: tcp_memcontrol: remove dead per-memcg count of allocated sockets

The number of allocated sockets is used for calculations in the soft
limit phase, where packets are accepted but the socket is under memory
pressure.
 Since there is no soft limit phase in tcp_memcontrol, and memory
pressure is only entered when packets are already dropped, this is
actually dead code.  Remove it.

As this is the last user of parent_cg_proto(), remove that too.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 -
 include/net/sock.h         | 39 +++------------------------------------
 net/ipv4/tcp_memcontrol.c  |  3 ---
 3 files changed, 3 insertions(+), 40 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85c437b0cbc0..15acc04ebdd3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -87,7 +87,6 @@ enum mem_cgroup_events_target {
 
 struct cg_proto {
 	struct page_counter	memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
 	int			memory_pressure;
 	bool			active;
 	long			sysctl_mem[3];
diff --git a/include/net/sock.h b/include/net/sock.h
index d3b035c7362b..1f15937ec208 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1098,19 +1098,9 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET)
 extern struct static_key memcg_socket_limit_enabled;
-static inline struct cg_proto *parent_cg_proto(struct proto *proto,
-					       struct cg_proto *cg_proto)
-{
-	return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg));
-}
 #define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled)
 #else
 #define mem_cgroup_sockets_enabled 0
-static inline struct cg_proto *parent_cg_proto(struct proto *proto,
-					       struct cg_proto *cg_proto)
-{
-	return NULL;
-}
 #endif
 
 static inline bool sk_stream_memory_free(const struct sock *sk)
@@ -1236,41 +1226,18 @@ sk_memory_allocated_sub(struct sock *sk, int amt)
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-		struct cg_proto *cg_proto = sk->sk_cgrp;
-
-		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			percpu_counter_dec(&cg_proto->sockets_allocated);
-	}
-
-	percpu_counter_dec(prot->sockets_allocated);
+	percpu_counter_dec(sk->sk_prot->sockets_allocated);
 }
 
 static inline void sk_sockets_allocated_inc(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-		struct cg_proto *cg_proto = sk->sk_cgrp;
-
-		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			percpu_counter_inc(&cg_proto->sockets_allocated);
-	}
-
-	percpu_counter_inc(prot->sockets_allocated);
+	percpu_counter_inc(sk->sk_prot->sockets_allocated);
 }
 
 static inline int
 sk_sockets_allocated_read_positive(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return percpu_counter_read_positive(&sk->sk_cgrp->sockets_allocated);
-
-	return percpu_counter_read_positive(prot->sockets_allocated);
+	return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
 }
 
 static inline int
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index d07579ada001..6759e0d6bba1 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -32,7 +32,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 		counter_parent = &parent_cg->memory_allocated;
 
 	page_counter_init(&cg_proto->memory_allocated, counter_parent);
-	percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
 
 	return 0;
 }
@@ -46,8 +45,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 	if (!cg_proto)
 		return;
 
-	percpu_counter_destroy(&cg_proto->sockets_allocated);
-
 	if (cg_proto->active)
 		static_key_slow_dec(&memcg_socket_limit_enabled);
 
-- 
cgit v1.2.3


From 80f23124f57c77915a7b4201d8dcba38a38b23f0 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:11 -0800
Subject: net: tcp_memcontrol: simplify the per-memcg limit access

tcp_memcontrol replicates the global sysctl_mem limit array per cgroup,
but it only ever sets these entries to the value of the memory_allocated
page_counter limit.  Use the latter directly.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 -
 include/net/sock.h         | 8 +++++---
 net/ipv4/tcp_memcontrol.c  | 8 --------
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 15acc04ebdd3..6c91c1b73951 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,7 +89,6 @@ struct cg_proto {
 	struct page_counter	memory_allocated;	/* Current allocated memory. */
 	int			memory_pressure;
 	bool			active;
-	long			sysctl_mem[3];
 	/*
 	 * memcg field is used to find which memcg we belong directly
 	 * Each memcg struct can hold more than one cg_proto, so container_of
diff --git a/include/net/sock.h b/include/net/sock.h
index 1f15937ec208..8b1f8e5d3a48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1162,10 +1162,12 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long *prot = sk->sk_prot->sysctl_mem;
+	long limit = sk->sk_prot->sysctl_mem[index];
+
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		prot = sk->sk_cgrp->sysctl_mem;
-	return prot[index];
+		limit = min_t(long, limit, sk->sk_cgrp->memory_allocated.limit);
+
+	return limit;
 }
 
 static inline void memcg_memory_allocated_add(struct cg_proto *prot,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 6759e0d6bba1..ef4268d12e43 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -21,9 +21,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	if (!cg_proto)
 		return 0;
 
-	cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
-	cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
-	cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
 	cg_proto->memory_pressure = 0;
 	cg_proto->memcg = memcg;
 
@@ -54,7 +51,6 @@ EXPORT_SYMBOL(tcp_destroy_cgroup);
 static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
 	struct cg_proto *cg_proto;
-	int i;
 	int ret;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
@@ -65,10 +61,6 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 	if (ret)
 		return ret;
 
-	for (i = 0; i < 3; i++)
-		cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
-						sysctl_tcp_mem[i]);
-
 	if (!cg_proto->active) {
 		/*
 		 * The active flag needs to be written after the static_key
-- 
cgit v1.2.3


From e805605c721021879a1469bdae45c6f80bc985f4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:14 -0800
Subject: net: tcp_memcontrol: sanitize tcp memory accounting callbacks

There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.

Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit.  This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds.  After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.

Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable.  However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either.  However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed).  When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.

As described above, consumption is now checked on the per-memcg level
and the global level separately.  Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 19 +++++++++-----
 include/net/sock.h         | 64 ++++++----------------------------------------
 include/net/tcp.h          |  5 ++--
 mm/memcontrol.c            | 32 +++++++++++++++++++++++
 net/core/sock.c            | 26 +++++++++++--------
 net/ipv4/tcp_output.c      |  7 +++--
 6 files changed, 77 insertions(+), 76 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c91c1b73951..e4e77bd1dd39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -660,12 +660,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 }
 #endif /* CONFIG_MEMCG */
 
-enum {
-	UNDER_LIMIT,
-	SOFT_LIMIT,
-	OVER_LIMIT,
-};
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
@@ -694,6 +688,19 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 struct sock;
 void sock_update_memcg(struct sock *sk);
 void sock_release_memcg(struct sock *sk);
+bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages);
+void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages);
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+static inline bool mem_cgroup_under_socket_pressure(struct cg_proto *proto)
+{
+	return proto->memory_pressure;
+}
+#else
+static inline bool mem_cgroup_under_pressure(struct cg_proto *proto)
+{
+	return false;
+}
+#endif
 
 #ifdef CONFIG_MEMCG_KMEM
 extern struct static_key memcg_kmem_enabled_key;
diff --git a/include/net/sock.h b/include/net/sock.h
index 8b1f8e5d3a48..94a6c1a740b9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1129,8 +1129,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	if (!sk->sk_prot->memory_pressure)
 		return false;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return !!sk->sk_cgrp->memory_pressure;
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
+	    mem_cgroup_under_socket_pressure(sk->sk_cgrp))
+		return true;
 
 	return !!*sk->sk_prot->memory_pressure;
 }
@@ -1144,9 +1145,6 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
 
 	if (*memory_pressure)
 		*memory_pressure = 0;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		sk->sk_cgrp->memory_pressure = 0;
 }
 
 static inline void sk_enter_memory_pressure(struct sock *sk)
@@ -1154,76 +1152,30 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 	if (!sk->sk_prot->enter_memory_pressure)
 		return;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		sk->sk_cgrp->memory_pressure = 1;
-
 	sk->sk_prot->enter_memory_pressure(sk);
 }
 
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long limit = sk->sk_prot->sysctl_mem[index];
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		limit = min_t(long, limit, sk->sk_cgrp->memory_allocated.limit);
-
-	return limit;
-}
-
-static inline void memcg_memory_allocated_add(struct cg_proto *prot,
-					      unsigned long amt,
-					      int *parent_status)
-{
-	struct page_counter *counter;
-
-	if (page_counter_try_charge(&prot->memory_allocated, amt, &counter))
-		return;
-
-	page_counter_charge(&prot->memory_allocated, amt);
-	*parent_status = OVER_LIMIT;
-}
-
-static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
-					      unsigned long amt)
-{
-	page_counter_uncharge(&prot->memory_allocated, amt);
+	return sk->sk_prot->sysctl_mem[index];
 }
 
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return page_counter_read(&sk->sk_cgrp->memory_allocated);
-
-	return atomic_long_read(prot->memory_allocated);
+	return atomic_long_read(sk->sk_prot->memory_allocated);
 }
 
 static inline long
-sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
+sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-		memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status);
-		/* update the root cgroup regardless */
-		atomic_long_add_return(amt, prot->memory_allocated);
-		return page_counter_read(&sk->sk_cgrp->memory_allocated);
-	}
-
-	return atomic_long_add_return(amt, prot->memory_allocated);
+	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	struct proto *prot = sk->sk_prot;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		memcg_memory_allocated_sub(sk->sk_cgrp, amt);
-
-	atomic_long_sub(amt, prot->memory_allocated);
+	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
 }
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a80255f4ca33..d9df80deba31 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -289,8 +289,9 @@ extern int tcp_memory_pressure;
 /* optimized version of sk_under_memory_pressure() for TCP sockets */
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return !!sk->sk_cgrp->memory_pressure;
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
+	    mem_cgroup_under_socket_pressure(sk->sk_cgrp))
+		return true;
 
 	return tcp_memory_pressure;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d9344dad207e..f5de783860b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -338,6 +338,38 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @proto: proto to charge
+ * @nr_pages: number of pages to charge
+ *
+ * Charges @nr_pages to @proto. Returns %true if the charge fit within
+ * @proto's configured limit, %false if the charge had to be forced.
+ */
+bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages)
+{
+	struct page_counter *counter;
+
+	if (page_counter_try_charge(&proto->memory_allocated,
+				    nr_pages, &counter)) {
+		proto->memory_pressure = 0;
+		return true;
+	}
+	page_counter_charge(&proto->memory_allocated, nr_pages);
+	proto->memory_pressure = 1;
+	return false;
+}
+
+/**
+ * mem_cgroup_uncharge_skmem - uncharge socket memory
+ * @proto - proto to uncharge
+ * @nr_pages - number of pages to uncharge
+ */
+void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages)
+{
+	page_counter_uncharge(&proto->memory_allocated, nr_pages);
+}
+
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c5dab01105b..89ae859d2dc5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2084,27 +2084,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
 	long allocated;
-	int parent_status = UNDER_LIMIT;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
 
-	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+	allocated = sk_memory_allocated_add(sk, amt);
+
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
+	    !mem_cgroup_charge_skmem(sk->sk_cgrp, amt))
+		goto suppress_allocation;
 
 	/* Under limit. */
-	if (parent_status == UNDER_LIMIT &&
-			allocated <= sk_prot_mem_limits(sk, 0)) {
+	if (allocated <= sk_prot_mem_limits(sk, 0)) {
 		sk_leave_memory_pressure(sk);
 		return 1;
 	}
 
-	/* Under pressure. (we or our parents) */
-	if ((parent_status > SOFT_LIMIT) ||
-			allocated > sk_prot_mem_limits(sk, 1))
+	/* Under pressure. */
+	if (allocated > sk_prot_mem_limits(sk, 1))
 		sk_enter_memory_pressure(sk);
 
-	/* Over hard limit (we or our parents) */
-	if ((parent_status == OVER_LIMIT) ||
-			(allocated > sk_prot_mem_limits(sk, 2)))
+	/* Over hard limit. */
+	if (allocated > sk_prot_mem_limits(sk, 2))
 		goto suppress_allocation;
 
 	/* guarantee minimum buffer size under pressure */
@@ -2153,6 +2153,9 @@ suppress_allocation:
 
 	sk_memory_allocated_sub(sk, amt);
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt);
+
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2168,6 +2171,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 	sk_memory_allocated_sub(sk, amount);
 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount);
+
 	if (sk_under_memory_pressure(sk) &&
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
 		sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 412a920fe0ec..493b48945f0c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2813,13 +2813,16 @@ begin_fwd:
  */
 void sk_forced_mem_schedule(struct sock *sk, int size)
 {
-	int amt, status;
+	int amt;
 
 	if (size <= sk->sk_forward_alloc)
 		return;
 	amt = sk_mem_pages(size);
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	sk_memory_allocated_add(sk, amt, &status);
+	sk_memory_allocated_add(sk, amt);
+
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		mem_cgroup_charge_skmem(sk->sk_cgrp, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
-- 
cgit v1.2.3


From baac50bbc3cdfd184ebf586b1704edbfcee866df Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:17 -0800
Subject: net: tcp_memcontrol: simplify linkage between socket and page counter

There won't be any separate counters for socket memory consumed by
protocols other than TCP in the future.  Remove the indirection and link
sockets directly to their owning memory cgroup.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h   | 20 ++++---------
 include/net/sock.h           | 25 +++--------------
 include/net/tcp.h            |  4 +--
 include/net/tcp_memcontrol.h |  1 -
 mm/memcontrol.c              | 57 +++++++++++++++----------------------
 net/core/sock.c              | 52 +++++-----------------------------
 net/ipv4/tcp_ipv4.c          |  7 +----
 net/ipv4/tcp_memcontrol.c    | 67 +++++++++++++++++---------------------------
 net/ipv4/tcp_output.c        |  4 +--
 net/ipv6/tcp_ipv6.c          |  3 --
 10 files changed, 69 insertions(+), 171 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e77bd1dd39..7c085e4636ba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,16 +89,6 @@ struct cg_proto {
 	struct page_counter	memory_allocated;	/* Current allocated memory. */
 	int			memory_pressure;
 	bool			active;
-	/*
-	 * memcg field is used to find which memcg we belong directly
-	 * Each memcg struct can hold more than one cg_proto, so container_of
-	 * won't really cut.
-	 *
-	 * The elegant solution would be having an inverse function to
-	 * proto_cgroup in struct proto, but that means polluting the structure
-	 * for everybody, instead of just for memcg users.
-	 */
-	struct mem_cgroup	*memcg;
 };
 
 #ifdef CONFIG_MEMCG
@@ -688,15 +678,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 struct sock;
 void sock_update_memcg(struct sock *sk);
 void sock_release_memcg(struct sock *sk);
-bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages);
-void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages);
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-static inline bool mem_cgroup_under_socket_pressure(struct cg_proto *proto)
+static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
-	return proto->memory_pressure;
+	return memcg->tcp_mem.memory_pressure;
 }
 #else
-static inline bool mem_cgroup_under_pressure(struct cg_proto *proto)
+static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	return false;
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 94a6c1a740b9..be96a8dcbc74 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -71,22 +71,6 @@
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
 
-struct cgroup;
-struct cgroup_subsys;
-#ifdef CONFIG_NET
-int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
-void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg);
-#else
-static inline
-int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
-	return 0;
-}
-static inline
-void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
-{
-}
-#endif
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -245,7 +229,6 @@ struct sock_common {
 	/* public: */
 };
 
-struct cg_proto;
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -310,7 +293,7 @@ struct cg_proto;
   *	@sk_security: used by security modules
   *	@sk_mark: generic packet mark
   *	@sk_cgrp_data: cgroup data for this cgroup
-  *	@sk_cgrp: this socket's cgroup-specific proto data
+  *	@sk_memcg: this socket's memory cgroup association
   *	@sk_write_pending: a write to stream socket waits to start
   *	@sk_state_change: callback to indicate change in the state of the sock
   *	@sk_data_ready: callback to indicate there is data to be processed
@@ -446,7 +429,7 @@ struct sock {
 	void			*sk_security;
 #endif
 	struct sock_cgroup_data	sk_cgrp_data;
-	struct cg_proto		*sk_cgrp;
+	struct mem_cgroup	*sk_memcg;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
 	void			(*sk_write_space)(struct sock *sk);
@@ -1129,8 +1112,8 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	if (!sk->sk_prot->memory_pressure)
 		return false;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
-	    mem_cgroup_under_socket_pressure(sk->sk_cgrp))
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
 		return true;
 
 	return !!*sk->sk_prot->memory_pressure;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d9df80deba31..8ea19977ea53 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -289,8 +289,8 @@ extern int tcp_memory_pressure;
 /* optimized version of sk_under_memory_pressure() for TCP sockets */
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
-	    mem_cgroup_under_socket_pressure(sk->sk_cgrp))
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
 		return true;
 
 	return tcp_memory_pressure;
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 05b94d9453de..3a17b16ae8aa 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -1,7 +1,6 @@
 #ifndef _TCP_MEMCG_H
 #define _TCP_MEMCG_H
 
-struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 #endif /* _TCP_MEMCG_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f5de783860b8..eaaa86126277 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -294,9 +294,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 void sock_update_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
-	struct cg_proto *cg_proto;
-
-	BUG_ON(!sk->sk_prot->proto_cgroup);
 
 	/* Socket cloning can throw us here with sk_cgrp already
 	 * filled. It won't however, necessarily happen from
@@ -306,68 +303,58 @@ void sock_update_memcg(struct sock *sk)
 	 * Respecting the original socket's memcg is a better
 	 * decision in this case.
 	 */
-	if (sk->sk_cgrp) {
-		BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-		css_get(&sk->sk_cgrp->memcg->css);
+	if (sk->sk_memcg) {
+		BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
+		css_get(&sk->sk_memcg->css);
 		return;
 	}
 
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
-	cg_proto = sk->sk_prot->proto_cgroup(memcg);
-	if (cg_proto && cg_proto->active &&
-	    css_tryget_online(&memcg->css)) {
-		sk->sk_cgrp = cg_proto;
-	}
+	if (memcg != root_mem_cgroup &&
+	    memcg->tcp_mem.active &&
+	    css_tryget_online(&memcg->css))
+		sk->sk_memcg = memcg;
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(sock_update_memcg);
 
 void sock_release_memcg(struct sock *sk)
 {
-	WARN_ON(!sk->sk_cgrp->memcg);
-	css_put(&sk->sk_cgrp->memcg->css);
-}
-
-struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
-{
-	if (!memcg || mem_cgroup_is_root(memcg))
-		return NULL;
-
-	return &memcg->tcp_mem;
+	WARN_ON(!sk->sk_memcg);
+	css_put(&sk->sk_memcg->css);
 }
-EXPORT_SYMBOL(tcp_proto_cgroup);
 
 /**
  * mem_cgroup_charge_skmem - charge socket memory
- * @proto: proto to charge
+ * @memcg: memcg to charge
  * @nr_pages: number of pages to charge
  *
- * Charges @nr_pages to @proto. Returns %true if the charge fit within
- * @proto's configured limit, %false if the charge had to be forced.
+ * Charges @nr_pages to @memcg. Returns %true if the charge fit within
+ * @memcg's configured limit, %false if the charge had to be forced.
  */
-bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages)
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct page_counter *counter;
 
-	if (page_counter_try_charge(&proto->memory_allocated,
+	if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
 				    nr_pages, &counter)) {
-		proto->memory_pressure = 0;
+		memcg->tcp_mem.memory_pressure = 0;
 		return true;
 	}
-	page_counter_charge(&proto->memory_allocated, nr_pages);
-	proto->memory_pressure = 1;
+	page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
+	memcg->tcp_mem.memory_pressure = 1;
 	return false;
 }
 
 /**
  * mem_cgroup_uncharge_skmem - uncharge socket memory
- * @proto - proto to uncharge
+ * @memcg - memcg to uncharge
  * @nr_pages - number of pages to uncharge
  */
-void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages)
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-	page_counter_uncharge(&proto->memory_allocated, nr_pages);
+	page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages);
 }
 
 #endif
@@ -3653,7 +3640,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	if (ret)
 		return ret;
 
-	return mem_cgroup_sockets_init(memcg, ss);
+	return tcp_init_cgroup(memcg, ss);
 }
 
 static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
@@ -3709,7 +3696,7 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 		static_key_slow_dec(&memcg_kmem_enabled_key);
 		WARN_ON(page_counter_read(&memcg->kmem));
 	}
-	mem_cgroup_sockets_destroy(memcg);
+	tcp_destroy_cgroup(memcg);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
diff --git a/net/core/sock.c b/net/core/sock.c
index 89ae859d2dc5..3535bffa45f3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap)
 }
 EXPORT_SYMBOL(sk_net_capable);
 
-
-#ifdef CONFIG_MEMCG_KMEM
-int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
-	struct proto *proto;
-	int ret = 0;
-
-	mutex_lock(&proto_list_mutex);
-	list_for_each_entry(proto, &proto_list, node) {
-		if (proto->init_cgroup) {
-			ret = proto->init_cgroup(memcg, ss);
-			if (ret)
-				goto out;
-		}
-	}
-
-	mutex_unlock(&proto_list_mutex);
-	return ret;
-out:
-	list_for_each_entry_continue_reverse(proto, &proto_list, node)
-		if (proto->destroy_cgroup)
-			proto->destroy_cgroup(memcg);
-	mutex_unlock(&proto_list_mutex);
-	return ret;
-}
-
-void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
-{
-	struct proto *proto;
-
-	mutex_lock(&proto_list_mutex);
-	list_for_each_entry_reverse(proto, &proto_list, node)
-		if (proto->destroy_cgroup)
-			proto->destroy_cgroup(memcg);
-	mutex_unlock(&proto_list_mutex);
-}
-#endif
-
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -1601,7 +1563,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		sk_set_socket(newsk, NULL);
 		newsk->sk_wq = NULL;
 
-		if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
 			sock_update_memcg(newsk);
 
 		if (newsk->sk_prot->sockets_allocated)
@@ -2089,8 +2051,8 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 
 	allocated = sk_memory_allocated_add(sk, amt);
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
-	    !mem_cgroup_charge_skmem(sk->sk_cgrp, amt))
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
 		goto suppress_allocation;
 
 	/* Under limit. */
@@ -2153,8 +2115,8 @@ suppress_allocation:
 
 	sk_memory_allocated_sub(sk, amt);
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt);
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
 
 	return 0;
 }
@@ -2171,8 +2133,8 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 	sk_memory_allocated_sub(sk, amount);
 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount);
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
 
 	if (sk_under_memory_pressure(sk) &&
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eb39e02899e5..c7d1fb50f381 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1819,7 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	sk_sockets_allocated_dec(sk);
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
 		sock_release_memcg(sk);
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -2343,11 +2343,6 @@ struct proto tcp_prot = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
-#endif
-#ifdef CONFIG_MEMCG_KMEM
-	.init_cgroup		= tcp_init_cgroup,
-	.destroy_cgroup		= tcp_destroy_cgroup,
-	.proto_cgroup		= tcp_proto_cgroup,
 #endif
 	.diag_destroy		= tcp_abort,
 };
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index ef4268d12e43..e5078259cbe3 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -8,60 +8,47 @@
 
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct page_counter *counter_parent = NULL;
 	/*
 	 * The root cgroup does not use page_counters, but rather,
 	 * rely on the data already collected by the network
 	 * subsystem
 	 */
-	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	struct page_counter *counter_parent = NULL;
-	struct cg_proto *cg_proto, *parent_cg;
-
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return 0;
 
-	cg_proto->memory_pressure = 0;
-	cg_proto->memcg = memcg;
+	memcg->tcp_mem.memory_pressure = 0;
 
-	parent_cg = tcp_prot.proto_cgroup(parent);
-	if (parent_cg)
-		counter_parent = &parent_cg->memory_allocated;
+	if (parent)
+		counter_parent = &parent->tcp_mem.memory_allocated;
 
-	page_counter_init(&cg_proto->memory_allocated, counter_parent);
+	page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent);
 
 	return 0;
 }
-EXPORT_SYMBOL(tcp_init_cgroup);
 
 void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 {
-	struct cg_proto *cg_proto;
-
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return;
 
-	if (cg_proto->active)
+	if (memcg->tcp_mem.active)
 		static_key_slow_dec(&memcg_socket_limit_enabled);
-
 }
-EXPORT_SYMBOL(tcp_destroy_cgroup);
 
 static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
-	struct cg_proto *cg_proto;
 	int ret;
 
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return -EINVAL;
 
-	ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
+	ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages);
 	if (ret)
 		return ret;
 
-	if (!cg_proto->active) {
+	if (!memcg->tcp_mem.active) {
 		/*
 		 * The active flag needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
@@ -79,7 +66,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 		 * patched in yet.
 		 */
 		static_key_slow_inc(&memcg_socket_limit_enabled);
-		cg_proto->active = true;
+		memcg->tcp_mem.active = true;
 	}
 
 	return 0;
@@ -123,32 +110,32 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
 static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
 	u64 val;
 
 	switch (cft->private) {
 	case RES_LIMIT:
-		if (!cg_proto)
-			return PAGE_COUNTER_MAX;
-		val = cg_proto->memory_allocated.limit;
+		if (memcg == root_mem_cgroup)
+			val = PAGE_COUNTER_MAX;
+		else
+			val = memcg->tcp_mem.memory_allocated.limit;
 		val *= PAGE_SIZE;
 		break;
 	case RES_USAGE:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			val = atomic_long_read(&tcp_memory_allocated);
 		else
-			val = page_counter_read(&cg_proto->memory_allocated);
+			val = page_counter_read(&memcg->tcp_mem.memory_allocated);
 		val *= PAGE_SIZE;
 		break;
 	case RES_FAILCNT:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			return 0;
-		val = cg_proto->memory_allocated.failcnt;
+		val = memcg->tcp_mem.memory_allocated.failcnt;
 		break;
 	case RES_MAX_USAGE:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			return 0;
-		val = cg_proto->memory_allocated.watermark;
+		val = memcg->tcp_mem.memory_allocated.watermark;
 		val *= PAGE_SIZE;
 		break;
 	default:
@@ -161,19 +148,17 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
 				char *buf, size_t nbytes, loff_t off)
 {
 	struct mem_cgroup *memcg;
-	struct cg_proto *cg_proto;
 
 	memcg = mem_cgroup_from_css(of_css(of));
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return nbytes;
 
 	switch (of_cft(of)->private) {
 	case RES_MAX_USAGE:
-		page_counter_reset_watermark(&cg_proto->memory_allocated);
+		page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated);
 		break;
 	case RES_FAILCNT:
-		cg_proto->memory_allocated.failcnt = 0;
+		memcg->tcp_mem.memory_allocated.failcnt = 0;
 		break;
 	}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 493b48945f0c..fda379cd600d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2821,8 +2821,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
 	sk_memory_allocated_add(sk, amt);
 
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		mem_cgroup_charge_skmem(sk->sk_cgrp, amt);
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index db9f1c318afc..4ad8edb46f7c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1888,9 +1888,6 @@ struct proto tcpv6_prot = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
-#endif
-#ifdef CONFIG_MEMCG_KMEM
-	.proto_cgroup		= tcp_proto_cgroup,
 #endif
 	.clear_sk		= tcp_v6_clear_sk,
 	.diag_destroy		= tcp_abort,
-- 
cgit v1.2.3


From 80e95fe0fdcde2812c341ad4209d62dc1a7af53b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Jan 2016 15:21:20 -0800
Subject: mm: memcontrol: generalize the socket accounting jump label

The unified hierarchy memory controller is going to use this jump label
as well to control the networking callbacks.  Move it to the memory
controller code and give it a more generic name.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 3 +++
 include/net/sock.h         | 7 -------
 mm/memcontrol.c            | 3 +++
 net/core/sock.c            | 5 -----
 net/ipv4/tcp_memcontrol.c  | 4 ++--
 5 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7c085e4636ba..03090e8e7fff 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -681,11 +681,14 @@ void sock_release_memcg(struct sock *sk);
 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+extern struct static_key memcg_sockets_enabled_key;
+#define mem_cgroup_sockets_enabled static_key_false(&memcg_sockets_enabled_key)
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	return memcg->tcp_mem.memory_pressure;
 }
 #else
+#define mem_cgroup_sockets_enabled 0
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	return false;
diff --git a/include/net/sock.h b/include/net/sock.h
index be96a8dcbc74..b9e7b3d863a0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1079,13 +1079,6 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET)
-extern struct static_key memcg_socket_limit_enabled;
-#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled)
-#else
-#define mem_cgroup_sockets_enabled 0
-#endif
-
 static inline bool sk_stream_memory_free(const struct sock *sk)
 {
 	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eaaa86126277..08ef3d2ca663 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -291,6 +291,9 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 /* Writing them here to avoid exposing memcg's inner layout */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
+struct static_key memcg_sockets_enabled_key;
+EXPORT_SYMBOL(memcg_sockets_enabled_key);
+
 void sock_update_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
diff --git a/net/core/sock.c b/net/core/sock.c
index 3535bffa45f3..6c1c8bc93412 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -202,11 +202,6 @@ EXPORT_SYMBOL(sk_net_capable);
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
-#if defined(CONFIG_MEMCG_KMEM)
-struct static_key memcg_socket_limit_enabled;
-EXPORT_SYMBOL(memcg_socket_limit_enabled);
-#endif
-
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index e5078259cbe3..9a22e2dfd64a 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -34,7 +34,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 		return;
 
 	if (memcg->tcp_mem.active)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
+		static_key_slow_dec(&memcg_sockets_enabled_key);
 }
 
 static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
@@ -65,7 +65,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 		 * because when this value change, the code to process it is not
 		 * patched in yet.
 		 */
-		static_key_slow_inc(&memcg_socket_limit_enabled);
+		static_key_slow_inc(&memcg_sockets_enabled_key);
 		memcg->tcp_mem.active = true;
 	}
 
-- 
cgit v1.2.3