From 7a1c8e5ab120a5f352e78bbc1fa5bb64e6f23639 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 20 Nov 2010 07:46:35 +0000
Subject: net: allow GFP_HIGHMEM in __vmalloc()

We forgot to use __GFP_HIGHMEM in several __vmalloc() calls.

In ceph, add the missing flag.

In fib_trie.c, xfrm_hash.c and request_sock.c, using vzalloc() is
cleaner and allows using HIGHMEM pages as well.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ceph/buffer.c       | 2 +-
 net/core/request_sock.c | 4 +---
 net/ipv4/fib_trie.c     | 2 +-
 net/xfrm/xfrm_hash.c    | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
index 53d8abfa25d5..bf3e6a13c215 100644
--- a/net/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -19,7 +19,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 	if (b->vec.iov_base) {
 		b->is_vmalloc = false;
 	} else {
-		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL);
 		if (!b->vec.iov_base) {
 			kfree(b);
 			return NULL;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 7552495aff7a..fceeb37d7161 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -45,9 +45,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
 	lopt_size += nr_table_entries * sizeof(struct request_sock *);
 	if (lopt_size > PAGE_SIZE)
-		lopt = __vmalloc(lopt_size,
-			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
-			PAGE_KERNEL);
+		lopt = vzalloc(lopt_size);
 	else
 		lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (lopt == NULL)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 200eb538fbb3..0f280348e0fd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size)
 	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
 	else
-		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+		return vzalloc(size);
 }
 
 static void __tnode_vfree(struct work_struct *arg)
diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c
index a2023ec52329..1e98bc0fe0a5 100644
--- a/net/xfrm/xfrm_hash.c
+++ b/net/xfrm/xfrm_hash.c
@@ -19,7 +19,7 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz)
 	if (sz <= PAGE_SIZE)
 		n = kzalloc(sz, GFP_KERNEL);
 	else if (hashdist)
-		n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+		n = vzalloc(sz);
 	else
 		n = (struct hlist_head *)
 			__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
-- 
cgit v1.2.3


From 88b2a9a3d98a19496d64aadda7158c0ad51cbe7d Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 15 Nov 2010 20:29:21 +0000
Subject: ipv6: fix missing in6_ifa_put in addrconf

Fix ref count bug introduced by

commit 2de795707294972f6c34bae9de713e502c431296
Author: Lorenzo Colitti <lorenzo@google.com>
Date:   Wed Oct 27 18:16:49 2010 +0000

ipv6: addrconf: don't remove address state on ifdown if the address
is being kept

Fix logic so that addrconf_ifdown() decrements the inet6_ifaddr
refcnt correctly with in6_ifa_put().

Reported-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2fc35b32df9e..23cc8e1ce8d4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2758,13 +2758,13 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 			ifa->state = INET6_IFADDR_STATE_DEAD;
 			spin_unlock_bh(&ifa->state_lock);
 
-			if (state == INET6_IFADDR_STATE_DEAD) {
-				in6_ifa_put(ifa);
-			} else {
+			if (state != INET6_IFADDR_STATE_DEAD) {
 				__ipv6_ifa_notify(RTM_DELADDR, ifa);
 				atomic_notifier_call_chain(&inet6addr_chain,
 							   NETDEV_DOWN, ifa);
 			}
+
+			in6_ifa_put(ifa);
 			write_lock_bh(&idev->lock);
 		}
 	}
-- 
cgit v1.2.3


From c89ad7372232b69fd37edf90d6f5d2a8d6381214 Mon Sep 17 00:00:00 2001
From: "Gustavo F. Padovan" <padovan@profusion.mobi>
Date: Mon, 1 Nov 2010 19:08:50 +0000
Subject: Bluetooth: Fix not returning proper error in SCO

Returning 0 in that situation could lead to errors in the caller.

Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/sco.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d0927d1fdada..66b9e5c0523a 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -882,7 +882,7 @@ static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
 	int lm = 0;
 
 	if (type != SCO_LINK && type != ESCO_LINK)
-		return 0;
+		return -EINVAL;
 
 	BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
 
@@ -908,7 +908,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
 	BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
 
 	if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
-		return 0;
+		return -EINVAL;
 
 	if (!status) {
 		struct sco_conn *conn;
@@ -927,7 +927,7 @@ static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
 	BT_DBG("hcon %p reason %d", hcon, reason);
 
 	if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
-		return 0;
+		return -EINVAL;
 
 	sco_conn_del(hcon, bt_err(reason));
 
-- 
cgit v1.2.3


From 9915672d41273f5b77f1b3c29b391ffb7732b84b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 24 Nov 2010 09:15:27 -0800
Subject: af_unix: limit unix_tot_inflight

Vegard Nossum found a unix socket OOM was possible, posting an exploit
program.

My analysis is that we can eat all LOWMEM memory before unix_gc() is
called from unix_release_sock(). Moreover, the thread blocked in
unix_gc() can consume a huge amount of time to perform cleanup because of
the huge working set.

One way to handle this is to have a sensible limit on unix_tot_inflight,
tested from wait_for_unix_gc() and to force a call to unix_gc() if this
limit is hit.

This solves the OOM and also reduces overall latencies, and should not
slow down normal workloads.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/garbage.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c8df6fda0b1f..40df93d1cf35 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -259,9 +259,16 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 }
 
 static bool gc_in_progress = false;
+#define UNIX_INFLIGHT_TRIGGER_GC 16000
 
 void wait_for_unix_gc(void)
 {
+	/*
+	 * If number of inflight sockets is insane,
+	 * force a garbage collect right now.
+	 */
+	if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
+		unix_gc();
 	wait_event(unix_gc_wait, gc_in_progress == false);
 }
 
-- 
cgit v1.2.3


From c39508d6f118308355468314ff414644115a07f3 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 24 Nov 2010 11:47:22 -0800
Subject: tcp: Make TCP_MAXSEG minimum more correct.

Use TCP_MIN_MSS instead of constant 64.

Reported-by: Min Zhang <mzhang@mvista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 081419969485..f15c36a706ec 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		/* Values greater than interface MTU won't take effect. However
 		 * at the point when this call is done we typically don't yet
 		 * know which interface is going to be used */
-		if (val < 64 || val > MAX_TCP_WINDOW) {
+		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
 			err = -EINVAL;
 			break;
 		}
-- 
cgit v1.2.3


From fa0e846494792e722d817b9d3d625a4ef4896c96 Mon Sep 17 00:00:00 2001
From: Phil Blundell <philb@gnu.org>
Date: Wed, 24 Nov 2010 11:49:19 -0800
Subject: econet: disallow NULL remote addr for sendmsg(), fixes CVE-2010-3849

Later parts of econet_sendmsg() rely on saddr != NULL, so return early
with EINVAL if NULL was passed; otherwise an oops may occur.

Signed-off-by: Phil Blundell <philb@gnu.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/econet/af_econet.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index f8c1ae4b41f0..e366f1bef91f 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -297,23 +297,14 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	mutex_lock(&econet_mutex);
 
-	if (saddr == NULL) {
-		struct econet_sock *eo = ec_sk(sk);
-
-		addr.station = eo->station;
-		addr.net     = eo->net;
-		port	     = eo->port;
-		cb	     = eo->cb;
-	} else {
-		if (msg->msg_namelen < sizeof(struct sockaddr_ec)) {
-			mutex_unlock(&econet_mutex);
-			return -EINVAL;
-		}
-		addr.station = saddr->addr.station;
-		addr.net = saddr->addr.net;
-		port = saddr->port;
-		cb = saddr->cb;
-	}
+        if (saddr == NULL || msg->msg_namelen < sizeof(struct sockaddr_ec)) {
+                mutex_unlock(&econet_mutex);
+                return -EINVAL;
+        }
+        addr.station = saddr->addr.station;
+        addr.net = saddr->addr.net;
+        port = saddr->port;
+        cb = saddr->cb;
 
 	/* Look for a device with the right network number. */
 	dev = net2dev_map[addr.net];
@@ -351,7 +342,6 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 		eb = (struct ec_cb *)&skb->cb;
 
-		/* BUG: saddr may be NULL */
 		eb->cookie = saddr->cookie;
 		eb->sec = *saddr;
 		eb->sent = ec_tx_done;
-- 
cgit v1.2.3


From 16c41745c7b92a243d0874f534c1655196c64b74 Mon Sep 17 00:00:00 2001
From: Phil Blundell <philb@gnu.org>
Date: Wed, 24 Nov 2010 11:49:53 -0800
Subject: econet: fix CVE-2010-3850

Add missing check for capable(CAP_NET_ADMIN) in SIOCSIFADDR operation.

Signed-off-by: Phil Blundell <philb@gnu.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/econet/af_econet.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index e366f1bef91f..d41ba8e56c10 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -661,6 +661,9 @@ static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
 	err = 0;
 	switch (cmd) {
 	case SIOCSIFADDR:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
 		edev = dev->ec_ptr;
 		if (edev == NULL) {
 			/* Magic up a new one. */
-- 
cgit v1.2.3


From a27e13d370415add3487949c60810e36069a23a6 Mon Sep 17 00:00:00 2001
From: Phil Blundell <philb@gnu.org>
Date: Wed, 24 Nov 2010 11:51:47 -0800
Subject: econet: fix CVE-2010-3848

Don't declare variable sized array of iovecs on the stack since this
could cause stack overflow if msg->msg_iovlen is large.  Instead, coalesce
the user-supplied data into a new buffer and use a single iovec for it.

Signed-off-by: Phil Blundell <philb@gnu.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/econet/af_econet.c | 62 +++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index d41ba8e56c10..13992e1d2726 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
 #include <linux/skbuff.h>
 #include <linux/udp.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <linux/stat.h>
@@ -276,12 +277,12 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 #endif
 #ifdef CONFIG_ECONET_AUNUDP
 	struct msghdr udpmsg;
-	struct iovec iov[msg->msg_iovlen+1];
+	struct iovec iov[2];
 	struct aunhdr ah;
 	struct sockaddr_in udpdest;
 	__kernel_size_t size;
-	int i;
 	mm_segment_t oldfs;
+	char *userbuf;
 #endif
 
 	/*
@@ -319,17 +320,17 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		}
 	}
 
-	if (len + 15 > dev->mtu) {
-		mutex_unlock(&econet_mutex);
-		return -EMSGSIZE;
-	}
-
 	if (dev->type == ARPHRD_ECONET) {
 		/* Real hardware Econet.  We're not worthy etc. */
 #ifdef CONFIG_ECONET_NATIVE
 		unsigned short proto = 0;
 		int res;
 
+		if (len + 15 > dev->mtu) {
+			mutex_unlock(&econet_mutex);
+			return -EMSGSIZE;
+		}
+
 		dev_hold(dev);
 
 		skb = sock_alloc_send_skb(sk, len+LL_ALLOCATED_SPACE(dev),
@@ -405,6 +406,11 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		return -ENETDOWN;		/* No socket - can't send */
 	}
 
+	if (len > 32768) {
+		err = -E2BIG;
+		goto error;
+	}
+
 	/* Make up a UDP datagram and hand it off to some higher intellect. */
 
 	memset(&udpdest, 0, sizeof(udpdest));
@@ -436,36 +442,26 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	/* tack our header on the front of the iovec */
 	size = sizeof(struct aunhdr);
-	/*
-	 * XXX: that is b0rken.  We can't mix userland and kernel pointers
-	 * in iovec, since on a lot of platforms copy_from_user() will
-	 * *not* work with the kernel and userland ones at the same time,
-	 * regardless of what we do with set_fs().  And we are talking about
-	 * econet-over-ethernet here, so "it's only ARM anyway" doesn't
-	 * apply.  Any suggestions on fixing that code?		-- AV
-	 */
 	iov[0].iov_base = (void *)&ah;
 	iov[0].iov_len = size;
-	for (i = 0; i < msg->msg_iovlen; i++) {
-		void __user *base = msg->msg_iov[i].iov_base;
-		size_t iov_len = msg->msg_iov[i].iov_len;
-		/* Check it now since we switch to KERNEL_DS later. */
-		if (!access_ok(VERIFY_READ, base, iov_len)) {
-			mutex_unlock(&econet_mutex);
-			return -EFAULT;
-		}
-		iov[i+1].iov_base = base;
-		iov[i+1].iov_len = iov_len;
-		size += iov_len;
+
+	userbuf = vmalloc(len);
+	if (userbuf == NULL) {
+		err = -ENOMEM;
+		goto error;
 	}
 
+	iov[1].iov_base = userbuf;
+	iov[1].iov_len = len;
+	err = memcpy_fromiovec(userbuf, msg->msg_iov, len);
+	if (err)
+		goto error_free_buf;
+
 	/* Get a skbuff (no data, just holds our cb information) */
 	if ((skb = sock_alloc_send_skb(sk, 0,
 				       msg->msg_flags & MSG_DONTWAIT,
-				       &err)) == NULL) {
-		mutex_unlock(&econet_mutex);
-		return err;
-	}
+				       &err)) == NULL)
+		goto error_free_buf;
 
 	eb = (struct ec_cb *)&skb->cb;
 
@@ -481,7 +477,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 	udpmsg.msg_name = (void *)&udpdest;
 	udpmsg.msg_namelen = sizeof(udpdest);
 	udpmsg.msg_iov = &iov[0];
-	udpmsg.msg_iovlen = msg->msg_iovlen + 1;
+	udpmsg.msg_iovlen = 2;
 	udpmsg.msg_control = NULL;
 	udpmsg.msg_controllen = 0;
 	udpmsg.msg_flags=0;
@@ -489,9 +485,13 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 	oldfs = get_fs(); set_fs(KERNEL_DS);	/* More privs :-) */
 	err = sock_sendmsg(udpsock, &udpmsg, size);
 	set_fs(oldfs);
+
+error_free_buf:
+	vfree(userbuf);
 #else
 	err = -EPROTOTYPE;
 #endif
+	error:
 	mutex_unlock(&econet_mutex);
 
 	return err;
-- 
cgit v1.2.3


From 4cb6a614ba0e58cae8abdadbf73bcb4d37a3f599 Mon Sep 17 00:00:00 2001
From: Tracey Dent <tdent48227@gmail.com>
Date: Sun, 21 Nov 2010 15:23:50 +0000
Subject: Net: ceph: Makefile: Remove unnecessary code

Remove the if and else conditional because the code is in mainline and there
is no need for it to be there.

Signed-off-by: Tracey Dent <tdent48227@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ceph/Makefile | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'net')

diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index aab1cabb8035..5f19415ec9c0 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -1,9 +1,6 @@
 #
 # Makefile for CEPH filesystem.
 #
-
-ifneq ($(KERNELRELEASE),)
-
 obj-$(CONFIG_CEPH_LIB) += libceph.o
 
 libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
@@ -16,22 +13,3 @@ libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o \
 	pagevec.o
 
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-
-default: all
-
-all:
-	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
-
-clean:
-	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-
-endif
-- 
cgit v1.2.3


From 8475ef9fd16cadbfc692f78e608d1941a340beb2 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Mon, 22 Nov 2010 03:26:12 +0000
Subject: netns: Don't leak others' openreq-s in proc

The /proc/net/tcp leaks openreq sockets from other namespaces.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 69ccbc1dde9c..e13da6de1fc7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2043,7 +2043,9 @@ get_req:
 	}
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (sk->sk_family == st->family) {
 			cur = sk;
 			goto out;
 		}
-- 
cgit v1.2.3


From 0147fc058d11bd4009b126d09974d2c8f48fef15 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 22 Nov 2010 12:54:21 +0000
Subject: tcp: restrict net.ipv4.tcp_adv_win_scale (#20312)

tcp_win_from_space() does the following:

      if (sysctl_tcp_adv_win_scale <= 0)
              return space >> (-sysctl_tcp_adv_win_scale);
      else
              return space - (space >> sysctl_tcp_adv_win_scale);

"space" is int.

As per C99 6.5.7 (3), shifting an int by 32 or more bits is
undefined behaviour.

Indeed, if sysctl_tcp_adv_win_scale is exactly 32,
space >> 32 equals space and the function returns 0.

Which means we busyloop in tcp_fixup_rcvbuf().

Restrict net.ipv4.tcp_adv_win_scale to [-31, 31].

Fix https://bugzilla.kernel.org/show_bug.cgi?id=20312

Steps to reproduce:

      echo 32 >/proc/sys/net/ipv4/tcp_adv_win_scale
      wget www.kernel.org
      [softlockup]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 1 +
 net/ipv4/sysctl_net_ipv4.c             | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fe95105992c5..3c5e465296e1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -144,6 +144,7 @@ tcp_adv_win_scale - INTEGER
 	Count buffering overhead as bytes/2^tcp_adv_win_scale
 	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
 	if it is <= 0.
+	Possible values are [-31, 31], inclusive.
 	Default: 2
 
 tcp_allowed_congestion_control - STRING
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e91911d7aae2..1b4ec21497a4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,6 +26,8 @@ static int zero;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
 
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
@@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_tcp_adv_win_scale,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_adv_win_scale_min,
+		.extra2		= &tcp_adv_win_scale_max,
 	},
 	{
 		.procname	= "tcp_tw_reuse",
-- 
cgit v1.2.3


From 0ac78870220b6e0ac74dd9292bcfa7b18718babd Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 23 Nov 2010 02:36:56 +0000
Subject: dccp: fix error in updating the GAR

This fixes a bug in updating the Greatest Acknowledgment number Received (GAR):
the current implementation does not track the greatest received value -
lower values in the range AWL..AWH (RFC 4340, 7.5.1) erase higher ones.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 265985370fa1..e424a09e83f6 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -239,7 +239,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
 		dccp_update_gsr(sk, seqno);
 
 		if (dh->dccph_type != DCCP_PKT_SYNC &&
-		    (ackno != DCCP_PKT_WITHOUT_ACK_SEQ))
+		    ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+		    after48(ackno, dp->dccps_gar))
 			dp->dccps_gar = ackno;
 	} else {
 		unsigned long now = jiffies;
-- 
cgit v1.2.3


From 3c6f27bf33052ea6ba9d82369fb460726fb779c0 Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Tue, 23 Nov 2010 11:02:13 +0000
Subject: DECnet: don't leak uninitialized stack byte

A single uninitialized padding byte is leaked to userspace.

Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
CC: stable <stable@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/af_decnet.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index a76b78de679f..6f97268ed85f 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1556,6 +1556,8 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us
 			if (r_len > sizeof(struct linkinfo_dn))
 				r_len = sizeof(struct linkinfo_dn);
 
+			memset(&link, 0, sizeof(link));
+
 			switch(sock->state) {
 				case SS_CONNECTING:
 					link.idn_linkstate = LL_CONNECTING;
-- 
cgit v1.2.3


From b4ff3c90e6066bacc8a92111752fe9e4f4c45cca Mon Sep 17 00:00:00 2001
From: Nagendra Tomar <tomer_iisc@yahoo.com>
Date: Fri, 26 Nov 2010 14:26:27 +0000
Subject: inet: Fix __inet_inherit_port() to correctly increment bsockets and
 num_owners

inet sockets corresponding to passive connections are added to the bind hash
using __inet_inherit_port(). These sockets are later removed from the bind
hash using __inet_put_port(). These two functions are not exactly symmetrical.
__inet_put_port() decrements hashinfo->bsockets and tb->num_owners, whereas
__inet_inherit_port() does not increment them. This results in both of these
going to negative values.

This patch fixes this by calling inet_bind_hash() from __inet_inherit_port(),
which does the right thing.

'bsockets' and 'num_owners' were introduced by commit a9d8f9110d7e953c
(inet: Allowing more than 64k connections and heavily optimize bind(0))

Signed-off-by: Nagendra Singh Tomar <tomer_iisc@yahoo.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 1b344f30b463..3c0369a3a663 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 			}
 		}
 	}
-	sk_add_bind_node(child, &tb->owners);
-	inet_csk(child)->icsk_bind_hash = tb;
+	inet_bind_hash(child, tb, port);
 	spin_unlock(&head->lock);
 
 	return 0;
-- 
cgit v1.2.3


From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 25 Nov 2010 04:11:39 +0000
Subject: af_unix: limit recursion level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's easy to eat all kernel memory and trigger the NMI watchdog, using an
exploit program that queues unix sockets on top of others.

lkml ref : http://lkml.org/lkml/2010/11/25/8

This mechanism is used in applications, one choice we have is to have a
recursion limit.

Other limits might be needed as well (if we queue other types of files),
since the passfd mechanism is currently limited by socket receive queue
sizes only.

Add a recursion_level to unix socket, allowing up to 4 levels.

Each time we send a unix socket through the sendfd mechanism, we copy its
recursion level (plus one) to the receiver. This recursion level is cleared
when the socket receive queue is emptied.

Reported-by: Марк Коренберг <socketpair@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  2 ++
 net/unix/af_unix.c    | 37 ++++++++++++++++++++++++++++++++-----
 net/unix/garbage.c    |  2 +-
 3 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 90c9e2872f27..18e5c3f67580 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -10,6 +10,7 @@ extern void unix_inflight(struct file *fp);
 extern void unix_notinflight(struct file *fp);
 extern void unix_gc(void);
 extern void wait_for_unix_gc(void);
+extern struct sock *unix_get_socket(struct file *filp);
 
 #define UNIX_HASH_SIZE	256
 
@@ -56,6 +57,7 @@ struct unix_sock {
 	spinlock_t		lock;
 	unsigned int		gc_candidate : 1;
 	unsigned int		gc_maybe_cycle : 1;
+	unsigned char		recursion_level;
 	struct socket_wq	peer_wq;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3c95304a0817..2268e6798124 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+#define MAX_RECURSION_LEVEL 4
+
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
+	unsigned char max_level = 0;
+	int unix_sock_count = 0;
+
+	for (i = scm->fp->count - 1; i >= 0; i--) {
+		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
+
+		if (sk) {
+			unix_sock_count++;
+			max_level = max(max_level,
+					unix_sk(sk)->recursion_level);
+		}
+	}
+	if (unlikely(max_level > MAX_RECURSION_LEVEL))
+		return -ETOOMANYREFS;
 
 	/*
 	 * Need to duplicate file references for the sake of garbage
@@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	if (!UNIXCB(skb).fp)
 		return -ENOMEM;
 
-	for (i = scm->fp->count-1; i >= 0; i--)
-		unix_inflight(scm->fp->fp[i]);
-	return 0;
+	if (unix_sock_count) {
+		for (i = scm->fp->count - 1; i >= 0; i--)
+			unix_inflight(scm->fp->fp[i]);
+	}
+	return max_level;
 }
 
 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
+	int max_level;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 
 	err = unix_scm_to_skb(siocb->scm, skb, true);
-	if (err)
+	if (err < 0)
 		goto out_free;
+	max_level = err + 1;
 	unix_get_secdata(siocb->scm, skb);
 
 	skb_reset_transport_header(skb);
@@ -1514,6 +1534,8 @@ restart:
 	if (sock_flag(other, SOCK_RCVTSTAMP))
 		__net_timestamp(skb);
 	skb_queue_tail(&other->sk_receive_queue, skb);
+	if (max_level > unix_sk(other)->recursion_level)
+		unix_sk(other)->recursion_level = max_level;
 	unix_state_unlock(other);
 	other->sk_data_ready(other, len);
 	sock_put(other);
@@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
+	int max_level;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 		/* Only send the fds in the first buffer */
 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
-		if (err) {
+		if (err < 0) {
 			kfree_skb(skb);
 			goto out_err;
 		}
+		max_level = err + 1;
 		fds_sent = true;
 
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
@@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto pipe_err_free;
 
 		skb_queue_tail(&other->sk_receive_queue, skb);
+		if (max_level > unix_sk(other)->recursion_level)
+			unix_sk(other)->recursion_level = max_level;
 		unix_state_unlock(other);
 		other->sk_data_ready(other, size);
 		sent += size;
@@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		unix_state_lock(sk);
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (skb == NULL) {
+			unix_sk(sk)->recursion_level = 0;
 			if (copied >= target)
 				goto unlock;
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 40df93d1cf35..f89f83bf828e 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -96,7 +96,7 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 unsigned int unix_tot_inflight;
 
 
-static struct sock *unix_get_socket(struct file *filp)
+struct sock *unix_get_socket(struct file *filp)
 {
 	struct sock *u_sock = NULL;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-- 
cgit v1.2.3


From 7dff3125534c1d035a910052335a3a39fbb31aa7 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni.malinen@atheros.com>
Date: Fri, 26 Nov 2010 20:41:55 +0200
Subject: mac80211: Fix frame injection using non-AP vif

In order for frame injection to work properly for some use cases
(e.g., finding the station entry and keys for encryption), mac80211
needs to find the correct sdata entry. This works when the main vif
is in AP mode, but commit a2c1e3dad516618cb0fbfb1a62c36d0b0744573a
broke this particular use case for station main vif. While this type of
injection is quite an unusual operation, it has some uses and we should fix
it. Do this by changing the monitor vif sdata selection to allow station
vif to be selected instead of limiting it to just AP vifs. We still need
to skip some iftypes to avoid selecting unsuitable vif for injection.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 96c594309506..df6aac523532 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1587,7 +1587,12 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 						list) {
 				if (!ieee80211_sdata_running(tmp_sdata))
 					continue;
-				if (tmp_sdata->vif.type != NL80211_IFTYPE_AP)
+				if (tmp_sdata->vif.type ==
+				    NL80211_IFTYPE_MONITOR ||
+				    tmp_sdata->vif.type ==
+				    NL80211_IFTYPE_AP_VLAN ||
+					tmp_sdata->vif.type ==
+				    NL80211_IFTYPE_WDS)
 					continue;
 				if (compare_ether_addr(tmp_sdata->vif.addr,
 						       hdr->addr2) == 0) {
-- 
cgit v1.2.3


From 2c31333a8fde7e26936a9f5371d02ff12c490993 Mon Sep 17 00:00:00 2001
From: Christian Lamparter <chunkeey@googlemail.com>
Date: Mon, 29 Nov 2010 20:53:23 +0100
Subject: mac80211: ignore non-bcast mcast deauth/disassoc frames

This patch fixes a curious issue due to insufficient
rx frame filtering.

Saqeb Akhter reported frequent disconnects while streaming
videos over samba: <http://marc.info/?m=128600031109136>
> [ 1166.512087] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7)
> [ 1526.059997] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7)
> [ 2125.324356] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7)
> [...]

The reason is that the device generates frames with slightly
bogus SA/TA addresses.

e.g.:
 [ 2314.402316] Ignore 9f:1f:31:f8:64:ff
 [ 2314.402321] Ignore 9f:1f:31:f8:64:ff
 [ 2352.453804] Ignore 0d:1f:31:f8:64:ff
 [ 2352.453808] Ignore 0d:1f:31:f8:64:ff
 					   ^^ the group-address flag is set!
 (the correct SA/TA would be: 00:1f:31:f8:64:ff)

Since the AP does not know from where the frames come, it
generates a DEAUTH response for the (invalid) mcast address.
This mcast deauth frame then passes through all filters and
tricks the stack into thinking that the AP brutally kicked
us!

This patch fixes the problem by simply ignoring
non-broadcast, group-addressed deauth/disassoc frames.

Cc: Jouni Malinen <j@w1.fi>
Cc: Johannes Berg <johannes@sipsolutions.net>
Reported-by: Saqeb Akhter <saqeb.akhter@gmail.com>
Signed-off-by: Christian Lamparter <chunkeey@googlemail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 902b03ee8f60..3c87293cb078 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2247,6 +2247,10 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 		break;
 	case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
 	case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
+		if (is_multicast_ether_addr(mgmt->da) &&
+		    !is_broadcast_ether_addr(mgmt->da))
+			return RX_DROP_MONITOR;
+
 		/* process only for station */
 		if (sdata->vif.type != NL80211_IFTYPE_STATION)
 			return RX_DROP_MONITOR;
-- 
cgit v1.2.3


From 8e26d5ad2f9c038609d42eebc676cd1107709eef Mon Sep 17 00:00:00 2001
From: Senthil Balasubramanian <senthilkumar@atheros.com>
Date: Tue, 30 Nov 2010 20:15:38 +0530
Subject: mac80211: Fix STA disconnect due to MIC failure

The commit titled "mac80211: clean up rx handling wrt. found_sta"
removed the found_sta variable, which caused a MIC failure event
to be reported twice to the supplicant for a single failure, resulting
in STA disconnect.

This should fix WPA specific countermeasures WiFi test case (5.2.17)
issues with mac80211 based drivers which report MIC failure events in
rx status.

Cc: Stable <stable@kernel.org> (2.6.37)
Signed-off-by: Senthil Balasubramanian <senthilkumar@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3c87293cb078..54fb4a0e76f0 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2745,6 +2745,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 
 			if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
 				return;
+			goto out;
 		}
 	}
 
@@ -2784,6 +2785,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 			return;
 	}
 
+ out:
 	dev_kfree_skb(skb);
 }
 
-- 
cgit v1.2.3


From 381601e5bbae78d7c18d946fe874a63957edea13 Mon Sep 17 00:00:00 2001
From: Anders Franzen <Anders.Franzen@ericsson.com>
Date: Wed, 24 Nov 2010 05:47:18 +0000
Subject: Make the ip6_tunnel reflect the true mtu.

The ip6_tunnel always assumes it consumes 40 bytes (ip6 hdr) of the mtu of the
underlying device. So for a normal ethernet bearer, the mtu of the ip6_tunnel is
1460.
However, when creating a tunnel the encap limit option is enabled by default, and it
consumes 8 bytes more, so the true mtu shall be 1452.

I don't really know if this breaks some statement in some RFC, so this is a request for
comments.

Signed-off-by: Anders Franzen <anders.franzen@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2a59610c2a58..70e891a20fb9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1175,6 +1175,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 				sizeof (struct ipv6hdr);
 
 			dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr);
+			if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+				dev->mtu-=8;
 
 			if (dev->mtu < IPV6_MIN_MTU)
 				dev->mtu = IPV6_MIN_MTU;
@@ -1363,12 +1365,17 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 
 static void ip6_tnl_dev_setup(struct net_device *dev)
 {
+	struct ip6_tnl *t;
+
 	dev->netdev_ops = &ip6_tnl_netdev_ops;
 	dev->destructor = ip6_dev_free;
 
 	dev->type = ARPHRD_TUNNEL6;
 	dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
 	dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
+	t = netdev_priv(dev);
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		dev->mtu-=8;
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	dev->features |= NETIF_F_NETNS_LOCAL;
-- 
cgit v1.2.3


From 6dcdd1b3694a4fa2b85167a9c860c7613a7553c7 Mon Sep 17 00:00:00 2001
From: David McCullough <david_mccullough@mcafee.com>
Date: Mon, 29 Nov 2010 19:32:34 +0000
Subject: net/ipv6/sit.c: return unhandled skb to tunnel4_rcv

I found a problem using an IPv6 over IPv4 tunnel.  When CONFIG_IPV6_SIT
was enabled, the packets would be rejected as net/ipv6/sit.c was catching
all IPPROTO_IPV6 packets and returning an ICMP port unreachable error.

I think this patch fixes the problem cleanly.  I believe the code in
net/ipv4/tunnel4.c:tunnel4_rcv takes care of it properly if none of the
handlers claim the skb.

Signed-off-by: David McCullough <david_mccullough@mcafee.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d6bfaec3bbbf..8c4d00c7cd2b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -606,8 +606,9 @@ static int ipip6_rcv(struct sk_buff *skb)
 		return 0;
 	}
 
-	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	/* no tunnel matched,  let upstream know, ipsec may handle it */
 	rcu_read_unlock();
+	return 1;
 out:
 	kfree_skb(skb);
 	return 0;
-- 
cgit v1.2.3


From 46bcf14f44d8f31ecfdc8b6708ec15a3b33316d9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 6 Dec 2010 09:29:43 -0800
Subject: filter: fix sk_filter rcu handling

Pavel Emelyanov tried to fix a race between sk_filter_(de|at)tach and
sk_clone() in commit 47e958eac280c263397

Problem is we can have several clones sharing a common sk_filter, and
these clones might want to sk_filter_attach() their own filters at the
same time, and can overwrite old_filter->rcu, corrupting RCU queues.

We can not use filter->rcu without being sure no other thread could do
the same thing.

Switch code to a more conventional ref-counting technique : Do the
atomic decrement immediately and queue one rcu call back when last
reference is released.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  4 +++-
 net/core/filter.c  | 19 ++++++-------------
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index a6338d039857..659d968d95c5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1155,6 +1155,8 @@ extern void sk_common_release(struct sock *sk);
 /* Initialise core socket variables */
 extern void sock_init_data(struct socket *sock, struct sock *sk);
 
+extern void sk_filter_release_rcu(struct rcu_head *rcu);
+
 /**
  *	sk_filter_release - release a socket filter
  *	@fp: filter to remove
@@ -1165,7 +1167,7 @@ extern void sock_init_data(struct socket *sock, struct sock *sk);
 static inline void sk_filter_release(struct sk_filter *fp)
 {
 	if (atomic_dec_and_test(&fp->refcnt))
-		kfree(fp);
+		call_rcu_bh(&fp->rcu, sk_filter_release_rcu);
 }
 
 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index c1ee800bc080..ae21a0d3c4a2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -589,23 +589,16 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 EXPORT_SYMBOL(sk_chk_filter);
 
 /**
- * 	sk_filter_rcu_release - Release a socket filter by rcu_head
+ * 	sk_filter_release_rcu - Release a socket filter by rcu_head
  *	@rcu: rcu_head that contains the sk_filter to free
  */
-static void sk_filter_rcu_release(struct rcu_head *rcu)
+void sk_filter_release_rcu(struct rcu_head *rcu)
 {
 	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
 
-	sk_filter_release(fp);
-}
-
-static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
-{
-	unsigned int size = sk_filter_len(fp);
-
-	atomic_sub(size, &sk->sk_omem_alloc);
-	call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
+	kfree(fp);
 }
+EXPORT_SYMBOL(sk_filter_release_rcu);
 
 /**
  *	sk_attach_filter - attach a socket filter
@@ -649,7 +642,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	rcu_assign_pointer(sk->sk_filter, fp);
 
 	if (old_fp)
-		sk_filter_delayed_uncharge(sk, old_fp);
+		sk_filter_uncharge(sk, old_fp);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -663,7 +656,7 @@ int sk_detach_filter(struct sock *sk)
 					   sock_owned_by_user(sk));
 	if (filter) {
 		rcu_assign_pointer(sk->sk_filter, NULL);
-		sk_filter_delayed_uncharge(sk, filter);
+		sk_filter_uncharge(sk, filter);
 		ret = 0;
 	}
 	return ret;
-- 
cgit v1.2.3


From b1afde60f2b9ee8444fba4e012dc99a3b28d224d Mon Sep 17 00:00:00 2001
From: Nandita Dukkipati <nanditad@google.com>
Date: Fri, 3 Dec 2010 13:33:44 +0000
Subject: tcp: Bug fix in initialization of receive window.

The bug has to do with boundary checks on the initial receive window.
If the initial receive window falls between init_cwnd and the
receive window specified by the user, the initial window is incorrectly
brought down to init_cwnd. The correct behavior is to allow it to
remain unchanged.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 05b1ecf36763..3c59ab42df2b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -231,11 +231,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		/* when initializing use the value from init_rcv_wnd
 		 * rather than the default from above
 		 */
-		if (init_rcv_wnd &&
-		    (*rcv_wnd > init_rcv_wnd * mss))
-			*rcv_wnd = init_rcv_wnd * mss;
-		else if (*rcv_wnd > init_cwnd * mss)
-			*rcv_wnd = init_cwnd * mss;
+		if (init_rcv_wnd)
+			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+		else
+			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
 	}
 
 	/* Set the clamp no higher than max representable value */
-- 
cgit v1.2.3


From 35d9b0c906ad92d32a0b8db5daa6fabfcc2f068d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 5 Dec 2010 02:03:26 +0000
Subject: llc: fix a device refcount imbalance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le dimanche 05 décembre 2010 à 12:23 +0100, Eric Dumazet a écrit :
> Le dimanche 05 décembre 2010 à 09:19 +0100, Eric Dumazet a écrit :
>
> > Hmm..
> >
> > If somebody can explain why RTNL is held in arp_ioctl() (and therefore
> > in arp_req_delete()), we might first remove RTNL use in arp_ioctl() so
> > that your patch can be applied.
> >
> > Right now it is not good, because RTNL wont be necessarly held when you
> > are going to call arp_invalidate() ?
>
> While doing this analysis, I found a refcount bug in llc, I'll send a
> patch for net-2.6

Oh well, of course I must first fix the bug in net-2.6, and wait for David
to pull the fix into net-next-2.6 before sending this rcu conversion.

Note: this patch should be sent to stable teams (2.6.34 and up)

[PATCH net-2.6] llc: fix a device refcount imbalance

commit abf9d537fea225 (llc: add support for SO_BINDTODEVICE) added one
refcount imbalance in llc_ui_bind(), because dev_getbyhwaddr() doesn't
take a reference on device, while dev_get_by_index() does.

Fix this using RCU locking. And since an RCU conversion will be done for
2.6.38 for dev_getbyhwaddr(), put the rcu_read_lock/unlock exactly at
their final place.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@kernel.org
Cc: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 582612998211..e35dbe55f520 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -317,8 +317,9 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 		goto out;
 	rc = -ENODEV;
 	rtnl_lock();
+	rcu_read_lock();
 	if (sk->sk_bound_dev_if) {
-		llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
+		llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
 		if (llc->dev) {
 			if (!addr->sllc_arphrd)
 				addr->sllc_arphrd = llc->dev->type;
@@ -329,13 +330,13 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 			    !llc_mac_match(addr->sllc_mac,
 					   llc->dev->dev_addr)) {
 				rc = -EINVAL;
-				dev_put(llc->dev);
 				llc->dev = NULL;
 			}
 		}
 	} else
 		llc->dev = dev_getbyhwaddr(&init_net, addr->sllc_arphrd,
 					   addr->sllc_mac);
+	rcu_read_unlock();
 	rtnl_unlock();
 	if (!llc->dev)
 		goto out;
-- 
cgit v1.2.3


From 0c62fc6dd02c8d793c75ae76a9b6881fc36388ad Mon Sep 17 00:00:00 2001
From: Nelson Elhage <nelhage@ksplice.com>
Date: Wed, 8 Dec 2010 10:13:55 -0800
Subject: econet: Do the correct cleanup after an unprivileged SIOCSIFADDR.

We need to drop the mutex and do a dev_put, so set an error code and break like
the other paths, instead of returning directly.

Signed-off-by: Nelson Elhage <nelhage@ksplice.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/econet/af_econet.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 13992e1d2726..f180371fa415 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -661,8 +661,10 @@ static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
 	err = 0;
 	switch (cmd) {
 	case SIOCSIFADDR:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
 
 		edev = dev->ec_ptr;
 		if (edev == NULL) {
-- 
cgit v1.2.3


From e8d34a884e4ff118920bb57664def8a73b1b784f Mon Sep 17 00:00:00 2001
From: Michal Marek <mmarek@suse.cz>
Date: Mon, 6 Dec 2010 02:39:12 +0000
Subject: l2tp: Fix modalias of l2tp_ip

Using the SOCK_DGRAM enum results in
"net-pf-2-proto-SOCK_DGRAM-type-115", so use the numeric value like it
is done in net/dccp.

Signed-off-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 0bf6a59545ab..522e219f3558 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -674,4 +674,8 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
 MODULE_DESCRIPTION("L2TP over IP");
 MODULE_VERSION("1.0");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, SOCK_DGRAM, IPPROTO_L2TP);
+
+/* Use the value of SOCK_DGRAM (2) directly, because __stringify doesn't like
+ * enums
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
-- 
cgit v1.2.3


From 171995e5d82dcc92bea37a7d2a2ecc21068a0f19 Mon Sep 17 00:00:00 2001
From: Apollon Oikonomopoulos <apollon@noc.grnet.gr>
Date: Tue, 7 Dec 2010 09:43:30 +0000
Subject: x25: decrement netdev reference counts on unload

x25 does not decrement the network device reference counts on module unload.
Thus unregistering any pre-existing interface after unloading the x25 module
hangs and results in

 unregister_netdevice: waiting for tap0 to become free. Usage count = 1

This patch decrements the reference counts of all interfaces in x25_link_free,
the way it is already done in x25_link_device_down for NETDEV_DOWN events.

Signed-off-by: Apollon Oikonomopoulos <apollon@noc.grnet.gr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/x25_link.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index 73e7b954ad28..b25c6463c3e9 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -394,6 +394,7 @@ void __exit x25_link_free(void)
 	list_for_each_safe(entry, tmp, &x25_neigh_list) {
 		nb = list_entry(entry, struct x25_neigh, node);
 		__x25_remove_neigh(nb);
+		dev_put(nb->dev);
 	}
 	write_unlock_bh(&x25_neigh_list_lock);
 }
-- 
cgit v1.2.3


From 67631510a318d5a930055fe927607f483716e100 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Wed, 8 Dec 2010 12:16:33 -0800
Subject: tcp: Replace time wait bucket msg by counter

Rather than printing the message to the log, use a mib counter to keep
track of the count of occurrences of time wait bucket overflow.  Reduces
spam in logs.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h     | 1 +
 net/ipv4/proc.c          | 1 +
 net/ipv4/tcp_minisocks.c | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index ebb0c80ffd6e..12b2b18e50c1 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -230,6 +230,7 @@ enum
 	LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */
 	LINUX_MIB_TCPDEFERACCEPTDROP,
 	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
+	LINUX_MIB_TCPTIMEWAITOVERFLOW,		/* TCPTimeWaitOverflow */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1b48eb1ed453..b14ec7d03b6e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 43cf901d7659..a66735f75963 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -347,7 +347,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 * socket up.  We've got bigger problems than
 		 * non-graceful socket closings.
 		 */
-		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
 	}
 
 	tcp_update_metrics(sk);
-- 
cgit v1.2.3


From ad9f4f50fe9288bbe65b7dfd76d8820afac6a24c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 7 Dec 2010 12:03:55 +0000
Subject: tcp: avoid a possible divide by zero

sysctl_tcp_tso_win_divisor might be set to zero while one cpu runs in
tcp_tso_should_defer(). Make sure we don't allow a divide by zero by
reading sysctl_tcp_tso_win_divisor exactly once.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3c59ab42df2b..0d4a3cebfb46 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1512,6 +1512,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
+	int win_divisor;
 
 	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
@@ -1543,13 +1544,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
 		goto send_now;
 
-	if (sysctl_tcp_tso_win_divisor) {
+	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+	if (win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
 		/* If at least some fraction of a window is available,
 		 * just use it.
 		 */
-		chunk /= sysctl_tcp_tso_win_divisor;
+		chunk /= win_divisor;
 		if (limit >= chunk)
 			goto send_now;
 	} else {
-- 
cgit v1.2.3


From f19872575ff7819a3723154657a497d9bca66b33 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 7 Dec 2010 12:20:47 +0000
Subject: tcp: protect sysctl_tcp_cookie_size reads

Make sure sysctl_tcp_cookie_size is read once in
tcp_cookie_size_check(), or we might return an illegal value to caller
if sysctl_tcp_cookie_size is changed by another cpu.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: William Allen Simpson <william.allen.simpson@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0d4a3cebfb46..61c2463e2753 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,27 +385,30 @@ struct tcp_out_options {
  */
 static u8 tcp_cookie_size_check(u8 desired)
 {
-	if (desired > 0) {
+	int cookie_size;
+
+	if (desired > 0)
 		/* previously specified */
 		return desired;
-	}
-	if (sysctl_tcp_cookie_size <= 0) {
+
+	cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
+	if (cookie_size <= 0)
 		/* no default specified */
 		return 0;
-	}
-	if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
+
+	if (cookie_size <= TCP_COOKIE_MIN)
 		/* value too small, specify minimum */
 		return TCP_COOKIE_MIN;
-	}
-	if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
+
+	if (cookie_size >= TCP_COOKIE_MAX)
 		/* value too large, specify maximum */
 		return TCP_COOKIE_MAX;
-	}
-	if (0x1 & sysctl_tcp_cookie_size) {
+
+	if (cookie_size & 1)
 		/* 8-bit multiple, illegal, fix it */
-		return (u8)(sysctl_tcp_cookie_size + 0x1);
-	}
-	return (u8)sysctl_tcp_cookie_size;
+		cookie_size++;
+
+	return (u8)cookie_size;
 }
 
 /* Write previously computed TCP options to the packet.
-- 
cgit v1.2.3