From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:54 -0800
Subject: bpf: offload: move offload device validation out to the drivers

With TC shared block changes we can't depend on correct netdev
pointer being available in cls_bpf.  Move the device validation
to the driver.  Core will only make sure that offloaded programs
are always attached in the driver (or in HW by the driver).  We
trust that drivers which implement offload callbacks will perform
necessary checks.

Moving the checks to the driver is generally a useful thing,
in practice the check should be against a switchdev instance,
not a netdev, given that most ASICs will probably allow using
the same program on many ports.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c      | 7 ++-----
 net/sched/cls_bpf.c | 8 +++-----
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8ee29f4f5fa9..09525a27319c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7139,11 +7139,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		    __dev_xdp_attached(dev, bpf_op, NULL))
 			return -EBUSY;
 
-		if (bpf_op == ops->ndo_bpf)
-			prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
-						     dev);
-		else
-			prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+					     bpf_op == ops->ndo_bpf);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fb680dafac5a..a9f3e317055c 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 {
 	struct bpf_prog *fp;
 	char *name = NULL;
+	bool skip_sw;
 	u32 bpf_fd;
 
 	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
+	skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW;
 
-	if (gen_flags & TCA_CLS_FLAGS_SKIP_SW)
-		fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS,
-					   qdisc_dev(tp->q));
-	else
-		fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
+	fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
-- 
cgit v1.2.3


From 441a33031fe5a3e828fbc17a4f9a5bab9143243c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:55 -0800
Subject: net: xdp: don't allow device-bound programs in driver mode

Currently device-bound programs are not able to run on the host
to save resources (host JIT is not invoked).  Don't allow XDP
programs to be attached without the HW_MODE flag.  In theory
if program is already translated for device offload the driver
should choose to offload it instead of loading it in the driver.
However, offloading translated program may still fail resulting
in device-bound program being run on the host.

Prevent this by refusing to attach device bound programs if
XDP_FLAGS_HW_MODE is not set.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09525a27319c..5e2ba133fba7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7143,6 +7143,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 					     bpf_op == ops->ndo_bpf);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
+
+		if (!(flags & XDP_FLAGS_HW_MODE) &&
+		    bpf_prog_is_dev_bound(prog->aux)) {
+			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
+			bpf_prog_put(prog);
+			return -EINVAL;
+		}
 	}
 
 	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
-- 
cgit v1.2.3


From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Wed, 22 Nov 2017 18:32:53 +0000
Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL

With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, an helper
argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO
and the verifier can prove the value of this next argument is 0. However,
most helpers are just interested in handling <!NULL, 0>, so forcing them to
deal with <NULL, 0> makes the implementation of those helpers more
complicated for no apparent benefits, requiring them to explicitly handle
those corner cases with checks that bpf programs could start relying upon,
preventing the possibility of removing them later.

Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL
even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type
ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case.

Currently, the only helper that needs this is bpf_csum_diff_proto(), so
change arg1 and arg3 to this new type as well.

Also add a new battery of tests that explicitly test the
!ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the
various <NULL, 0> variations are focused on bpf_csum_diff, so cover also
other helpers.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h                         |   1 +
 kernel/bpf/verifier.c                       |   4 +-
 net/core/filter.c                           |   4 +-
 tools/testing/selftests/bpf/test_verifier.c | 113 ++++++++++++++++++++++++++--
 4 files changed, 112 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 76c577281d78..e55e4255a210 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -78,6 +78,7 @@ enum bpf_arg_type {
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
 	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
 				 * helper function must fill all bytes or clear
 				 * them in error case.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dd54d20ace2f..308b0638ec5d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		if (type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
 		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a SCALAR_VALUE type. Final test
 		 * happens during stack boundary checking.
 		 */
-		if (register_is_null(*reg))
+		if (register_is_null(*reg) &&
+		    arg_type == ARG_PTR_TO_MEM_OR_NULL)
 			/* final test in check_stack_boundary() */;
 		else if (!type_is_pkt_pointer(type) &&
 			 type != PTR_TO_MAP_VALUE &&
diff --git a/net/core/filter.c b/net/core/filter.c
index 1afa17935954..6a85e67fafce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.gpl_only	= false,
 	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_PTR_TO_MEM_OR_NULL,
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg5_type	= ARG_ANYTHING,
 };
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 2a5267bef160..3c64f30cf63c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -5631,7 +5631,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on NULL",
+		"helper access to variable memory: size = 0 allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_1, 0),
 			BPF_MOV64_IMM(BPF_REG_2, 0),
@@ -5645,7 +5645,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size > 0 not allowed on NULL",
+		"helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_1, 0),
 			BPF_MOV64_IMM(BPF_REG_2, 0),
@@ -5663,7 +5663,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on != NULL stack pointer",
+		"helper access to variable memory: size = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
@@ -5680,7 +5680,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on != NULL map pointer",
+		"helper access to variable memory: size = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5702,7 +5702,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5727,7 +5727,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5750,7 +5750,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL packet pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL packet pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
 				    offsetof(struct __sk_buff, data)),
@@ -5771,6 +5771,105 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
+	{
+		"helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R1 type=inv expected=fp",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size > 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R1 type=inv expected=fp",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 	{
 		"helper access to variable memory: 8 bytes leak",
 		.insns = {
-- 
cgit v1.2.3


From d7aa04a5e82b4f254d306926c81eae8df69e5200 Mon Sep 17 00:00:00 2001
From: Roman Kapl <code@rkapl.cz>
Date: Mon, 20 Nov 2017 22:21:13 +0100
Subject: net: sched: fix crash when deleting secondary chains

If you flush (delete) a filter chain other than chain 0 (such as when
deleting the device), the kernel may run into a use-after-free. The
chain refcount must not be decremented unless we are sure we are done
with the chain.

To reproduce the bug, run:
    ip link add dtest type dummy
    tc qdisc add dev dtest ingress
    tc filter add dev dtest chain 1  parent ffff: flower
    ip link del dtest

Introduced in: commit f93e1cdcf42c ("net/sched: fix filter flushing"),
but unless you have KAsan or luck, you won't notice it until
commit 0dadc117ac8b ("cls_flower: use tcf_exts_get_net() before call_rcu()")

Fixes: f93e1cdcf42c ("net/sched: fix filter flushing")
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Roman Kapl <code@rkapl.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ab255b421781..7d97f612c9b9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -205,13 +205,14 @@ static void tcf_chain_head_change(struct tcf_chain *chain,
 
 static void tcf_chain_flush(struct tcf_chain *chain)
 {
-	struct tcf_proto *tp;
+	struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
 
 	tcf_chain_head_change(chain, NULL);
-	while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
+	while (tp) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);
-		tcf_chain_put(chain);
 		tcf_proto_destroy(tp);
+		tp = rtnl_dereference(chain->filter_chain);
+		tcf_chain_put(chain);
 	}
 }
 
-- 
cgit v1.2.3


From bbfcd77631573ac4a9f57eb6169e04256a111bc1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 21 Nov 2017 09:50:12 +0200
Subject: ipv6: Do not consider linkdown nexthops during multipath

When the 'ignore_routes_with_linkdown' sysctl is set, we should not
consider linkdown nexthops during route lookup.

While the code correctly verifies that the initially selected route
('match') has a carrier, it does not perform the same check in the
subsequent multipath selection, resulting in a potential packet loss.

In case the chosen route does not have a carrier and the sysctl is set,
choose the initially selected route.

Fixes: 35103d11173b ("net: ipv6 sysctl option to ignore routes when nexthop link is down")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Acked-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 05eb7bc36156..0363db914c7a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -472,6 +472,11 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 				&match->rt6i_siblings, rt6i_siblings) {
 			route_choosen--;
 			if (route_choosen == 0) {
+				struct inet6_dev *idev = sibling->rt6i_idev;
+
+				if (!netif_carrier_ok(sibling->dst.dev) &&
+				    idev->cnf.ignore_routes_with_linkdown)
+					break;
 				if (rt6_score_route(sibling, oif, strict) < 0)
 					break;
 				match = sibling;
-- 
cgit v1.2.3


From 4e1061f4a2bba1669c7297455c73ddafbebf2b12 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ursula.braun@de.ibm.com>
Date: Tue, 21 Nov 2017 13:23:53 +0100
Subject: net/smc: use sk_rcvbuf as start for rmb creation

Commit 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers")
merged handling of SMC receive and send buffers. It introduced sk_buf_size
as merged start value for size determination. But since sk_buf_size is not
used at all, sk_sndbuf is erroneously used as start for rmb creation.
This patch makes sure, sk_buf_size is really used as intended, and
sk_rcvbuf is used as start value for rmb creation.

Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers")
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reviewed-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2578fbd95664..3b5e5d4bc763 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -575,7 +575,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 		/* use socket send buffer size (w/o overhead) as start value */
 		sk_buf_size = smc->sk.sk_sndbuf / 2;
 
-	for (bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
 	     bufsize_short >= 0; bufsize_short--) {
 
 		if (is_rmb) {
-- 
cgit v1.2.3


From 688703702584dd513b50001bd1eb068655631e9b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 21 Nov 2017 13:23:54 +0100
Subject: net/smc: Fix preinitialization of buf_desc in __smc_buf_create()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

    net/smc/smc_core.c: In function ‘__smc_buf_create’:
    net/smc/smc_core.c:567: warning: ‘bufsize’ may be used uninitialized in this function

Indeed, if the for-loop is never executed, bufsize is used
uninitialized.  In addition, buf_desc is stored for later use, while it
is still a NULL pointer.

Before, error handling was done by checking if buf_desc is non-NULL.
The cleanup changed this to an error check, but forgot to update the
preinitialization of buf_desc to an error pointer.

Update the preinitializatin of buf_desc to fix this.

Fixes: b33982c3a6838d13 ("net/smc: cleanup function __smc_buf_create()")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 3b5e5d4bc763..94f21116dac5 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -562,7 +562,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 {
 	struct smc_connection *conn = &smc->conn;
 	struct smc_link_group *lgr = conn->lgr;
-	struct smc_buf_desc *buf_desc = NULL;
+	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
 	struct list_head *buf_list;
 	int bufsize, bufsize_short;
 	int sk_buf_size;
-- 
cgit v1.2.3


From 98d11291d189cb5adf49694d0ad1b971c0212697 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 21 Nov 2017 07:08:57 -0800
Subject: net: ipv6: Fixup device for anycast routes during copy

Florian reported a breakage with anycast routes due to commit
4832c30d5458 ("net: ipv6: put host and anycast routes on device with
address"). Prior to this commit anycast routes were added against the
loopback device causing repetitive route entries with no insight into
why they existed. e.g.:
  $ ip -6 ro ls  table local type anycast
  anycast 2001:db8:1:: dev lo proto kernel metric 0 pref medium
  anycast 2001:db8:2:: dev lo proto kernel metric 0 pref medium
  anycast fe80:: dev lo proto kernel metric 0 pref medium
  anycast fe80:: dev lo proto kernel metric 0 pref medium

The point of commit 4832c30d5458 is to add the routes using the device
with the address which is causing the route to be added. e.g.,:
  $ ip -6 ro ls  table local type anycast
  anycast 2001:db8:1:: dev eth1 proto kernel metric 0 pref medium
  anycast 2001:db8:2:: dev eth2 proto kernel metric 0 pref medium
  anycast fe80:: dev eth2 proto kernel metric 0 pref medium
  anycast fe80:: dev eth1 proto kernel metric 0 pref medium

For traffic to work as it did before, the dst device needs to be switched
to the loopback when the copy is created similar to local routes.

Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0363db914c7a..7a8d1500d374 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1024,7 +1024,7 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
 {
 	struct net_device *dev = rt->dst.dev;
 
-	if (rt->rt6i_flags & RTF_LOCAL) {
+	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
 		/* for copies of local routes, dst->dev needs to be the
 		 * device if it is a master device, the master device if
 		 * device is enslaved, and the loopback as the default
-- 
cgit v1.2.3


From 0c19f846d582af919db66a5914a0189f9f92c936 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 21 Nov 2017 10:22:25 -0500
Subject: net: accept UFO datagrams from tuntap and packet

Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.

Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.

Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.

It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.

To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").

(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.

Tested
  Booted a v4.13 guest kernel with QEMU. On a host kernel before this
  patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
  enabled, same as on a v4.13 host kernel.

  A UFO packet sent from the guest appears on the tap device:
    host:
      nc -l -p -u 8000 &
      tcpdump -n -i tap0

    guest:
      dd if=/dev/zero of=payload.txt bs=1 count=2000
      nc -u 192.16.1.1 8000 < payload.txt

  Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
  packets arriving fragmented:

    ./with_tap_pair.sh ./tap_send_ufo tap0 tap1
    (from https://github.com/wdebruij/kerneltools/tree/master/tests)

Changes
  v1 -> v2
    - simplified set_offload change (review comment)
    - documented test procedure

Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tap.c               |  2 +-
 drivers/net/tun.c               |  2 +
 include/linux/netdev_features.h |  4 +-
 include/linux/netdevice.h       |  1 +
 include/linux/skbuff.h          |  2 +
 include/linux/virtio_net.h      |  5 ++-
 include/net/ipv6.h              |  1 +
 net/core/dev.c                  |  3 +-
 net/ipv4/af_inet.c              | 12 +++++-
 net/ipv4/udp_offload.c          | 49 ++++++++++++++++++++++--
 net/ipv6/output_core.c          | 31 +++++++++++++++
 net/ipv6/udp_offload.c          | 85 +++++++++++++++++++++++++++++++++++++++--
 net/openvswitch/datapath.c      | 14 +++++++
 net/openvswitch/flow.c          |  6 ++-
 net/sched/act_csum.c            |  6 +++
 15 files changed, 209 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index b13890953ebb..e9489b88407c 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1077,7 +1077,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 	case TUNSETOFFLOAD:
 		/* let the user check for future flags */
 		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			    TUN_F_TSO_ECN))
+			    TUN_F_TSO_ECN | TUN_F_UFO))
 			return -EINVAL;
 
 		rtnl_lock();
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5a2ea78a008f..6a7bde9bc4b2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2370,6 +2370,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
 				features |= NETIF_F_TSO6;
 			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
 		}
+
+		arg &= ~TUN_F_UFO;
 	}
 
 	/* This gives the user a way to test for new features in future by
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index dc8b4896b77b..b1b0ca7ccb2b 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -54,8 +54,9 @@ enum {
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	NETIF_F_GSO_SCTP_BIT,		/* ... SCTP fragmentation */
 	NETIF_F_GSO_ESP_BIT,		/* ... ESP with TSO */
+	NETIF_F_GSO_UDP_BIT,		/* ... UFO, deprecated except tuntap */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_ESP_BIT,
+		NETIF_F_GSO_UDP_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
@@ -132,6 +133,7 @@ enum {
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_GSO_SCTP	__NETIF_F(GSO_SCTP)
 #define NETIF_F_GSO_ESP		__NETIF_F(GSO_ESP)
+#define NETIF_F_GSO_UDP		__NETIF_F(GSO_UDP)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6b274bfe489f..ef789e1d679e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4140,6 +4140,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ed06e1c28fc7..bc486ef23f20 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -568,6 +568,8 @@ enum {
 	SKB_GSO_SCTP = 1 << 14,
 
 	SKB_GSO_ESP = 1 << 15,
+
+	SKB_GSO_UDP = 1 << 16,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 210034c896e3..f144216febc6 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 					const struct virtio_net_hdr *hdr,
 					bool little_endian)
 {
-	unsigned short gso_type = 0;
+	unsigned int gso_type = 0;
 
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
@@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		case VIRTIO_NET_HDR_GSO_TCPV6:
 			gso_type = SKB_GSO_TCPV6;
 			break;
+		case VIRTIO_NET_HDR_GSO_UDP:
+			gso_type = SKB_GSO_UDP;
+			break;
 		default:
 			return -EINVAL;
 		}
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index ec14f0d5a3a1..f73797e2fa60 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -767,6 +767,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 __be32 ipv6_select_ident(struct net *net,
 			 const struct in6_addr *daddr,
 			 const struct in6_addr *saddr);
+__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ee29f4f5fa9..bbba19112f02 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
 	if (tx_path)
-		return skb->ip_summed != CHECKSUM_PARTIAL;
+		return skb->ip_summed != CHECKSUM_PARTIAL &&
+		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 
 	return skb->ip_summed == CHECKSUM_NONE;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ce4aa827be05..f00499a46927 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features)
 {
-	bool fixedid = false, gso_partial, encap;
+	bool udpfrag = false, fixedid = false, gso_partial, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
+	unsigned int offset = 0;
 	struct iphdr *iph;
 	int proto, tot_len;
 	int nhoff;
@@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
 	if (!skb->encapsulation || encap) {
+		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
 		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
 
 		/* fixed ID is invalid if DF bit is not set */
@@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
-		if (skb_is_gso(skb)) {
+		if (udpfrag) {
+			iph->frag_off = htons(offset >> 3);
+			if (skb->next)
+				iph->frag_off |= htons(IP_MF);
+			offset += skb->len - nhoff - ihl;
+			tot_len = skb->len - nhoff;
+		} else if (skb_is_gso(skb)) {
 			if (!fixedid) {
 				iph->id = htons(id);
 				id += skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index e360d55be555..01801b77bd0d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,16 +187,57 @@ out_unlock:
 }
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
-static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb,
-					   netdev_features_t features)
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+					 netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int mss;
+	__wsum csum;
+	struct udphdr *uh;
+	struct iphdr *iph;
 
 	if (skb->encapsulation &&
 	    (skb_shinfo(skb)->gso_type &
-	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)))
+	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
 		segs = skb_udp_tunnel_segment(skb, features, false);
+		goto out;
+	}
+
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto out;
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))
+		goto out;
+
+	/* Do software UFO. Complete and fill in the UDP checksum as
+	 * HW cannot do checksum of UDP packets sent as multiple
+	 * IP fragments.
+	 */
 
+	uh = udp_hdr(skb);
+	iph = ip_hdr(skb);
+
+	uh->check = 0;
+	csum = skb_checksum(skb, 0, skb->len, 0);
+	uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+	if (uh->check == 0)
+		uh->check = CSUM_MANGLED_0;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* If there is no outer header we can fake a checksum offload
+	 * due to the fact that we have already done the checksum in
+	 * software prior to segmenting the frame.
+	 */
+	if (!skb->encap_hdr_csum)
+		features |= NETIF_F_HW_CSUM;
+
+	/* Fragment the skb. IP headers of the fragments are updated in
+	 * inet_gso_segment()
+	 */
+	segs = skb_segment(skb, features);
+out:
 	return segs;
 }
 
@@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 
 static const struct net_offload udpv4_offload = {
 	.callbacks = {
-		.gso_segment = udp4_tunnel_segment,
+		.gso_segment = udp4_ufo_fragment,
 		.gro_receive  =	udp4_gro_receive,
 		.gro_complete =	udp4_gro_complete,
 	},
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 4a7e5ffa5108..4fe7c90962dd 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
 	return id;
 }
 
+/* This function exists only for tap drivers that must support broken
+ * clients requesting UFO without specifying an IPv6 fragment ID.
+ *
+ * This is similar to ipv6_select_ident() but we use an independent hash
+ * seed to limit information leakage.
+ *
+ * The network header must be set before calling this.
+ */
+__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
+{
+	static u32 ip6_proxy_idents_hashrnd __read_mostly;
+	struct in6_addr buf[2];
+	struct in6_addr *addrs;
+	u32 id;
+
+	addrs = skb_header_pointer(skb,
+				   skb_network_offset(skb) +
+				   offsetof(struct ipv6hdr, saddr),
+				   sizeof(buf), buf);
+	if (!addrs)
+		return 0;
+
+	net_get_random_once(&ip6_proxy_idents_hashrnd,
+			    sizeof(ip6_proxy_idents_hashrnd));
+
+	id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
+				 &addrs[1], &addrs[0]);
+	return htonl(id);
+}
+EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
+
 __be32 ipv6_select_ident(struct net *net,
 			 const struct in6_addr *daddr,
 			 const struct in6_addr *saddr)
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 455fd4e39333..a0f89ad76f9d 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,15 +17,94 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb,
-					   netdev_features_t features)
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
+					 netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int mss;
+	unsigned int unfrag_ip6hlen, unfrag_len;
+	struct frag_hdr *fptr;
+	u8 *packet_start, *prevhdr;
+	u8 nexthdr;
+	u8 frag_hdr_sz = sizeof(struct frag_hdr);
+	__wsum csum;
+	int tnl_hlen;
+	int err;
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))
+		goto out;
 
 	if (skb->encapsulation && skb_shinfo(skb)->gso_type &
 	    (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
 		segs = skb_udp_tunnel_segment(skb, features, true);
+	else {
+		const struct ipv6hdr *ipv6h;
+		struct udphdr *uh;
+
+		if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+			goto out;
+
+		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+		 * do checksum of UDP packets sent as multiple IP fragments.
+		 */
+
+		uh = udp_hdr(skb);
+		ipv6h = ipv6_hdr(skb);
+
+		uh->check = 0;
+		csum = skb_checksum(skb, 0, skb->len, 0);
+		uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
+					  &ipv6h->daddr, csum);
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		/* If there is no outer header we can fake a checksum offload
+		 * due to the fact that we have already done the checksum in
+		 * software prior to segmenting the frame.
+		 */
+		if (!skb->encap_hdr_csum)
+			features |= NETIF_F_HW_CSUM;
+
+		/* Check if there is enough headroom to insert fragment header. */
+		tnl_hlen = skb_tnl_header_len(skb);
+		if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
+			if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+				goto out;
+		}
+
+		/* Find the unfragmentable header and shift it left by frag_hdr_sz
+		 * bytes to insert fragment header.
+		 */
+		err = ip6_find_1stfragopt(skb, &prevhdr);
+		if (err < 0)
+			return ERR_PTR(err);
+		unfrag_ip6hlen = err;
+		nexthdr = *prevhdr;
+		*prevhdr = NEXTHDR_FRAGMENT;
+		unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+			     unfrag_ip6hlen + tnl_hlen;
+		packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+		memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+		SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+		skb->mac_header -= frag_hdr_sz;
+		skb->network_header -= frag_hdr_sz;
+
+		fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+		fptr->nexthdr = nexthdr;
+		fptr->reserved = 0;
+		fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb);
+
+		/* Fragment the skb. ipv6 header and the remaining fields of the
+		 * fragment header are updated in ipv6_gso_segment()
+		 */
+		segs = skb_segment(skb, features);
+	}
 
+out:
 	return segs;
 }
 
@@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 
 static const struct net_offload udpv6_offload = {
 	.callbacks = {
-		.gso_segment	=	udp6_tunnel_segment,
+		.gso_segment	=	udp6_ufo_fragment,
 		.gro_receive	=	udp6_gro_receive,
 		.gro_complete	=	udp6_gro_complete,
 	},
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0dab33fb9844..99cfafc2a139 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 			     const struct dp_upcall_info *upcall_info,
 				 uint32_t cutlen)
 {
+	unsigned short gso_type = skb_shinfo(skb)->gso_type;
+	struct sw_flow_key later_key;
 	struct sk_buff *segs, *nskb;
 	int err;
 
@@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 	if (segs == NULL)
 		return -EINVAL;
 
+	if (gso_type & SKB_GSO_UDP) {
+		/* The initial flow key extracted by ovs_flow_key_extract()
+		 * in this case is for a first fragment, so we need to
+		 * properly mark later fragments.
+		 */
+		later_key = *key;
+		later_key.ip.frag = OVS_FRAG_TYPE_LATER;
+	}
+
 	/* Queue all of the segments. */
 	skb = segs;
 	do {
+		if (gso_type & SKB_GSO_UDP && skb != segs)
+			key = &later_key;
+
 		err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
 		if (err)
 			break;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 864ddb1e3642..dbe2379329c5 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 			key->ip.frag = OVS_FRAG_TYPE_LATER;
 			return 0;
 		}
-		if (nh->frag_off & htons(IP_MF))
+		if (nh->frag_off & htons(IP_MF) ||
+			skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
 		else
 			key->ip.frag = OVS_FRAG_TYPE_NONE;
@@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
 			return 0;
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+			key->ip.frag = OVS_FRAG_TYPE_FIRST;
+
 		/* Transport layer. */
 		if (key->ip.proto == NEXTHDR_TCP) {
 			if (tcphdr_ok(skb)) {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 1c40caadcff9..d836f998117b 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
 	const struct iphdr *iph;
 	u16 ul;
 
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+		return 1;
+
 	/*
 	 * Support both UDP and UDPLITE checksum algorithms, Don't use
 	 * udph->len to get the real length without any protocol check,
@@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
 	const struct ipv6hdr *ip6h;
 	u16 ul;
 
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+		return 1;
+
 	/*
 	 * Support both UDP and UDPLITE checksum algorithms, Don't use
 	 * udph->len to get the real length without any protocol check,
-- 
cgit v1.2.3