From 3e4e4c1f2da66b29ee9379ca29f8dd620c2b5a1f Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Sun, 13 Jan 2013 05:01:39 +0000 Subject: ipv6: Introduce ip6_flow_hdr() to fill version, tclass and flowlabel. This is not only for readability but also for optimization. What we do here is to build the 32bit word at the beginning of the ipv6 header (the "ip6_flow" virtual member of struct ip6_hdr in RFC3542) and we do not need to read the tclass portion of the target buffer. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5af66b26ebdd..fcbc6464b014 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -546,6 +546,15 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); +/* + * Header manipulation + */ +static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, + __be32 flowlabel) +{ + *(__be32 *)hdr = ntohl(0x60000000 | (tclass << 20)) | flowlabel; +} + /* * Prototypes exported by ipv6 */ -- cgit v1.2.3 From 6502ca527f8ed2c3bb327d9db8e7e6e7dcbef511 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Sun, 13 Jan 2013 05:01:51 +0000 Subject: ipv6: Introduce ip6_flowinfo() to extract flowinfo (tclass + flowlabel). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ net/ipv6/datagram.c | 9 +++++---- net/ipv6/route.c | 6 +++--- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fcbc6464b014..ed672080c690 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -555,6 +555,11 @@ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, *(__be32 *)hdr = ntohl(0x60000000 | (tclass << 20)) | flowlabel; } +static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) +{ + return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; +} + /* * Prototypes exported by ipv6 */ diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 56b692bb94a7..7f65df41c396 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -360,7 +360,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (np->sndflow) - sin->sin6_flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; + sin->sin6_flowinfo = ip6_flowinfo(ip6h); if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin->sin6_scope_id = IP6CB(skb)->iif; } else { @@ -491,9 +491,10 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } - if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { - __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; - put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); + if (flowinfo) + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 621b68ecf16f..6238eb5037a7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -994,7 +994,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, - .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; @@ -1159,7 +1159,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; - fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; + fl6.flowlabel = ip6_flowinfo(iph); dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1187,7 +1187,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; - fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; + fl6.flowlabel = ip6_flowinfo(iph); dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) -- cgit v1.2.3 From 9f2e73345ad9466618edfec35d98c074f2e0570d Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:09:54 +0000 Subject: ipv6: 64bit version of ipv6_addr_diff(). Introduce __ipv6_addr_diff64() to to find the first different bit between two addresses on 64bit architectures. 32bit version is still available as __ipv6_addr_diff32(), and __ipv6_addr_diff() automatically selects appropriate version. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ed672080c690..ace1113bef85 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -507,7 +507,7 @@ static inline void ipv6_addr_set_v4mapped(const __be32 addr, * find the first different bit between two addresses * length of address must be a multiple of 32bits */ -static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) +static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; @@ -539,6 +539,33 @@ static inline int __ipv6_addr_diff(const void *token1, const void *token2, int a return addrlen << 5; } +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) +{ + const __be64 *a1 = token1, *a2 = token2; + int i; + + addrlen >>= 3; + + for (i = 0; i < addrlen; i++) { + __be64 xb = a1[i] ^ a2[i]; + if (xb) + return i * 64 + 63 - __fls(be64_to_cpu(xb)); + } + + return addrlen << 6; +} +#endif + +static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + if (__builtin_constant_p(addrlen) && !(addrlen & 7)) + return __ipv6_addr_diff64(token1, token2, addrlen); +#endif + return __ipv6_addr_diff32(token1, token2, addrlen); +} + static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); -- cgit v1.2.3 From e287656b365f8db4486371c76ea54cbdbed7129c Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:10:06 +0000 Subject: ipv6: 64bit version of ipv6_addr_loopback(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ace1113bef85..c5049b019332 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -475,8 +475,14 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) static inline bool ipv6_addr_loopback(const struct in6_addr *a) { +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const unsigned long *ul = (const unsigned long *)a; + + return (ul[0] | (ul[1] ^ cpu_to_be64(1))) == 0UL; +#else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0; +#endif } static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) -- cgit v1.2.3 From a04d40b8956010d2a6a7abc9254792c8f8649242 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:10:14 +0000 Subject: ipv6: 64bit version of ipv6_addr_v4mapped(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c5049b019332..77cef3321571 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -487,8 +487,13 @@ static inline bool ipv6_addr_loopback(const struct in6_addr *a) static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { - return (a->s6_addr32[0] | a->s6_addr32[1] | - (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0; + return ( +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + *(__be64 *)a | +#else + (a->s6_addr32[0] | a->s6_addr32[1]) | +#endif + (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL; } /* -- cgit v1.2.3 From 5206c579da0f5b93c3ae17bd8010d8dbd9fcdbe9 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:10:24 +0000 Subject: ipv6: 64bit version of ipv6_addr_set(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 77cef3321571..fb7f7c645e15 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -355,14 +355,32 @@ static inline void ipv6_addr_prefix(struct in6_addr *pfx, pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } +static inline void __ipv6_addr_set_half(__be32 *addr, + __be32 wh, __be32 wl) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +#if defined(__BIG_ENDIAN) + if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { + *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); + return; + } +#elif defined(__LITTLE_ENDIAN) + if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { + *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); + return; + } +#endif +#endif + addr[0] = wh; + addr[1] = wl; +} + static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { - addr->s6_addr32[0] = w1; - addr->s6_addr32[1] = w2; - addr->s6_addr32[2] = w3; - addr->s6_addr32[3] = w4; + __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); + __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, -- cgit v1.2.3 From 2ef973320349cd536b33427aed98512e532d4cc6 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:10:31 +0000 Subject: ipv6: Remove __ipv6_prefix_equal(). ipv6_prefix_equal() just casts its arguments and it is the only user of __ipv6_prefix_equal(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb7f7c645e15..fe95053d02ce 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -399,9 +399,12 @@ static inline bool ipv6_addr_equal(const struct in6_addr *a1, #endif } -static inline bool __ipv6_prefix_equal(const __be32 *a1, const __be32 *a2, - unsigned int prefixlen) +static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, + const struct in6_addr *addr2, + unsigned int prefixlen) { + const __be32 *a1 = addr1->s6_addr32; + const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ @@ -417,14 +420,6 @@ static inline bool __ipv6_prefix_equal(const __be32 *a1, const __be32 *a2, return true; } -static inline bool ipv6_prefix_equal(const struct in6_addr *a1, - const struct in6_addr *a2, - unsigned int prefixlen) -{ - return __ipv6_prefix_equal(a1->s6_addr32, a2->s6_addr32, - prefixlen); -} - struct inet_frag_queue; enum ip6_defrag_users { -- cgit v1.2.3 From 38675170e4235f859bb85564c33ffa077a1c1ef7 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 14 Jan 2013 07:10:38 +0000 Subject: ipv6: 64bit version of ipv6_prefix_equal(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fe95053d02ce..dfc1363a97b5 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -399,6 +399,31 @@ static inline bool ipv6_addr_equal(const struct in6_addr *a1, #endif } +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, + const __be64 *a2, + unsigned int len) +{ + if (len && ((*a1 ^ *a2) & cpu_to_be64(~0UL) << (64 - len))) + return false; + return true; +} + +static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, + const struct in6_addr *addr2, + unsigned int prefixlen) +{ + const __be64 *a1 = (const __be64 *)addr1; + const __be64 *a2 = (const __be64 *)addr2; + + if (prefixlen >= 64) { + if (a1[0] ^ a2[0]) + return false; + return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); + } + return __ipv6_prefix_equal64_half(a1, a2, prefixlen); +} +#else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) @@ -419,6 +444,7 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, return true; } +#endif struct inet_frag_queue; -- cgit v1.2.3 From 07f623d3b278abe88508c1c59d7c1089b4a56789 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 17 Jan 2013 12:10:57 +0900 Subject: ipv6: Fix endianess warning in ip6_flow_hdr(). Commit 3e4e4c1f ("ipv6: Introduce ip6_flow_hdr() to fill version, tclass and flowlabel.) uses ntohl(), which should be htonl(). Found by Fengguang Wu . Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index dfc1363a97b5..a0909c79374b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -629,7 +629,7 @@ extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { - *(__be32 *)hdr = ntohl(0x60000000 | (tclass << 20)) | flowlabel; + *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) -- cgit v1.2.3 From c2a936600f78aea00d3312ea4b66a79a4619f9b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Jan 2013 07:16:35 +0000 Subject: net: increase fragment memory usage limits Increase the amount of memory usage limits for incomplete IP fragments. Arguing for new thresh high/low values: High threshold = 4 MBytes Low threshold = 3 MBytes The fragmentation memory accounting code, tries to account for the real memory usage, by measuring both the size of frag queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) and the SKB's truesize. We want to be able to handle/hold-on-to enough fragments, to ensure good performance, without causing incomplete fragments to hurt scalability, by causing the number of inet_frag_queue to grow too much (resulting longer searches for frag queues). For IPv4, how much memory does the largest frag consume. Maximum size fragment is 64K, which is approx 44 fragments with MTU(1500) sized packets. Sizeof(struct ipq) is 200. A 1500 byte packet results in a truesize of 2944 (not 2048 as I first assumed) (44*2944)+200 = 129736 bytes The current default high thresh of 262144 bytes, is obviously problematic, as only two 64K fragments can fit in the queue at the same time. How many 64K fragment can we fit into 4 MBytes: 4*2^20/((44*2944)+200) = 32.34 fragment in queues An attacker could send a separate/distinct fake fragment packets per queue, causing us to allocate one inet_frag_queue per packet, and thus attacking the hash table and its lists. How many frag queue do we need to store, and given a current hash size of 64, what is the average list length. Using one MTU sized fragment per inet_frag_queue, each consuming (2944+200) 3144 bytes. 4*2^20/(2944+200) = 1334 frag queues -> 21 avg list length An attack could send small fragments, the smallest packet I could send resulted in a truesize of 896 bytes (I'm a little surprised by this). 4*2^20/(896+200) = 3827 frag queues -> 59 avg list length When increasing these number, we also need to followup with improvements, that is going to help scalability. Simply increasing the hash size, is not enough as the current implementation does not have a per hash bucket locking. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- net/ipv4/ip_fragment.c | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a0909c79374b..51a068ad064b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -292,8 +292,8 @@ static inline int ip6_frag_mem(struct net *net) } #endif -#define IPV6_FRAG_HIGH_THRESH (256 * 1024) /* 262144 */ -#define IPV6_FRAG_LOW_THRESH (192 * 1024) /* 196608 */ +#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ +#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ extern int __ipv6_addr_type(const struct in6_addr *addr); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb9d63a570cd..f55a4e61bfb8 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -851,14 +851,22 @@ static inline void ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. + /* Fragment cache limits. + * + * The fragment memory accounting code, (tries to) account for + * the real memory usage, by measuring both the size of frag + * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) + * and the SKB's truesize. + * + * A 64K fragment consumes 129736 bytes (44*2944)+200 + * (1500 truesize == 2944, sizeof(struct ipq) == 200) + * + * We will commit 4MB at one time. Should we cross that limit + * we will prune down to 3MB, making room for approx 8 big 64K + * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 256 * 1024; - net->ipv4.frags.low_thresh = 192 * 1024; + net->ipv4.frags.high_thresh = 4 * 1024 * 1024; + net->ipv4.frags.low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival -- cgit v1.2.3 From 512613d7dde7b679597485e9335bd64e90df78fa Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Wed, 16 Jan 2013 22:30:17 +0100 Subject: ipv6: fix ipv6_prefix_equal64_half mask conversion Fix the 64bit optimized version of ipv6_prefix_equal to convert the bitmask to network byte order only after the bit-shift. The bug was introduced in: 3867517 ipv6: 64bit version of ipv6_prefix_equal(). Signed-off-by: Fabio Baltieri Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 51a068ad064b..464c6f70eca1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -404,7 +404,7 @@ static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { - if (len && ((*a1 ^ *a2) & cpu_to_be64(~0UL) << (64 - len))) + if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } -- cgit v1.2.3 From 2576f17dfad402e2446244238ed22dddf35c2e53 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 21 Jan 2013 06:48:19 +0000 Subject: ipv6: Unshare ip6_nd_hdr() and change return type to void. - move ip6_nd_hdr() to its users' source files. In net/ipv6/mcast.c, it will be called ip6_mc_hdr(). - make return type to void since this function never fails. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 ------- net/ipv6/ip6_output.c | 33 --------------------------------- net/ipv6/mcast.c | 29 +++++++++++++++++++++++++++-- net/ipv6/ndisc.c | 25 +++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 42 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 464c6f70eca1..c1878f7049c8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -661,13 +661,6 @@ extern int ip6_xmit(struct sock *sk, struct ipv6_txoptions *opt, int tclass); -extern int ip6_nd_hdr(struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len); - extern int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); extern int ip6_append_data(struct sock *sk, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b0895f5d5fc6..7eee94c27f8d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -254,39 +254,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, EXPORT_SYMBOL(ip6_xmit); -/* - * To avoid extra problems ND packets are send through this - * routine. It's code duplication but I really want to avoid - * extra checks since ipv6_build_header is used by TCP (which - * is for us performance critical) - */ - -int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, - const struct in6_addr *saddr, const struct in6_addr *daddr, - int proto, int len) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6hdr *hdr; - - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - - skb_reset_network_header(skb); - skb_put(skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(skb); - - ip6_flow_hdr(hdr, 0, 0); - - hdr->payload_len = htons(len); - hdr->nexthdr = proto; - hdr->hop_limit = np->hop_limit; - - hdr->saddr = *saddr; - hdr->daddr = *daddr; - - return 0; -} - static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 587a84530a57..f25002aaf624 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1313,6 +1313,31 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } +static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + ip6_flow_hdr(hdr, 0, 0); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = inet6_sk(sk)->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} + static struct sk_buff *mld_newpack(struct net_device *dev, int size) { struct net *net = dev_net(dev); @@ -1348,7 +1373,7 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); @@ -1740,7 +1765,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); + ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1776a0deee7f..7ce266f34cc7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -389,6 +389,31 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, return skb; } +static void ip6_nd_hdr(struct sock *sk, + struct sk_buff *skb, struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + ip6_flow_hdr(hdr, 0, 0); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = inet6_sk(sk)->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} + static struct sk_buff *ndisc_build_skb(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *saddr, -- cgit v1.2.3 From d433673e5f9180e05a770c4b2ab18c08ad51cc21 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 28 Jan 2013 23:45:12 +0000 Subject: net: frag helper functions for mem limit tracking This change is primarily a preparation to ease the extension of memory limit tracking. The change does reduce the number atomic operation, during freeing of a frag queue. This does introduce a some performance improvement, as these atomic operations are at the core of the performance problems seen on NUMA systems. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/inet_frag.h | 27 +++++++++++++++++++++++++++ include/net/ipv6.h | 2 +- net/ipv4/inet_fragment.c | 25 ++++++++++++------------- net/ipv4/ip_fragment.c | 24 +++++++++++------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++--- net/ipv6/reassembly.c | 6 +++--- 6 files changed, 57 insertions(+), 33 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 8e4c42523f59..f2fabc2a79de 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -79,4 +79,31 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f inet_frag_destroy(q, f, NULL); } +/* Memory Tracking Functions. */ + +static inline int frag_mem_limit(struct netns_frags *nf) +{ + return atomic_read(&nf->mem); +} + +static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i) +{ + atomic_sub(i, &q->net->mem); +} + +static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) +{ + atomic_add(i, &q->net->mem); +} + +static inline void init_frag_mem_limit(struct netns_frags *nf) +{ + atomic_set(&nf->mem, 0); +} + +static inline int sum_frag_mem_limit(struct netns_frags *nf) +{ + return atomic_read(&nf->mem); +} + #endif diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c1878f7049c8..dc30b60975ef 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,7 @@ static inline int ip6_frag_nqueues(struct net *net) static inline int ip6_frag_mem(struct net *net) { - return atomic_read(&net->ipv6.frags.mem); + return sum_frag_mem_limit(&net->ipv6.frags); } #endif diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4750d2b74d79..e348c849c5a3 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -73,7 +73,7 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { nf->nqueues = 0; - atomic_set(&nf->mem, 0); + init_frag_mem_limit(nf); INIT_LIST_HEAD(&nf->lru_list); } EXPORT_SYMBOL(inet_frags_init_net); @@ -117,12 +117,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb, int *work) + struct sk_buff *skb) { - if (work) - *work -= skb->truesize; - - atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -133,6 +129,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, { struct sk_buff *fp; struct netns_frags *nf; + unsigned int sum, sum_truesize = 0; WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -143,13 +140,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(nf, f, fp, work); + sum_truesize += fp->truesize; + frag_kfree_skb(nf, f, fp); fp = xp; } - + sum = sum_truesize + f->qsize; if (work) - *work -= f->qsize; - atomic_sub(f->qsize, &nf->mem); + *work -= sum; + sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); @@ -164,11 +162,11 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) int work, evicted = 0; if (!force) { - if (atomic_read(&nf->mem) <= nf->high_thresh) + if (frag_mem_limit(nf) <= nf->high_thresh) return 0; } - work = atomic_read(&nf->mem) - nf->low_thresh; + work = frag_mem_limit(nf) - nf->low_thresh; while (work > 0) { read_lock(&f->lock); if (list_empty(&nf->lru_list)) { @@ -250,7 +248,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, q->net = nf; f->constructor(q, arg); - atomic_add(f->qsize, &nf->mem); + add_frag_mem_limit(q, f->qsize); + setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f55a4e61bfb8..927fe58caf46 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -122,7 +122,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -161,13 +161,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); @@ -340,6 +333,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -349,9 +343,12 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; @@ -496,7 +493,8 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -519,7 +517,7 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - atomic_add(skb->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; @@ -617,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } skb_push(head, head->data - skb_network_header(head)); @@ -645,7 +643,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &qp->q.net->mem); + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3dacecc99065..07ef2949c946 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -319,7 +319,7 @@ found: fq->q.meat += skb->len; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - atomic_add(skb->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -398,7 +398,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -422,7 +422,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } - atomic_sub(head->truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, head->truesize); head->local_df = 1; head->next = NULL; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e5253ec9e0fc..18cb8defcdb2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -327,7 +327,7 @@ found: } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -429,7 +429,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -467,7 +467,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, sum_truesize); head->next = NULL; head->dev = dev; -- cgit v1.2.3 From d3aedd5ebd4b0b925b0bcda548066803e1318499 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Wed, 30 Jan 2013 09:27:47 +0000 Subject: ipv6 flowlabel: Convert hash list to RCU. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 94 +++++++++++++++++++++++++++--------------------- 2 files changed, 55 insertions(+), 40 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index dc30b60975ef..1d457161def2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -222,6 +222,7 @@ struct ip6_flowlabel { struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; + struct rcu_head rcu; u8 share; union { struct pid *pid; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 5d767f1b8780..da156015d827 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -51,25 +51,33 @@ #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); -static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; +static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); /* FL hash table lock: it protects only of GC */ -static DEFINE_RWLOCK(ip6_fl_lock); +static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_RWLOCK(ip6_sk_fl_lock); +#define for_each_fl_rcu(hash, fl) \ + for (fl = rcu_dereference(fl_ht[(hash)]); \ + fl != NULL; \ + fl = rcu_dereference(fl->next)) +#define for_each_fl_continue_rcu(fl) \ + for (fl = rcu_dereference(fl->next); \ + fl != NULL; \ + fl = rcu_dereference(fl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } @@ -80,11 +88,11 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); fl = __fl_lookup(net, label); - if (fl) - atomic_inc(&fl->users); - read_unlock_bh(&ip6_fl_lock); + if (fl && !atomic_inc_not_zero(&fl->users)) + fl = NULL; + rcu_read_unlock_bh(); return fl; } @@ -96,13 +104,13 @@ static void fl_free(struct ip6_flowlabel *fl) put_pid(fl->owner.pid); release_net(fl->fl_net); kfree(fl->opt); + kfree_rcu(fl, rcu); } - kfree(fl); } static void fl_release(struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { @@ -119,7 +127,7 @@ static void fl_release(struct ip6_flowlabel *fl) time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(unsigned long dummy) @@ -128,12 +136,13 @@ static void ip6_fl_gc(unsigned long dummy) unsigned long now = jiffies; unsigned long sched = 0; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i=0; i<=FL_HASH_MASK; i++) { struct ip6_flowlabel *fl, **flp; flp = &fl_ht[i]; - while ((fl=*flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) @@ -156,18 +165,19 @@ static void ip6_fl_gc(unsigned long dummy) if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl, **flp; flp = &fl_ht[i]; - while ((fl = *flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; @@ -178,7 +188,7 @@ static void __net_exit ip6_fl_purge(struct net *net) flp = &fl->next; } } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, @@ -188,7 +198,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->label = label & IPV6_FLOWLABEL_MASK; - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; @@ -210,16 +220,16 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl != NULL) { atomic_inc(&lfl->users); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; - fl_ht[FL_HASH(fl->label)] = fl; + rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return NULL; } @@ -650,13 +660,13 @@ static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { - fl = fl_ht[state->bucket]; - - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - if (fl) - break; + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } } + fl = NULL; +out: return fl; } @@ -665,18 +675,22 @@ static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flo struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); - fl = fl->next; + for_each_fl_continue_rcu(fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + try_again: - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - - while (!fl) { - if (++state->bucket <= FL_HASH_MASK) { - fl = fl_ht[state->bucket]; - goto try_again; - } else - break; + if (++state->bucket <= FL_HASH_MASK) { + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + goto try_again; } + fl = NULL; + +out: return fl; } @@ -690,9 +704,9 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(ip6_fl_lock) + __acquires(RCU) { - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -709,9 +723,9 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip6fl_seq_stop(struct seq_file *seq, void *v) - __releases(ip6_fl_lock) + __releases(RCU) { - read_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From 18367681a10bd29c3f2305e6b7b984de5b33d548 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Wed, 30 Jan 2013 09:27:52 +0000 Subject: ipv6 flowlabel: Convert np->ipv6_fl_list to RCU. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- include/net/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 72 +++++++++++++++++++++++++++--------------------- 3 files changed, 42 insertions(+), 33 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e971e3742172..850e95bc766c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -214,7 +214,7 @@ struct ipv6_pinfo { struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; - struct ipv6_fl_socklist *ipv6_fl_list; + struct ipv6_fl_socklist __rcu *ipv6_fl_list; struct ipv6_txoptions *opt; struct sk_buff *pktoptions; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 1d457161def2..851d5412a299 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -239,6 +239,7 @@ struct ip6_flowlabel { struct ipv6_fl_socklist { struct ipv6_fl_socklist *next; struct ip6_flowlabel *fl; + struct rcu_head rcu; }; extern struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index da156015d827..22494afd981c 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -62,7 +62,7 @@ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ -static DEFINE_RWLOCK(ip6_sk_fl_lock); +static DEFINE_SPINLOCK(ip6_sk_fl_lock); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ @@ -73,6 +73,11 @@ static DEFINE_RWLOCK(ip6_sk_fl_lock); fl != NULL; \ fl = rcu_dereference(fl->next)) +#define for_each_sk_fl_rcu(np, sfl) \ + for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ + sfl != NULL; \ + sfl = rcu_dereference_bh(sfl->next)) + static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; @@ -244,17 +249,17 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) label &= IPV6_FLOWLABEL_MASK; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label) { fl->lastuse = jiffies; atomic_inc(&fl->users); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return fl; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return NULL; } @@ -265,20 +270,21 @@ void fl6_free_socklist(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; - if (!np->ipv6_fl_list) + if (!rcu_access_pointer(np->ipv6_fl_list)) return; - write_lock_bh(&ipv6_sk_fl_lock); - sfl = np->ipv6_fl_list; - np->ipv6_fl_list = NULL; - write_unlock_bh(&ipv6_sk_fl_lock); + spin_lock_bh(&ip6_sk_fl_lock); + while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { + np->ipv6_fl_list = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); - while (sfl) { - struct ipv6_fl_socklist *next = sfl->next; fl_release(sfl->fl); - kfree(sfl); - sfl = next; + kfree_rcu(sfl, rcu); + + spin_lock_bh(&ip6_sk_fl_lock); } + spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ @@ -443,7 +449,7 @@ static int mem_check(struct sock *sk) if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + for_each_sk_fl_rcu(np, sfl) count++; if (room <= 0 || @@ -486,11 +492,11 @@ static bool ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_sk_fl_lock); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; - np->ipv6_fl_list = sfl; - write_unlock_bh(&ip6_sk_fl_lock); + rcu_assign_pointer(np->ipv6_fl_list, sfl); + spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) @@ -512,31 +518,33 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) switch (freq.flr_action) { case IPV6_FL_A_PUT: - write_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = rcu_dereference(*sflp))!=NULL; + sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - write_unlock_bh(&ip6_sk_fl_lock); + *sflp = rcu_dereference(sfl->next); + spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); return 0; } } - write_unlock_bh(&ip6_sk_fl_lock); + spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; case IPV6_FL_A_RENEW: - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return err; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (freq.flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { @@ -560,11 +568,11 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) if (freq.flr_label) { err = -EEXIST; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_flags&IPV6_FL_F_EXCL) { - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); goto done; } fl1 = sfl->fl; @@ -572,7 +580,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) break; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (fl1 == NULL) fl1 = fl_lookup(net, freq.flr_label); -- cgit v1.2.3 From 08dcdbf6a7b9d14c2302c5bd0c5390ddf122f664 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Feb 2013 12:18:52 +0000 Subject: ipv6: use a stronger hash for tcp It looks like its possible to open thousands of TCP IPv6 sessions on a server, all landing in a single slot of TCP hash table. Incoming packets have to lookup sockets in a very long list. We should hash all bits from foreign IPv6 addresses, using a salt and hash mix, not a simple XOR. inet6_ehashfn() can also separately use the ports, instead of xoring them. Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 8 ++++---- include/net/inet_sock.h | 1 + include/net/ipv6.h | 12 ++++++++++++ net/ipv4/af_inet.c | 9 +++++++-- 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/net/ipv6.h') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 7ca75cbbf75e..fd4ee016ba5c 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -28,16 +28,16 @@ struct inet_hashinfo; -/* I have no idea if this is a good hash for v6 or not. -DaveM */ static inline unsigned int inet6_ehashfn(struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { - u32 ports = (lport ^ (__force u16)fport); + u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words((__force u32)laddr->s6_addr32[3], - (__force u32)faddr->s6_addr32[3], - ports, inet_ehash_secret + net_hash_mix(net)); + ipv6_addr_jhash(faddr), + ports, + inet_ehash_secret + net_hash_mix(net)); } static inline int inet6_sk_ehashfn(const struct sock *sk) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a4196cbc84ec..7235ae73a1e8 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -203,6 +203,7 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, extern int inet_sk_rebuild_header(struct sock *sk); extern u32 inet_ehash_secret; +extern u32 ipv6_hash_secret; extern void build_ehash_secret(void); static inline unsigned int inet_ehashfn(struct net *net, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 851d5412a299..64d12e77719a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -514,6 +515,17 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) #endif } +/* more secured version of ipv6_addr_hash() */ +static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +{ + u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; + + return jhash_3words(v, + (__force u32)a->s6_addr32[2], + (__force u32)a->s6_addr32[3], + ipv6_hash_secret); +} + static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e225a4e5b572..9cbcb94a4c6d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + /* - * inet_ehash_secret must be set exactly once + * inet_ehash_secret must be set exactly once, and to a non nul value + * ipv6_hash_secret must be set exactly once. */ void build_ehash_secret(void) { @@ -259,7 +263,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - cmpxchg(&inet_ehash_secret, 0, rnd); + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); } EXPORT_SYMBOL(build_ehash_secret); -- cgit v1.2.3