summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig24
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c34
-rw-r--r--net/ipv4/arp.c166
-rw-r--r--net/ipv4/cipso_ipv4.c42
-rw-r--r--net/ipv4/datagram.c16
-rw-r--r--net/ipv4/devinet.c107
-rw-r--r--net/ipv4/esp4.c200
-rw-r--r--net/ipv4/fib_frontend.c163
-rw-r--r--net/ipv4/fib_lookup.h5
-rw-r--r--net/ipv4/fib_rules.c44
-rw-r--r--net/ipv4/fib_semantics.c181
-rw-r--r--net/ipv4/fib_trie.c1768
-rw-r--r--net/ipv4/fou.c233
-rw-r--r--net/ipv4/geneve_core.c (renamed from net/ipv4/geneve.c)24
-rw-r--r--net/ipv4/gre_offload.c4
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/igmp.c234
-rw-r--r--net/ipv4/inet_connection_sock.c248
-rw-r--r--net/ipv4/inet_diag.c520
-rw-r--r--net/ipv4/inet_fragment.c44
-rw-r--r--net/ipv4/inet_hashtables.c128
-rw-r--r--net/ipv4/inet_timewait_sock.c277
-rw-r--r--net/ipv4/ip_forward.c29
-rw-r--r--net/ipv4/ip_fragment.c76
-rw-r--r--net/ipv4/ip_gre.c14
-rw-r--r--net/ipv4/ip_input.c17
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_output.c161
-rw-r--r--net/ipv4/ip_sockglue.c52
-rw-r--r--net/ipv4/ip_tunnel.c29
-rw-r--r--net/ipv4/ip_tunnel_core.c23
-rw-r--r--net/ipv4/ip_vti.c26
-rw-r--r--net/ipv4/ipcomp.c2
-rw-r--r--net/ipv4/ipconfig.c6
-rw-r--r--net/ipv4/ipip.c15
-rw-r--r--net/ipv4/ipmr.c81
-rw-r--r--net/ipv4/netfilter.c13
-rw-r--r--net/ipv4/netfilter/Kconfig41
-rw-r--r--net/ipv4/netfilter/arp_tables.c128
-rw-r--r--net/ipv4/netfilter/arptable_filter.c7
-rw-r--r--net/ipv4/netfilter/ip_tables.c118
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c22
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c17
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c10
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c2
-rw-r--r--net/ipv4/netfilter/iptable_filter.c8
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c19
-rw-r--r--net/ipv4/netfilter/iptable_nat.c29
-rw-r--r--net/ipv4/netfilter/iptable_raw.c7
-rw-r--r--net/ipv4/netfilter/iptable_security.c8
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c28
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c4
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c4
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c33
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c29
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c6
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c12
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c29
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c9
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c9
-rw-r--r--net/ipv4/ping.c18
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/raw.c20
-rw-r--r--net/ipv4/route.c84
-rw-r--r--net/ipv4/syncookies.c28
-rw-r--r--net/ipv4/sysctl_net_ipv4.c45
-rw-r--r--net/ipv4/tcp.c213
-rw-r--r--net/ipv4/tcp_cdg.c433
-rw-r--r--net/ipv4/tcp_cong.c7
-rw-r--r--net/ipv4/tcp_dctcp.c47
-rw-r--r--net/ipv4/tcp_diag.c12
-rw-r--r--net/ipv4/tcp_fastopen.c27
-rw-r--r--net/ipv4/tcp_illinois.c23
-rw-r--r--net/ipv4/tcp_input.c417
-rw-r--r--net/ipv4/tcp_ipv4.c239
-rw-r--r--net/ipv4/tcp_metrics.c208
-rw-r--r--net/ipv4/tcp_minisocks.c77
-rw-r--r--net/ipv4/tcp_offload.c8
-rw-r--r--net/ipv4/tcp_output.c335
-rw-r--r--net/ipv4/tcp_timer.c25
-rw-r--r--net/ipv4/tcp_vegas.c18
-rw-r--r--net/ipv4/tcp_vegas.h3
-rw-r--r--net/ipv4/tcp_westwood.c17
-rw-r--r--net/ipv4/udp.c68
-rw-r--r--net/ipv4/udp_diag.c26
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/udp_offload.c4
-rw-r--r--net/ipv4/udp_tunnel.c12
-rw-r--r--net/ipv4/xfrm4_input.c7
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c2
-rw-r--r--net/ipv4/xfrm4_output.c12
-rw-r--r--net/ipv4/xfrm4_policy.c3
97 files changed, 4777 insertions, 3290 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index bd2901604842..6fb3c90ad726 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -331,8 +331,8 @@ config NET_FOU_IP_TUNNELS
When this option is enabled IP tunnels can be configured to use
FOU or GUE encapsulation.
-config GENEVE
- tristate "Generic Network Virtualization Encapsulation (Geneve)"
+config GENEVE_CORE
+ tristate "Generic Network Virtualization Encapsulation library"
depends on INET
select NET_UDP_TUNNEL
---help---
@@ -615,6 +615,22 @@ config TCP_CONG_DCTCP
For further details see:
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+config TCP_CONG_CDG
+ tristate "CAIA Delay-Gradient (CDG)"
+ default n
+ ---help---
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
+
+ o Use the delay gradient as a congestion signal.
+ o Back off with an average probability that is independent of the RTT.
+ o Coexist with flows that use loss-based congestion control.
+ o Tolerate packet loss unrelated to congestion.
+
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -646,6 +662,9 @@ choice
config DEFAULT_DCTCP
bool "DCTCP" if TCP_CONG_DCTCP=y
+ config DEFAULT_CDG
+ bool "CDG" if TCP_CONG_CDG=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
@@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
+ default "cdg" if DEFAULT_CDG
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04ed666e..efc43f300b8c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -56,7 +57,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
-obj-$(CONFIG_GENEVE) += geneve.o
+obj-$(CONFIG_GENEVE_CORE) += geneve_core.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d2e49baaff63..9532ee87151f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -217,7 +217,7 @@ int inet_listen(struct socket *sock, int backlog)
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
- inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+ !inet_csk(sk)->icsk_accept_queue.fastopenq) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
err = fastopen_init_queue(sk, backlog);
else if ((sysctl_tcp_fastopen &
@@ -228,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog)
err = 0;
if (err)
goto out;
+
+ tcp_fastopen_init_key_once(true);
}
err = inet_csk_listen_start(sk, backlog);
if (err)
@@ -314,11 +316,11 @@ lookup_protocol:
answer_flags = answer->flags;
rcu_read_unlock();
- WARN_ON(answer_prot->slab == NULL);
+ WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
- sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
- if (sk == NULL)
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
+ if (!sk)
goto out;
err = 0;
@@ -488,7 +490,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
+ if ((snum || !inet->bind_address_no_port) &&
+ sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
@@ -716,8 +719,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
}
EXPORT_SYMBOL(inet_getname);
-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size)
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
@@ -728,7 +730,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+ return sk->sk_prot->sendmsg(sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
@@ -750,8 +752,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
}
EXPORT_SYMBOL(inet_sendpage);
-int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
@@ -759,7 +761,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
sock_rps_record_flow(sk);
- err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
@@ -1270,7 +1272,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (udpfrag) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
- if (skb->next != NULL)
+ if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
} else {
@@ -1431,7 +1433,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
struct net *net)
{
struct socket *sock;
- int rc = sock_create_kern(family, type, protocol, &sock);
+ int rc = sock_create_kern(net, family, type, protocol, &sock);
if (rc == 0) {
*sk = sock->sk;
@@ -1441,8 +1443,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
* we do not wish this socket to see incoming packets.
*/
(*sk)->sk_prot->unhash(*sk);
-
- sk_change_net(*sk, net);
}
return rc;
}
@@ -1598,7 +1598,7 @@ static __net_init int inet_init_net(struct net *net)
*/
seqlock_init(&net->ipv4.ip_local_ports.lock);
net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 61000;
+ net->ipv4.ip_local_ports.range[1] = 60999;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
@@ -1675,7 +1675,7 @@ static int __init inet_init(void)
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
+ sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
rc = proto_register(&tcp_prot, 1);
if (rc)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 205e1472aa78..6c8b1fbafce8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -122,6 +122,7 @@
* Interface to generic neighbour cache.
*/
static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -149,18 +150,12 @@ static const struct neigh_ops arp_direct_ops = {
.connected_output = neigh_direct_output,
};
-static const struct neigh_ops arp_broken_ops = {
- .family = AF_INET,
- .solicit = arp_solicit,
- .error_report = arp_error_report,
- .output = neigh_compat_output,
- .connected_output = neigh_compat_output,
-};
-
struct neigh_table arp_tbl = {
.family = AF_INET,
.key_len = 4,
+ .protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
+ .key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
@@ -216,7 +211,12 @@ static u32 arp_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd)
{
- return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq32(neigh, pkey);
}
static int arp_constructor(struct neighbour *neigh)
@@ -228,7 +228,7 @@ static int arp_constructor(struct neighbour *neigh)
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev == NULL) {
+ if (!in_dev) {
rcu_read_unlock();
return -EINVAL;
}
@@ -260,35 +260,6 @@ static int arp_constructor(struct neighbour *neigh)
in old paradigm.
*/
-#if 1
- /* So... these "amateur" devices are hopeless.
- The only thing, that I can say now:
- It is very sad that we need to keep ugly obsolete
- code to make them happy.
-
- They should be moved to more reasonable state, now
- they use rebuild_header INSTEAD OF hard_start_xmit!!!
- Besides that, they are sort of out of date
- (a lot of redundant clones/copies, useless in 2.1),
- I wonder why people believe that they work.
- */
- switch (dev->type) {
- default:
- break;
- case ARPHRD_ROSE:
-#if IS_ENABLED(CONFIG_AX25)
- case ARPHRD_AX25:
-#if IS_ENABLED(CONFIG_NETROM)
- case ARPHRD_NETROM:
-#endif
- neigh->ops = &arp_broken_ops;
- neigh->output = neigh->ops->output;
- return 0;
-#else
- break;
-#endif
- }
-#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
@@ -433,71 +404,6 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
return flag;
}
-/* OBSOLETE FUNCTIONS */
-
-/*
- * Find an arp mapping in the cache. If not found, post a request.
- *
- * It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
- * even if it exists. It is supposed that skb->dev was mangled
- * by a virtual device (eql, shaper). Nobody but broken devices
- * is allowed to use this function, it is scheduled to be removed. --ANK
- */
-
-static int arp_set_predefined(int addr_hint, unsigned char *haddr,
- __be32 paddr, struct net_device *dev)
-{
- switch (addr_hint) {
- case RTN_LOCAL:
- pr_debug("arp called for own IP address\n");
- memcpy(haddr, dev->dev_addr, dev->addr_len);
- return 1;
- case RTN_MULTICAST:
- arp_mc_map(paddr, haddr, dev, 1);
- return 1;
- case RTN_BROADCAST:
- memcpy(haddr, dev->broadcast, dev->addr_len);
- return 1;
- }
- return 0;
-}
-
-
-int arp_find(unsigned char *haddr, struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- __be32 paddr;
- struct neighbour *n;
-
- if (!skb_dst(skb)) {
- pr_debug("arp_find is called with dst==NULL\n");
- kfree_skb(skb);
- return 1;
- }
-
- paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
- if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
- paddr, dev))
- return 0;
-
- n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
-
- if (n) {
- n->used = jiffies;
- if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
- neigh_ha_snapshot(haddr, n, dev);
- neigh_release(n);
- return 0;
- }
- neigh_release(n);
- } else
- kfree_skb(skb);
- return 1;
-}
-EXPORT_SYMBOL(arp_find);
-
-/* END OF OBSOLETE FUNCTIONS */
-
/*
* Check if we can use proxy ARP for this path
*/
@@ -569,7 +475,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
*/
/*
- * Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ * Create an arp packet. If dest_hw is not set, we create a broadcast
* message.
*/
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
@@ -589,7 +495,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
*/
skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
return NULL;
skb_reserve(skb, hlen);
@@ -597,9 +503,9 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
skb->protocol = htons(ETH_P_ARP);
- if (src_hw == NULL)
+ if (!src_hw)
src_hw = dev->dev_addr;
- if (dest_hw == NULL)
+ if (!dest_hw)
dest_hw = dev->broadcast;
/*
@@ -663,7 +569,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
break;
#endif
default:
- if (target_hw != NULL)
+ if (target_hw)
memcpy(arp_ptr, target_hw, dev->addr_len);
else
memset(arp_ptr, 0, dev->addr_len);
@@ -685,7 +591,8 @@ EXPORT_SYMBOL(arp_create);
void arp_xmit(struct sk_buff *skb)
{
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
+ NULL, skb->dev, dev_queue_xmit_sk);
}
EXPORT_SYMBOL(arp_xmit);
@@ -708,7 +615,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
dest_hw, src_hw, target_hw);
- if (skb == NULL)
+ if (!skb)
return;
arp_xmit(skb);
@@ -719,7 +626,7 @@ EXPORT_SYMBOL(arp_send);
* Process an arp request.
*/
-static int arp_process(struct sk_buff *skb)
+static int arp_process(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -738,7 +645,7 @@ static int arp_process(struct sk_buff *skb)
* is ARP'able.
*/
- if (in_dev == NULL)
+ if (!in_dev)
goto out;
arp = arp_hdr(skb);
@@ -902,7 +809,7 @@ static int arp_process(struct sk_buff *skb)
is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
inet_addr_type(net, sip) == RTN_UNICAST;
- if (n == NULL &&
+ if (!n &&
((arp->ar_op == htons(ARPOP_REPLY) &&
inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
@@ -940,7 +847,7 @@ out:
static void parp_redo(struct sk_buff *skb)
{
- arp_process(skb);
+ arp_process(NULL, skb);
}
@@ -973,7 +880,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
+ dev, NULL, arp_process);
consumeskb:
consume_skb(skb);
@@ -994,7 +902,7 @@ out_of_mem:
static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
{
- if (dev == NULL) {
+ if (!dev) {
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
@@ -1020,7 +928,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
return -ENODEV;
}
if (mask) {
- if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL)
+ if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
return -ENOBUFS;
return 0;
}
@@ -1041,7 +949,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
- if (dev == NULL) {
+ if (!dev) {
struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
if (IS_ERR(rt))
@@ -1109,14 +1017,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ if (!(neigh->nud_state & NUD_NOARP)) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ err = 0;
+ }
neigh_release(neigh);
- err = 0;
}
return err;
}
@@ -1161,7 +1071,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
return arp_req_delete_public(net, r, dev);
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (dev == NULL) {
+ if (!dev) {
struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -1210,7 +1120,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (r.arp_dev[0]) {
err = -ENODEV;
dev = __dev_get_by_name(net, r.arp_dev);
- if (dev == NULL)
+ if (!dev)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index e361ea6f3fc8..bdb2a07ec363 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -255,7 +255,7 @@ static int __init cipso_v4_cache_init(void)
cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
sizeof(struct cipso_v4_map_cache_bkt),
GFP_KERNEL);
- if (cipso_v4_cache == NULL)
+ if (!cipso_v4_cache)
return -ENOMEM;
for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
@@ -339,7 +339,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
secattr->type = NETLBL_NLTYPE_CIPSOV4;
- if (prev_entry == NULL) {
+ if (!prev_entry) {
spin_unlock_bh(&cipso_v4_cache[bkt].lock);
return 0;
}
@@ -393,10 +393,10 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
cipso_ptr_len = cipso_ptr[1];
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL)
+ if (!entry)
return -ENOMEM;
entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
- if (entry->key == NULL) {
+ if (!entry->key) {
ret_val = -ENOMEM;
goto cache_add_failure;
}
@@ -502,7 +502,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
atomic_set(&doi_def->refcount, 1);
spin_lock(&cipso_v4_doi_list_lock);
- if (cipso_v4_doi_search(doi_def->doi) != NULL) {
+ if (cipso_v4_doi_search(doi_def->doi)) {
spin_unlock(&cipso_v4_doi_list_lock);
ret_val = -EEXIST;
goto doi_add_return;
@@ -513,7 +513,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
doi_add_return:
audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
- if (audit_buf != NULL) {
+ if (audit_buf) {
const char *type_str;
switch (doi_type) {
case CIPSO_V4_MAP_TRANS:
@@ -547,7 +547,7 @@ doi_add_return:
*/
void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
{
- if (doi_def == NULL)
+ if (!doi_def)
return;
switch (doi_def->type) {
@@ -598,7 +598,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
spin_lock(&cipso_v4_doi_list_lock);
doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL) {
+ if (!doi_def) {
spin_unlock(&cipso_v4_doi_list_lock);
ret_val = -ENOENT;
goto doi_remove_return;
@@ -617,7 +617,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
doi_remove_return:
audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
- if (audit_buf != NULL) {
+ if (audit_buf) {
audit_log_format(audit_buf,
" cipso_doi=%u res=%u",
doi, ret_val == 0 ? 1 : 0);
@@ -644,7 +644,7 @@ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
rcu_read_lock();
doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL)
+ if (!doi_def)
goto doi_getdef_return;
if (!atomic_inc_not_zero(&doi_def->refcount))
doi_def = NULL;
@@ -664,7 +664,7 @@ doi_getdef_return:
*/
void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
{
- if (doi_def == NULL)
+ if (!doi_def)
return;
if (!atomic_dec_and_test(&doi_def->refcount))
@@ -1642,7 +1642,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
rcu_read_lock();
doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
- if (doi_def == NULL) {
+ if (!doi_def) {
err_offset = 2;
goto validate_return_locked;
}
@@ -1736,7 +1736,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
* not the loopback device drop the packet. Further,
* there is no legitimate reason for setting this from
* userspace so reject it if skb is NULL. */
- if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
+ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
err_offset = opt_iter;
goto validate_return_locked;
}
@@ -1897,7 +1897,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
* defined yet but it is not a problem as the only users of these
* "lite" PF_INET sockets are functions which do an accept() call
* afterwards so we will label the socket as part of the accept(). */
- if (sk == NULL)
+ if (!sk)
return 0;
/* We allocate the maximum CIPSO option size here so we are probably
@@ -1905,7 +1905,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
* on and after all we are only talking about 40 bytes. */
buf_len = CIPSO_V4_OPT_LEN_MAX;
buf = kmalloc(buf_len, GFP_ATOMIC);
- if (buf == NULL) {
+ if (!buf) {
ret_val = -ENOMEM;
goto socket_setattr_failure;
}
@@ -1921,7 +1921,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
* set the IPOPT_CIPSO option. */
opt_len = (buf_len + 3) & ~3;
opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
- if (opt == NULL) {
+ if (!opt) {
ret_val = -ENOMEM;
goto socket_setattr_failure;
}
@@ -1981,7 +1981,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
* on and after all we are only talking about 40 bytes. */
buf_len = CIPSO_V4_OPT_LEN_MAX;
buf = kmalloc(buf_len, GFP_ATOMIC);
- if (buf == NULL) {
+ if (!buf) {
ret_val = -ENOMEM;
goto req_setattr_failure;
}
@@ -1997,7 +1997,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
* set the IPOPT_CIPSO option. */
opt_len = (buf_len + 3) & ~3;
opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
- if (opt == NULL) {
+ if (!opt) {
ret_val = -ENOMEM;
goto req_setattr_failure;
}
@@ -2102,7 +2102,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
sk_inet = inet_sk(sk);
opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
- if (opt == NULL || opt->opt.cipso == 0)
+ if (!opt || opt->opt.cipso == 0)
return;
hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
@@ -2128,7 +2128,7 @@ void cipso_v4_req_delattr(struct request_sock *req)
req_inet = inet_rsk(req);
opt = req_inet->opt;
- if (opt == NULL || opt->opt.cipso == 0)
+ if (!opt || opt->opt.cipso == 0)
return;
cipso_v4_delopt(&req_inet->opt);
@@ -2157,7 +2157,7 @@ int cipso_v4_getattr(const unsigned char *cipso,
doi = get_unaligned_be32(&cipso[2]);
rcu_read_lock();
doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL)
+ if (!doi_def)
goto getattr_return;
/* XXX - This code assumes only one tag per CIPSO option which isn't
* really a good assumption to make but since we only support the MAC
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 90c0e8386116..574fad9cca05 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -20,7 +20,7 @@
#include <net/route.h>
#include <net/tcp_states.h>
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_reset(sk);
- lock_sock(sk);
-
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -82,9 +80,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_set(sk, &rt->dst);
err = 0;
out:
- release_sock(sk);
return err;
}
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ int res;
+
+ lock_sock(sk);
+ res = __ip4_datagram_connect(sk, uaddr, addr_len);
+ release_sock(sk);
+ return res;
+}
EXPORT_SYMBOL(ip4_datagram_connect);
/* Because UDP xmit path can manipulate sk_dst_cache without holding
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3a8985c94581..2d9cb1748f81 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-static u32 inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
{
u32 val = (__force u32) addr ^ net_hash_mix(net);
@@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
+static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+{
+ struct ip_mreqn mreq = {
+ .imr_multiaddr.s_addr = ifa->ifa_address,
+ .imr_ifindex = ifa->ifa_dev->dev->ifindex,
+ };
+ int ret;
+
+ ASSERT_RTNL();
+
+ lock_sock(sk);
+ if (join)
+ ret = ip_mc_join_group(sk, &mreq);
+ else
+ ret = ip_mc_leave_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -565,7 +585,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
ifm = nlmsg_data(nlh);
in_dev = inetdev_by_index(net, ifm->ifa_index);
- if (in_dev == NULL) {
+ if (!in_dev) {
err = -ENODEV;
goto errout;
}
@@ -573,7 +593,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
- ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL]))
+ ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
continue;
if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
@@ -581,9 +601,11 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
if (tb[IFA_ADDRESS] &&
(ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
- !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
+ !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
continue;
+ if (ipv4_is_multicast(ifa->ifa_address))
+ ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
return 0;
}
@@ -733,21 +755,21 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
err = -EINVAL;
- if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL)
+ if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
goto errout;
dev = __dev_get_by_index(net, ifm->ifa_index);
err = -ENODEV;
- if (dev == NULL)
+ if (!dev)
goto errout;
in_dev = __in_dev_get_rtnl(dev);
err = -ENOBUFS;
- if (in_dev == NULL)
+ if (!in_dev)
goto errout;
ifa = inet_alloc_ifa();
- if (ifa == NULL)
+ if (!ifa)
/*
* A potential indev allocation can be left alive, it stays
* assigned to its device and is destroy with it.
@@ -758,7 +780,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
neigh_parms_data_state_setall(in_dev->arp_parms);
in_dev_hold(in_dev);
- if (tb[IFA_ADDRESS] == NULL)
+ if (!tb[IFA_ADDRESS])
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
INIT_HLIST_NODE(&ifa->hash);
@@ -769,11 +791,11 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ifa->ifa_scope = ifm->ifa_scope;
ifa->ifa_dev = in_dev;
- ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]);
- ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]);
+ ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+ ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
if (tb[IFA_BROADCAST])
- ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
+ ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
if (tb[IFA_LABEL])
nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
@@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
* userspace already relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
+ true, ifa);
+
+ if (ret < 0) {
+ inet_free_ifa(ifa);
+ return ret;
+ }
+ }
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
} else {
inet_free_ifa(ifa);
@@ -851,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
queue_delayed_work(system_power_efficient_wq,
&check_lifetime_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
- blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
}
return 0;
}
@@ -1259,7 +1289,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
__be32 addr = 0;
struct net_device *dev;
- if (in_dev != NULL)
+ if (in_dev)
return confirm_addr_indev(in_dev, dst, local, scope);
rcu_read_lock();
@@ -1309,7 +1339,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
if (named++ == 0)
goto skip;
dot = strchr(old, ':');
- if (dot == NULL) {
+ if (!dot) {
sprintf(old, ":%d", named);
dot = old;
}
@@ -1478,7 +1508,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
u32 preferred, valid;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
ifm = nlmsg_data(nlh);
@@ -1510,11 +1540,11 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
valid = INFINITY_LIFE_TIME;
}
if ((ifa->ifa_address &&
- nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
(ifa->ifa_local &&
- nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
(ifa->ifa_broadcast &&
- nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
@@ -1597,7 +1627,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
net = dev_net(ifa->ifa_dev->dev);
skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
@@ -1634,7 +1664,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
return -ENODATA;
nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
- if (nla == NULL)
+ if (!nla)
return -EMSGSIZE;
for (i = 0; i < IPV4_DEVCONF_MAX; i++)
@@ -1709,6 +1739,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (type == -1 || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
return size;
}
@@ -1723,7 +1755,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
ncm = nlmsg_data(nlh);
@@ -1749,6 +1781,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -1765,7 +1801,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
int err = -ENOBUFS;
skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
@@ -1788,6 +1824,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_FORWARDING] = { .len = sizeof(int) },
[NETCONFA_RP_FILTER] = { .len = sizeof(int) },
[NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
@@ -1822,10 +1859,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
break;
default:
dev = __dev_get_by_index(net, ifindex);
- if (dev == NULL)
+ if (!dev)
goto errout;
in_dev = __in_dev_get_rtnl(dev);
- if (in_dev == NULL)
+ if (!in_dev)
goto errout;
devconf = &in_dev->cnf;
break;
@@ -1833,7 +1870,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
err = -ENOBUFS;
skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = inet_netconf_fill_devconf(skb, ifindex, devconf,
@@ -2017,6 +2054,12 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
ifindex, cnf);
}
+ if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ ifindex, cnf);
+ }
}
return ret;
@@ -2138,6 +2181,8 @@ static struct devinet_sysctl_table {
"igmpv2_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
"igmpv3_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+ "ignore_routes_with_linkdown"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -2184,7 +2229,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
{
struct devinet_sysctl_table *t = cnf->sysctl;
- if (t == NULL)
+ if (!t)
return;
cnf->sysctl = NULL;
@@ -2245,16 +2290,16 @@ static __net_init int devinet_init_net(struct net *net)
if (!net_eq(net, &init_net)) {
all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
- if (all == NULL)
+ if (!all)
goto err_alloc_all;
dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
- if (dflt == NULL)
+ if (!dflt)
goto err_alloc_dflt;
#ifdef CONFIG_SYSCTL
tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
- if (tbl == NULL)
+ if (!tbl)
goto err_alloc_ctl;
tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
@@ -2274,7 +2319,7 @@ static __net_init int devinet_init_net(struct net *net)
err = -ENOMEM;
forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
- if (forw_hdr == NULL)
+ if (!forw_hdr)
goto err_reg_ctl;
net->ipv4.forw_hdr = forw_hdr;
#endif
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 60173d4d3a0e..477937465a20 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -49,7 +49,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
len = ALIGN(len, crypto_tfm_ctx_alignment());
}
- len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
len = ALIGN(len, __alignof__(struct scatterlist));
len += sizeof(struct scatterlist) * nfrags;
@@ -68,17 +68,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
-static inline struct aead_givcrypt_request *esp_tmp_givreq(
- struct crypto_aead *aead, u8 *iv)
-{
- struct aead_givcrypt_request *req;
-
- req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
- crypto_tfm_ctx_alignment());
- aead_givcrypt_set_tfm(req, aead);
- return req;
-}
-
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
struct aead_request *req;
@@ -97,14 +86,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static inline struct scatterlist *esp_givreq_sg(
- struct crypto_aead *aead, struct aead_givcrypt_request *req)
-{
- return (void *)ALIGN((unsigned long)(req + 1) +
- crypto_aead_reqsize(aead),
- __alignof__(struct scatterlist));
-}
-
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -113,14 +94,37 @@ static void esp_output_done(struct crypto_async_request *base, int err)
xfrm_output_resume(skb, err);
}
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_seqhi(tmp);
+
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+}
+
+static void esp_output_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(base, err);
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
- struct aead_givcrypt_request *req;
+ struct aead_request *req;
struct scatterlist *sg;
- struct scatterlist *asg;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -129,17 +133,19 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int clen;
int alen;
int plen;
+ int ivlen;
int tfclen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
+ __be64 seqno;
/* skb is pure payload to encrypt */
aead = x->data;
alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
tfclen = 0;
if (x->tfcpad) {
@@ -160,16 +166,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp) {
err = -ENOMEM;
goto error;
@@ -177,9 +181,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
- req = esp_tmp_givreq(aead, iv);
- asg = esp_givreq_sg(aead, req);
- sg = asg + sglists;
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -235,36 +238,53 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+ *seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ }
+
+ esph->spi = x->id.spi;
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
- esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
- clen + alen);
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
- if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
+ aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
- aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
- aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, assoclen);
- aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low);
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
ESP_SKB_CB(skb)->tmp = tmp;
- err = crypto_aead_givencrypt(req);
- if (err == -EINPROGRESS)
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
goto error;
- if (err == -EBUSY)
+ case -EBUSY:
err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
+ }
kfree(tmp);
@@ -363,6 +383,20 @@ static void esp_input_done(struct crypto_async_request *base, int err)
xfrm_input_resume(skb, esp_input_done2(skb, err));
}
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_input_restore_header(skb);
+ esp_input_done(base, err);
+}
+
/*
* Note: detecting truncated vs. non-truncated authentication data is very
* expensive, so we only support truncated data, which is the recommended
@@ -374,19 +408,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
- int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(*esph) - ivlen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
void *tmp;
u8 *iv;
struct scatterlist *sg;
- struct scatterlist *asg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
goto out;
if (elen <= 0)
@@ -399,17 +432,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
err = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
goto out;
@@ -417,36 +448,39 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
- asg = esp_req_sg(aead, req);
- sg = asg + sglists;
+ sg = esp_req_sg(aead, req);
skb->ip_summed = CHECKSUM_NONE;
esph = (struct ip_esp_hdr *)skb->data;
- /* Get ivec. This can be wrong, check against another impls. */
- iv = esph->enc_data;
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ }
- aead_request_set_callback(req, 0, esp_input_done, skb);
- aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, assoclen);
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
err = crypto_aead_decrypt(req);
if (err == -EINPROGRESS)
goto out;
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
+
err = esp_input_done2(skb, err);
out:
@@ -518,10 +552,16 @@ static void esp_destroy(struct xfrm_state *x)
static int esp_init_aead(struct xfrm_state *x)
{
+ char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = -ENAMETOOLONG;
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -553,22 +593,26 @@ static int esp_init_authenc(struct xfrm_state *x)
int err;
err = -EINVAL;
- if (x->ealg == NULL)
+ if (!x->ealg)
goto error;
err = -ENAMETOOLONG;
if ((x->props.flags & XFRM_STATE_ESN)) {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authencesn(%s,%s)",
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authenc(%s,%s)",
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 23b9b3e86f4c..6bbc54940eb4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -52,12 +52,12 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_trie_table(RT_TABLE_LOCAL);
- if (local_table == NULL)
+ main_table = fib_trie_table(RT_TABLE_MAIN, NULL);
+ if (!main_table)
return -ENOMEM;
- main_table = fib_trie_table(RT_TABLE_MAIN);
- if (main_table == NULL)
+ local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+ if (!local_table)
goto fail;
hlist_add_head_rcu(&local_table->tb_hlist,
@@ -67,14 +67,14 @@ static int __net_init fib4_rules_init(struct net *net)
return 0;
fail:
- fib_free_table(local_table);
+ fib_free_table(main_table);
return -ENOMEM;
}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
{
- struct fib_table *tb;
+ struct fib_table *tb, *alias = NULL;
unsigned int h;
if (id == 0)
@@ -83,23 +83,23 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_trie_table(id);
+ if (id == RT_TABLE_LOCAL)
+ alias = fib_new_table(net, RT_TABLE_MAIN);
+
+ tb = fib_trie_table(id, alias);
if (!tb)
return NULL;
switch (id) {
case RT_TABLE_LOCAL:
- net->ipv4.fib_local = tb;
+ rcu_assign_pointer(net->ipv4.fib_local, tb);
break;
-
case RT_TABLE_MAIN:
- net->ipv4.fib_main = tb;
+ rcu_assign_pointer(net->ipv4.fib_main, tb);
break;
-
case RT_TABLE_DEFAULT:
- net->ipv4.fib_default = tb;
+ rcu_assign_pointer(net->ipv4.fib_default, tb);
break;
-
default:
break;
}
@@ -129,16 +129,62 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
+static void fib_replace_table(struct net *net, struct fib_table *old,
+ struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (new->tb_id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, new);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, new);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, new);
+ break;
+ default:
+ break;
+ }
+
+#endif
+ /* replace the old table in the hlist */
+ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+int fib_unmerge(struct net *net)
+{
+ struct fib_table *old, *new;
+
+ /* attempt to fetch local table if it has been allocated */
+ old = fib_get_table(net, RT_TABLE_LOCAL);
+ if (!old)
+ return 0;
+
+ new = fib_trie_unmerge(old);
+ if (!new)
+ return -ENOMEM;
+
+ /* replace merged table with clean table */
+ if (new != old) {
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+ }
+
+ return 0;
+}
+
static void fib_flush(struct net *net)
{
int flushed = 0;
- struct fib_table *tb;
- struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist)
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
flushed += fib_table_flush(tb);
}
@@ -146,6 +192,19 @@ static void fib_flush(struct net *net)
rt_cache_flush(net);
}
+void fib_flush_external(struct net *net)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist)
+ fib_table_flush_external(tb);
+ }
+}
+
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -221,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
- if (!fib_lookup(net, &fl4, &res))
+ if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
scope = RT_SCOPE_LINK;
@@ -260,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
net = dev_net(dev);
- if (fib_lookup(net, &fl4, &res))
+ if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
@@ -295,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(net, &fl4, &res) == 0) {
+ if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
@@ -427,7 +486,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
if (strcmp(ifa->ifa_label, devname) == 0)
break;
- if (ifa == NULL)
+ if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
}
@@ -455,7 +514,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
int len = 0;
mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
- if (mx == NULL)
+ if (!mx)
return -ENOMEM;
if (rt->rt_flags & RTF_MTU)
@@ -617,7 +676,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
tb = fib_get_table(net, cfg.fc_table);
- if (tb == NULL) {
+ if (!tb) {
err = -ESRCH;
goto errout;
}
@@ -639,7 +698,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
tb = fib_new_table(net, cfg.fc_table);
- if (tb == NULL) {
+ if (!tb) {
err = -ENOBUFS;
goto errout;
}
@@ -665,10 +724,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0];
s_e = cb->args[1];
+ rcu_read_lock();
+
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -682,6 +743,8 @@ next:
}
}
out:
+ rcu_read_unlock();
+
cb->args[1] = e;
cb->args[0] = h;
@@ -716,7 +779,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
else
tb = fib_new_table(net, RT_TABLE_LOCAL);
- if (tb == NULL)
+ if (!tb)
return;
cfg.fc_table = tb->tb_id;
@@ -743,7 +806,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
- if (prim == NULL) {
+ if (!prim) {
pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
@@ -797,7 +860,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
- if (prim == NULL) {
+ if (!prim) {
pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
@@ -967,7 +1030,7 @@ static void nl_fib_input(struct sk_buff *skb)
return;
skb = netlink_skb_clone(skb, GFP_KERNEL);
- if (skb == NULL)
+ if (!skb)
return;
nlh = nlmsg_hdr(skb);
@@ -988,7 +1051,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
};
sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
- if (sk == NULL)
+ if (!sk)
return -EAFNOSUPPORT;
net->ipv4.fibnl = sk;
return 0;
@@ -1000,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, unsigned long event)
{
- if (fib_sync_down_dev(dev, force))
+ if (fib_sync_down_dev(dev, event))
fib_flush(dev_net(dev));
rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
@@ -1018,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(dev_net(dev));
@@ -1026,11 +1089,11 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
case NETDEV_DOWN:
fib_del_ifaddr(ifa, NULL);
atomic_inc(&net->ipv4.dev_addr_genid);
- if (ifa->ifa_dev->ifa_list == NULL) {
+ if (!ifa->ifa_dev->ifa_list) {
/* Last address was deleted from this interface.
* Disable IP.
*/
- fib_disable_ip(dev, 1);
+ fib_disable_ip(dev, event);
} else {
rt_cache_flush(dev_net(dev));
}
@@ -1044,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ unsigned int flags;
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, event);
rt_flush_dev(dev);
return NOTIFY_DONE;
}
@@ -1061,16 +1125,22 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, event);
break;
- case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
+ flags = dev_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ fib_sync_up(dev, RTNH_F_LINKDOWN);
+ else
+ fib_sync_down_dev(dev, event);
+ /* fall through */
+ case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
}
@@ -1094,7 +1164,7 @@ static int __net_init ip_fib_net_init(struct net *net)
size = max_t(size_t, size, L1_CACHE_BYTES);
net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
- if (net->ipv4.fib_table_hash == NULL)
+ if (!net->ipv4.fib_table_hash)
return -ENOMEM;
err = fib4_rules_init(net);
@@ -1113,20 +1183,25 @@ static void ip_fib_net_exit(struct net *net)
rtnl_lock();
#ifdef CONFIG_IP_MULTIPLE_TABLES
- fib4_rules_exit(net);
+ RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
#endif
for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
- struct fib_table *tb;
- struct hlist_head *head;
+ struct hlist_head *head = &net->ipv4.fib_table_hash[i];
struct hlist_node *tmp;
+ struct fib_table *tb;
- head = &net->ipv4.fib_table_hash[i];
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
fib_table_flush(tb);
fib_free_table(tb);
}
}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
+#endif
rtnl_unlock();
kfree(net->ipv4.fib_table_hash);
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 825981b1049a..9c02920725db 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -6,11 +6,14 @@
#include <net/ip_fib.h>
struct fib_alias {
- struct list_head fa_list;
+ struct hlist_node fa_list;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
u8 fa_state;
+ u8 fa_slen;
+ u32 tb_id;
+ s16 fa_default;
struct rcu_head rcu;
};
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index d3db718be51d..18123d50f576 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,11 +47,12 @@ struct fib4_rule {
#endif
};
-int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+ struct fib_result *res, unsigned int flags)
{
struct fib_lookup_arg arg = {
.result = res,
- .flags = FIB_LOOKUP_NOREF,
+ .flags = flags,
};
int err;
@@ -153,7 +154,7 @@ static struct fib_table *fib_empty_table(struct net *net)
u32 id;
for (id = 1; id <= RT_TABLE_MAX; id++)
- if (fib_get_table(net, id) == NULL)
+ if (!fib_get_table(net, id))
return fib_new_table(net, id);
return NULL;
}
@@ -174,12 +175,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->tos & ~IPTOS_TOS_MASK)
goto errout;
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
if (rule->table == RT_TABLE_UNSPEC) {
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
table = fib_empty_table(net);
- if (table == NULL) {
+ if (!table) {
err = -ENOBUFS;
goto errout;
}
@@ -189,10 +195,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
}
if (frh->src_len)
- rule4->src = nla_get_be32(tb[FRA_SRC]);
+ rule4->src = nla_get_in_addr(tb[FRA_SRC]);
if (frh->dst_len)
- rule4->dst = nla_get_be32(tb[FRA_DST]);
+ rule4->dst = nla_get_in_addr(tb[FRA_DST]);
#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW]) {
@@ -209,21 +215,31 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+
err = 0;
errout:
return err;
}
-static void fib4_rule_delete(struct fib_rule *rule)
+static int fib4_rule_delete(struct fib_rule *rule)
{
struct net *net = rule->fr_net;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ int err;
- if (rule4->tclassid)
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (((struct fib4_rule *)rule)->tclassid)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+errout:
+ return err;
}
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
@@ -245,10 +261,10 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
return 0;
#endif
- if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC])))
+ if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
return 0;
- if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST])))
+ if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
return 0;
return 1;
@@ -264,9 +280,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->tos = rule4->tos;
if ((rule4->dst_len &&
- nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+ nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
(rule4->src_len &&
- nla_put_be32(skb, FRA_SRC, rule4->src)))
+ nla_put_in_addr(skb, FRA_SRC, rule4->src)))
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (rule4->tclassid &&
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1e2090ea663e..3a06586b170c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- release_net(fi->fib_net);
if (fi->fib_metrics != (u32 *) dst_default_metrics)
kfree(fi->fib_metrics);
kfree(fi);
@@ -267,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -319,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_type == fi->fib_type &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
- ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -391,7 +390,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
int err = -ENOBUFS;
skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = fib_dump_info(skb, info->portid, seq, event, tb_id,
@@ -469,7 +468,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
+ nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
@@ -504,7 +503,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (cfg->fc_mp == NULL)
+ if (!cfg->fc_mp)
return 0;
rtnh = cfg->fc_mp;
@@ -524,7 +523,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla && nla_get_be32(nla) != nh->nh_gw)
+ if (nla && nla_get_in_addr(nla) != nh->nh_gw)
return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
@@ -605,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
return -ENODEV;
if (!(dev->flags & IFF_UP))
return -ENETDOWN;
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
@@ -622,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl4.flowi4_scope < RT_SCOPE_LINK)
fl4.flowi4_scope = RT_SCOPE_LINK;
- err = fib_lookup(net, &fl4, &res);
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
if (err) {
rcu_read_unlock();
return err;
@@ -637,6 +639,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (!dev)
goto out;
dev_hold(dev);
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
@@ -647,7 +651,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
rcu_read_lock();
err = -ENODEV;
in_dev = inetdev_by_index(net, nh->nh_oif);
- if (in_dev == NULL)
+ if (!in_dev)
goto out;
err = -ENETDOWN;
if (!(in_dev->dev->flags & IFF_UP))
@@ -655,6 +659,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->nh_dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = 0;
}
out:
@@ -714,8 +720,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *dest;
unsigned int new_hash;
- hlist_del(&fi->fib_hash);
-
new_hash = fib_info_hashfn(fi);
dest = &new_info_hash[new_hash];
hlist_add_head(&fi->fib_hash, dest);
@@ -732,8 +736,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *ldest;
unsigned int new_hash;
- hlist_del(&fi->fib_lhash);
-
new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
ldest = &new_laddrhash[new_hash];
hlist_add_head(&fi->fib_lhash, ldest);
@@ -804,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
}
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
- if (fi == NULL)
+ if (!fi)
goto failure;
fib_info_cnt++;
if (cfg->fc_mx) {
@@ -814,7 +816,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else
fi->fib_metrics = (u32 *) dst_default_metrics;
- fi->fib_net = hold_net(net);
+ fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;
fi->fib_flags = cfg->fc_flags;
@@ -922,14 +924,20 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_scope = RT_SCOPE_NOWHERE;
nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
err = -ENODEV;
- if (nh->nh_dev == NULL)
+ if (!nh->nh_dev)
goto failure;
} else {
+ int linkdown = 0;
+
change_nexthops(fi) {
err = fib_check_nh(cfg, fi, nexthop_nh);
if (err != 0)
goto failure;
+ if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+ linkdown++;
} endfor_nexthops(fi)
+ if (linkdown == fi->fib_nhs)
+ fi->fib_flags |= RTNH_F_LINKDOWN;
}
if (fi->fib_prefsrc) {
@@ -996,7 +1004,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
struct rtmsg *rtm;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
rtm = nlmsg_data(nlh);
@@ -1016,7 +1024,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm->rtm_protocol = fi->fib_protocol;
if (rtm->rtm_dst_len &&
- nla_put_be32(skb, RTA_DST, dst))
+ nla_put_in_addr(skb, RTA_DST, dst))
goto nla_put_failure;
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1025,15 +1033,23 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
if (fi->fib_prefsrc &&
- nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
if (fi->fib_nhs == 1) {
+ struct in_device *in_dev;
+
if (fi->fib_nh->nh_gw &&
- nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
goto nla_put_failure;
if (fi->fib_nh->nh_oif &&
nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
goto nla_put_failure;
+ if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtm->rtm_flags |= RTNH_F_DEAD;
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
@@ -1046,20 +1062,28 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
struct nlattr *mp;
mp = nla_nest_start(skb, RTA_MULTIPATH);
- if (mp == NULL)
+ if (!mp)
goto nla_put_failure;
for_nexthops(fi) {
+ struct in_device *in_dev;
+
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
- if (rtnh == NULL)
+ if (!rtnh)
goto nla_put_failure;
rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+ if (nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtnh->rtnh_flags |= RTNH_F_DEAD;
+ }
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
if (nh->nh_gw &&
- nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+ nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid &&
@@ -1094,7 +1118,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
struct hlist_head *head = &fib_info_laddrhash[hash];
struct fib_info *fi;
- if (fib_info_laddrhash == NULL || local == 0)
+ if (!fib_info_laddrhash || local == 0)
return 0;
hlist_for_each_entry(fi, head, fib_lhash) {
@@ -1108,7 +1132,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
return ret;
}
-int fib_sync_down_dev(struct net_device *dev, int force)
+int fib_sync_down_dev(struct net_device *dev, unsigned long event)
{
int ret = 0;
int scope = RT_SCOPE_NOWHERE;
@@ -1117,7 +1141,8 @@ int fib_sync_down_dev(struct net_device *dev, int force)
struct hlist_head *head = &fib_info_devhash[hash];
struct fib_nh *nh;
- if (force)
+ if (event == NETDEV_UNREGISTER ||
+ event == NETDEV_DOWN)
scope = -1;
hlist_for_each_entry(nh, head, nh_hash) {
@@ -1134,7 +1159,15 @@ int fib_sync_down_dev(struct net_device *dev, int force)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
- nexthop_nh->nh_flags |= RTNH_F_DEAD;
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
spin_lock_bh(&fib_multipath_lock);
fi->fib_power -= nexthop_nh->nh_power;
@@ -1144,14 +1177,23 @@ int fib_sync_down_dev(struct net_device *dev, int force)
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nexthop_nh->nh_dev == dev) {
+ if (event == NETDEV_UNREGISTER &&
+ nexthop_nh->nh_dev == dev) {
dead = fi->fib_nhs;
break;
}
#endif
} endfor_nexthops(fi)
if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ fi->fib_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ fi->fib_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
ret++;
}
}
@@ -1160,68 +1202,85 @@ int fib_sync_down_dev(struct net_device *dev, int force)
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
- struct list_head *fa_head = res->fa_head;
+ struct hlist_head *fa_head = res->fa_head;
struct fib_table *tb = res->table;
+ u8 slen = 32 - res->prefixlen;
int order = -1, last_idx = -1;
- struct fib_alias *fa;
+ struct fib_alias *fa, *fa1 = NULL;
+ u32 last_prio = res->fi->fib_priority;
+ u8 last_tos = 0;
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ if (fa->fa_slen != slen)
+ continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
+ if (next_fi->fib_priority > last_prio &&
+ fa->fa_tos == last_tos) {
+ if (last_tos)
+ continue;
+ break;
+ }
+ if (next_fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ last_tos = fa->fa_tos;
+ last_prio = next_fi->fib_priority;
+
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
fib_alias_accessed(fa);
- if (fi == NULL) {
+ if (!fi) {
if (next_fi != res->fi)
break;
+ fa1 = fa;
} else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
+ &last_idx, fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
fi = next_fi;
order++;
}
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
+ if (order <= 0 || !fi) {
+ if (fa1)
+ fa1->fa_default = -1;
goto out;
}
if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
+ fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
if (last_idx >= 0)
fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
+ fa1->fa_default = last_idx;
out:
return;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
*/
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
struct fib_info *prev_fi;
unsigned int hash;
@@ -1248,25 +1307,29 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & nh_flags)) {
alive++;
continue;
}
- if (nexthop_nh->nh_dev == NULL ||
+ if (!nexthop_nh->nh_dev ||
!(nexthop_nh->nh_dev->flags & IFF_UP))
continue;
if (nexthop_nh->nh_dev != dev ||
!__in_dev_get_rtnl(dev))
continue;
alive++;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
spin_lock_bh(&fib_multipath_lock);
nexthop_nh->nh_power = 0;
- nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
+ nexthop_nh->nh_flags &= ~nh_flags;
spin_unlock_bh(&fib_multipath_lock);
+#else
+ nexthop_nh->nh_flags &= ~nh_flags;
+#endif
} endfor_nexthops(fi)
if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
+ fi->fib_flags &= ~nh_flags;
ret++;
}
}
@@ -1274,6 +1337,8 @@ int fib_sync_up(struct net_device *dev)
return ret;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
/*
* The algorithm is suboptimal, but it provides really
* fair weighted route distribution.
@@ -1281,16 +1346,22 @@ int fib_sync_up(struct net_device *dev)
void fib_select_multipath(struct fib_result *res)
{
struct fib_info *fi = res->fi;
+ struct in_device *in_dev;
int w;
spin_lock_bh(&fib_multipath_lock);
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- }
+ in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+ power += nexthop_nh->nh_weight;
+ nexthop_nh->nh_power = nexthop_nh->nh_weight;
} endfor_nexthops(fi);
fi->fib_power = power;
if (power <= 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3daf0224ff2e..37c4bb89a708 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -72,6 +72,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -79,6 +80,7 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/switchdev.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
@@ -88,38 +90,35 @@
typedef unsigned int t_key;
-#define IS_TNODE(n) ((n)->bits)
-#define IS_LEAF(n) (!(n)->bits)
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
-
-struct tnode {
+struct key_vector {
t_key key;
- unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
union {
- /* The fields in this struct are valid if bits > 0 (TNODE) */
- struct {
- t_key empty_children; /* KEYLENGTH bits needed */
- t_key full_children; /* KEYLENGTH bits needed */
- struct tnode __rcu *child[0];
- };
- /* This list pointer if valid if bits == 0 (LEAF) */
- struct hlist_head list;
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ struct key_vector __rcu *tnode[0];
};
};
-struct leaf_info {
- struct hlist_node hlist;
- int plen;
- u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
- struct list_head falh;
+struct tnode {
struct rcu_head rcu;
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
};
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -142,13 +141,13 @@ struct trie_stat {
};
struct trie {
- struct tnode __rcu *trie;
+ struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
-static void resize(struct trie *t, struct tnode *tn);
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static size_t tnode_free_size;
/*
@@ -161,41 +160,46 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+ return container_of(kv, struct tnode, kv[0]);
+}
+
/* caller must hold RTNL */
-#define node_parent(n) rtnl_dereference((n)->parent)
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
/* caller must hold RCU read lock or RTNL */
-#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
/* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct tnode *n, struct tnode *tp)
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
if (n)
- rcu_assign_pointer(n->parent, tp);
+ rcu_assign_pointer(tn_info(n)->parent, tp);
}
-#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
/* This provides us with the number of children in this node, in the case of a
* leaf this will return 0 meaning none of the children are accessible.
*/
-static inline unsigned long tnode_child_length(const struct tnode *tn)
+static inline unsigned long child_length(const struct key_vector *tn)
{
return (1ul << tn->bits) & ~(1ul);
}
-/* caller must hold RTNL */
-static inline struct tnode *tnode_get_child(const struct tnode *tn,
- unsigned long i)
-{
- return rtnl_dereference(tn->child[i]);
-}
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
-/* caller must hold RCU read lock or RTNL */
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
- unsigned long i)
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
- return rcu_dereference_rtnl(tn->child[i]);
+ unsigned long index = key ^ kv->key;
+
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
+
+ return index >> kv->pos;
}
/* To understand this stuff, an understanding of keys and all their bits is
@@ -274,106 +278,108 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
}
#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
+ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
static void __node_free_rcu(struct rcu_head *head)
{
struct tnode *n = container_of(head, struct tnode, rcu);
- if (IS_LEAF(n))
+ if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->bits <= TNODE_KMALLOC_MAX)
+ else if (n->tn_bits <= TNODE_KMALLOC_MAX)
kfree(n);
else
vfree(n);
}
-#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
-static inline void free_leaf_info(struct leaf_info *leaf)
+static struct tnode *tnode_alloc(int bits)
{
- kfree_rcu(leaf, rcu);
-}
+ size_t size;
+
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
-static struct tnode *tnode_alloc(size_t size)
-{
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
}
-static inline void empty_child_inc(struct tnode *n)
+static inline void empty_child_inc(struct key_vector *n)
{
- ++n->empty_children ? : ++n->full_children;
+ ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
}
-static inline void empty_child_dec(struct tnode *n)
+static inline void empty_child_dec(struct key_vector *n)
{
- n->empty_children-- ? : n->full_children--;
+ tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
}
-static struct tnode *leaf_new(t_key key)
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
- if (l) {
- l->parent = NULL;
- /* set key and pos to reflect full key value
- * any trailing zeros in the key should be ignored
- * as the nodes are searched
- */
- l->key = key;
- l->slen = 0;
- l->pos = 0;
- /* set bits to 0 indicating we are not a tnode */
- l->bits = 0;
+ struct key_vector *l;
+ struct tnode *kv;
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
-}
+ kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ if (!kv)
+ return NULL;
-static struct leaf_info *leaf_info_new(int plen)
-{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- li->mask_plen = ntohl(inet_make_mask(plen));
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ /* initialize key vector */
+ l = kv->kv;
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
+
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
+
+ return l;
}
-static struct tnode *tnode_new(t_key key, int pos, int bits)
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = offsetof(struct tnode, child[1ul << bits]);
- struct tnode *tn = tnode_alloc(sz);
unsigned int shift = pos + bits;
+ struct key_vector *tn;
+ struct tnode *tnode;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
- if (tn) {
- tn->parent = NULL;
- tn->slen = pos;
- tn->pos = pos;
- tn->bits = bits;
- tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
- if (bits == KEYLENGTH)
- tn->full_children = 1;
- else
- tn->empty_children = 1ul << bits;
- }
+ tnode = tnode_alloc(bits);
+ if (!tnode)
+ return NULL;
+
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn = tnode->kv;
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
- pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct tnode *) << bits);
return tn;
}
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
@@ -381,17 +387,18 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
/* Add a child at position i overwriting the old value.
* Update the value of full_children and empty_children.
*/
-static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
{
- struct tnode *chi = tnode_get_child(tn, i);
+ struct key_vector *chi = get_child(tn, i);
int isfull, wasfull;
- BUG_ON(i >= tnode_child_length(tn));
+ BUG_ON(i >= child_length(tn));
/* update emptyChildren, overflow into fullChildren */
- if (n == NULL && chi != NULL)
+ if (!n && chi)
empty_child_inc(tn);
- if (n != NULL && chi == NULL)
+ if (n && !chi)
empty_child_dec(tn);
/* update fullChildren */
@@ -399,23 +406,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
- tn->full_children--;
+ tn_info(tn)->full_children--;
else if (!wasfull && isfull)
- tn->full_children++;
+ tn_info(tn)->full_children++;
if (n && (tn->slen < n->slen))
tn->slen = n->slen;
- rcu_assign_pointer(tn->child[i], n);
+ rcu_assign_pointer(tn->tnode[i], n);
}
-static void update_children(struct tnode *tn)
+static void update_children(struct key_vector *tn)
{
unsigned long i;
/* update all of the child parent pointers */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
if (!inode)
continue;
@@ -431,36 +438,37 @@ static void update_children(struct tnode *tn)
}
}
-static inline void put_child_root(struct tnode *tp, struct trie *t,
- t_key key, struct tnode *n)
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
{
- if (tp)
- put_child(tp, get_index(key, tp), n);
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
else
- rcu_assign_pointer(t->trie, n);
+ put_child(tp, get_index(key, tp), n);
}
-static inline void tnode_free_init(struct tnode *tn)
+static inline void tnode_free_init(struct key_vector *tn)
{
- tn->rcu.next = NULL;
+ tn_info(tn)->rcu.next = NULL;
}
-static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
{
- n->rcu.next = tn->rcu.next;
- tn->rcu.next = &n->rcu;
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}
-static void tnode_free(struct tnode *tn)
+static void tnode_free(struct key_vector *tn)
{
- struct callback_head *head = &tn->rcu;
+ struct callback_head *head = &tn_info(tn)->rcu;
while (head) {
head = head->next;
- tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
node_free(tn);
- tn = container_of(head, struct tnode, rcu);
+ tn = container_of(head, struct tnode, rcu)->kv;
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -469,14 +477,16 @@ static void tnode_free(struct tnode *tn)
}
}
-static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
{
- struct tnode *tp = node_parent(oldtnode);
+ struct key_vector *tp = node_parent(oldtnode);
unsigned long i;
/* setup the parent pointer out of and back into this node */
NODE_INIT_PARENT(tn, tp);
- put_child_root(tp, t, tn->key, tn);
+ put_child_root(tp, tn->key, tn);
/* update all of the child parent pointers */
update_children(tn);
@@ -485,18 +495,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
tnode_free(oldtnode);
/* resize children now that oldtnode is freed */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
/* resize child node */
if (tnode_full(tn, inode))
- resize(t, inode);
+ tn = resize(t, inode);
}
+
+ return tp;
}
-static int inflate(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
t_key m;
@@ -504,7 +517,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -514,13 +527,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
- struct tnode *inode = tnode_get_child(oldtnode, --i);
- struct tnode *node0, *node1;
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
unsigned long j, k;
/* An empty child */
- if (inode == NULL)
+ if (!inode)
continue;
/* A leaf or an internal node with skipped bits */
@@ -534,8 +547,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
/* An internal node with two children */
if (inode->bits == 1) {
- put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
- put_child(tn, 2 * i, tnode_get_child(inode, 0));
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
continue;
}
@@ -564,11 +577,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tnode_free_append(tn, node0);
/* populate child pointers in new nodes */
- for (k = tnode_child_length(inode), j = k / 2; j;) {
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
}
/* link new nodes to parent */
@@ -581,25 +594,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
- return -ENOMEM;
+notnode:
+ return NULL;
}
-static int halve(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -609,10 +622,10 @@ static int halve(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode); i;) {
- struct tnode *node1 = tnode_get_child(oldtnode, --i);
- struct tnode *node0 = tnode_get_child(oldtnode, --i);
- struct tnode *inode;
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
/* At least one of the children is empty */
if (!node1 || !node0) {
@@ -622,10 +635,8 @@ static int halve(struct trie *t, struct tnode *oldtnode)
/* Two nonempty children */
inode = tnode_new(node0->key, oldtnode->pos, 1);
- if (!inode) {
- tnode_free(tn);
- return -ENOMEM;
- }
+ if (!inode)
+ goto nomem;
tnode_free_append(tn, inode);
/* initialize pointers out of node */
@@ -638,30 +649,36 @@ static int halve(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
}
-static void collapse(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *n, *tp;
+ struct key_vector *n, *tp;
unsigned long i;
/* scan the tnode looking for that one child that might still exist */
- for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
- n = tnode_get_child(oldtnode, --i);
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
/* compress one level */
tp = node_parent(oldtnode);
- put_child_root(tp, t, oldtnode->key, n);
+ put_child_root(tp, oldtnode->key, n);
node_set_parent(n, tp);
/* drop dead node */
node_free(oldtnode);
+
+ return tp;
}
-static unsigned char update_suffix(struct tnode *tn)
+static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
@@ -671,8 +688,8 @@ static unsigned char update_suffix(struct tnode *tn)
* why we start with a stride of 2 since a stride of 1 would
* represent the nodes with suffix length equal to tn->pos
*/
- for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
- struct tnode *n = tnode_get_child(tn, i);
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
if (!n || (n->slen <= slen))
continue;
@@ -704,12 +721,12 @@ static unsigned char update_suffix(struct tnode *tn)
*
* 'high' in this instance is the variable 'inflate_threshold'. It
* is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
+ * child_length() and instead of multiplying by 2 (since the
* child array will be doubled by inflate()) and multiplying
* the left-hand side by 100 (to handle the percentage thing) we
* multiply the left-hand side by 50.
*
- * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * The left-hand side may look a bit weird: child_length(tn)
* - tn->empty_children is of course the number of non-null children
* in the current node. tn->full_children is the number of "full"
* children, that is non-null tnodes with a skip value of 0.
@@ -719,10 +736,10 @@ static unsigned char update_suffix(struct tnode *tn)
* A clearer way to write this would be:
*
* to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
* tn->full_children;
*
- * new_child_length = tnode_child_length(tn) * 2;
+ * new_child_length = child_length(tn) * 2;
*
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
* new_child_length;
@@ -739,57 +756,57 @@ static unsigned char update_suffix(struct tnode *tn)
* inflate_threshold * new_child_length
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
+ * inflate_threshold * child_length(tn) * 2
*
* shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
+ * 50 * (tn->full_children + child_length(tn) -
* tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
+ * child_length(tn)
*
*/
-static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? inflate_threshold : inflate_threshold_root;
- used -= tn->empty_children;
- used += tn->full_children;
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
/* if bits == KEYLENGTH then pos = 0, and will fail below */
return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
-static bool should_halve(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? halve_threshold : halve_threshold_root;
- used -= tn->empty_children;
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}
-static bool should_collapse(const struct tnode *tn)
+static inline bool should_collapse(struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
- used -= tn->empty_children;
+ used -= tn_info(tn)->empty_children;
/* account for bits == KEYLENGTH case */
- if ((tn->bits == KEYLENGTH) && tn->full_children)
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
used -= KEY_MAX;
/* One child or none, time to drop us from the trie */
@@ -797,10 +814,13 @@ static bool should_collapse(const struct tnode *tn)
}
#define MAX_WORK 10
-static void resize(struct trie *t, struct tnode *tn)
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp = node_parent(tn);
- struct tnode __rcu **cptr;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
int max_work = MAX_WORK;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
@@ -810,183 +830,128 @@ static void resize(struct trie *t, struct tnode *tn)
* doing it ourselves. This way we can let RCU fully do its
* thing without us interfering
*/
- cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
- BUG_ON(tn != rtnl_dereference(*cptr));
+ BUG_ON(tn != get_child(tp, cindex));
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
while (should_inflate(tp, tn) && max_work) {
- if (inflate(t, tn)) {
+ tp = inflate(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
+ /* update parent in case inflate failed */
+ tp = node_parent(tn);
+
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return;
+ return tp;
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
while (should_halve(tp, tn) && max_work) {
- if (halve(t, tn)) {
+ tp = halve(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
/* Only one child remains */
- if (should_collapse(tn)) {
- collapse(t, tn);
- return;
- }
+ if (should_collapse(tn))
+ return collapse(t, tn);
+
+ /* update parent in case halve failed */
+ tp = node_parent(tn);
/* Return if at least one deflate was run */
if (max_work != MAX_WORK)
- return;
+ return tp;
/* push the suffix length to the parent node */
if (tn->slen > tn->pos) {
unsigned char slen = update_suffix(tn);
- if (tp && (slen > tp->slen))
+ if (slen > tp->slen)
tp->slen = slen;
}
-}
-
-/* readside must use rcu_read_lock currently dump routines
- via get_fa_head and dump */
-
-static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
-{
- struct hlist_head *head = &l->list;
- struct leaf_info *li;
-
- hlist_for_each_entry_rcu(li, head, hlist)
- if (li->plen == plen)
- return li;
-
- return NULL;
-}
-
-static inline struct list_head *get_fa_head(struct tnode *l, int plen)
-{
- struct leaf_info *li = find_leaf_info(l, plen);
-
- if (!li)
- return NULL;
- return &li->falh;
+ return tp;
}
-static void leaf_pull_suffix(struct tnode *l)
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
{
- struct tnode *tp = node_parent(l);
-
- while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
if (update_suffix(tp) > l->slen)
break;
tp = node_parent(tp);
}
}
-static void leaf_push_suffix(struct tnode *l)
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
{
- struct tnode *tn = node_parent(l);
-
/* if this is a new leaf then tn will be NULL and we can sort
* out parent suffix lengths as a part of trie_rebalance
*/
- while (tn && (tn->slen < l->slen)) {
+ while (tn->slen < l->slen) {
tn->slen = l->slen;
tn = node_parent(tn);
}
}
-static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
-{
- /* record the location of the previous list_info entry */
- struct hlist_node **pprev = old->hlist.pprev;
- struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
-
- /* remove the leaf info from the list */
- hlist_del_rcu(&old->hlist);
-
- /* only access li if it is pointing at the last valid hlist_node */
- if (hlist_empty(&l->list) || (*pprev))
- return;
-
- /* update the trie with the latest suffix length */
- l->slen = KEYLENGTH - li->plen;
- leaf_pull_suffix(l);
-}
-
-static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
{
- struct hlist_head *head = &l->list;
- struct leaf_info *li = NULL, *last = NULL;
-
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&new->hlist, head);
- } else {
- hlist_for_each_entry(li, head, hlist) {
- if (new->plen > li->plen)
- break;
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
- last = li;
- }
- if (last)
- hlist_add_behind_rcu(&new->hlist, &last->hlist);
- else
- hlist_add_before_rcu(&new->hlist, &li->hlist);
- }
-
- /* if we added to the tail node then we need to update slen */
- if (l->slen < (KEYLENGTH - new->plen)) {
- l->slen = KEYLENGTH - new->plen;
- leaf_push_suffix(l);
- }
-}
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
-/* rcu_read_lock needs to be hold by caller from readside */
-static struct tnode *fib_find_node(struct trie *t, u32 key)
-{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ if (!n)
+ break;
- while (n) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
- return NULL;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n))
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
break;
+ }
- n = tnode_get_child_rcu(n, index);
- }
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
+
+ *tp = pn;
return n;
}
@@ -994,14 +959,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
*/
-static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ u8 tos, u32 prio, u32 tb_id)
{
struct fib_alias *fa;
if (!fah)
return NULL;
- list_for_each_entry(fa, fah, fa_list) {
+ hlist_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
+ break;
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1011,77 +985,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-static void trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp;
-
- while ((tp = node_parent(tn)) != NULL) {
- resize(t, tn);
- tn = tp;
- }
-
- /* Handle last (top) tnode */
- if (IS_TNODE(tn))
- resize(t, tn);
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
}
-/* only used from updater-side */
-
-static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
{
- struct list_head *fa_head = NULL;
- struct tnode *l, *n, *tp = NULL;
- struct leaf_info *li;
+ struct key_vector *n, *l;
- li = leaf_info_new(plen);
- if (!li)
- return NULL;
- fa_head = &li->falh;
-
- n = rtnl_dereference(t->trie);
-
- /* If we point to NULL, stop. Either the tree is empty and we should
- * just put a new leaf in if, or we have reached an empty child slot,
- * and we should just put our new leaf in that.
- *
- * If we hit a node with a key that does't match then we should stop
- * and create a new tnode to replace that node and insert ourselves
- * and the other node into the new tnode.
- */
- while (n) {
- unsigned long index = get_index(key, n);
-
- /* This bit of code is a bit tricky but it combines multiple
- * checks into a single check. The prefix consists of the
- * prefix plus zeros for the "bits" in the prefix. The index
- * is the difference between the key and this value. From
- * this we can actually derive several pieces of data.
- * if !(index >> bits)
- * we know the value is child index
- * else
- * we have a mismatch in skip bits and failed
- */
- if (index >> n->bits)
- break;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n)) {
- /* Case 1: n is a leaf, and prefixes match*/
- insert_leaf_info(n, li);
- return fa_head;
- }
-
- tp = n;
- n = tnode_get_child_rcu(n, index);
- }
-
- l = leaf_new(key);
- if (!l) {
- free_leaf_info(li);
- return NULL;
- }
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
- insert_leaf_info(l, li);
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
@@ -1090,21 +1010,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* leaves us in position for handling as case 3
*/
if (n) {
- struct tnode *tn;
+ struct key_vector *tn;
tn = tnode_new(key, __fls(key ^ n->key), 1);
- if (!tn) {
- free_leaf_info(li);
- node_free(l);
- return NULL;
- }
+ if (!tn)
+ goto notnode;
/* initialize routes out of node */
NODE_INIT_PARENT(tn, tp);
put_child(tn, get_index(key, tn) ^ 1, n);
/* start adding routes into the node */
- put_child_root(tp, t, key, tn);
+ put_child_root(tp, key, tn);
node_set_parent(n, tn);
/* parent now has a NULL spot where the leaf can go */
@@ -1112,69 +1029,94 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
}
/* Case 3: n is NULL, and will just insert a new leaf */
- if (tp) {
- NODE_INIT_PARENT(l, tp);
- put_child(tp, get_index(key, tp), l);
- trie_rebalance(t, tp);
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else {
- rcu_assign_pointer(t->trie, l);
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
+ }
+
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
+ }
+
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ leaf_push_suffix(tp, l);
}
- return fa_head;
+ return 0;
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
- struct list_head *fa_head = NULL;
+ struct key_vector *l, *tp;
+ unsigned int nlflags = 0;
struct fib_info *fi;
- int plen = cfg->fc_dst_len;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos;
- u32 key, mask;
+ u32 key;
int err;
- struct tnode *l;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl(inet_make_mask(plen));
-
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
-
fi = fib_create_info(cfg);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
- l = fib_find_node(t, key);
- fa = NULL;
-
- if (l) {
- fa_head = get_fa_head(l, plen);
- fa = fib_find_alias(fa_head, tos, fi->fib_priority);
- }
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ tb->tb_id) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
*/
if (fa && fa->fa_tos == tos &&
@@ -1192,9 +1134,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
*/
fa_match = NULL;
fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
- if (fa->fa_tos != tos)
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1217,7 +1160,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
+ if (!new_fa)
goto out;
fi_drop = fa->fa_info;
@@ -1226,8 +1169,23 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+ new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
+
+ err = switchdev_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ switchdev_fib_ipv4_abort(fi);
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1245,7 +1203,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (fa_match)
goto out;
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
+ if (cfg->fc_nlflags & NLM_F_APPEND)
+ nlflags = NLM_F_APPEND;
+ else
fa = fa_first;
}
err = -ENOENT;
@@ -1254,37 +1214,41 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
+ if (!new_fa)
goto out;
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
- /*
- * Insert new entry to the list.
- */
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
- if (!fa_head) {
- fa_head = fib_insert_node(t, key, plen);
- if (unlikely(!fa_head)) {
- err = -ENOMEM;
- goto out_free_new_fa;
- }
+ /* (Optionally) offload fib entry to switch hardware. */
+ err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
+ cfg->fc_nlflags, tb->tb_id);
+ if (err) {
+ switchdev_fib_ipv4_abort(fi);
+ goto out_free_new_fa;
}
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_sw_fib_del;
+
if (!plen)
tb->tb_num_default++;
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
-
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
- &cfg->fc_nlinfo, 0);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
+ &cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
+out_sw_fib_del:
+ switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1293,7 +1257,7 @@ err:
return err;
}
-static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
t_key prefix = n->key;
@@ -1304,16 +1268,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
- struct trie *t = (struct trie *)tb->tb_data;
+ struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
const t_key key = ntohl(flp->daddr);
- struct tnode *n, *pn;
- struct leaf_info *li;
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
t_key cindex;
- n = rcu_dereference(t->trie);
+ pn = t->kv;
+ cindex = 0;
+
+ n = get_child_rcu(pn, cindex);
if (!n)
return -EAGAIN;
@@ -1321,24 +1289,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
this_cpu_inc(stats->gets);
#endif
- pn = n;
- cindex = 0;
-
/* Step 1: Travel to the longest prefix match in the trie */
for (;;) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
+ if (index >= (1ul << n->bits))
break;
/* we have found a leaf. Prefixes have already been compared */
@@ -1353,7 +1322,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
cindex = index;
}
- n = tnode_get_child_rcu(n, index);
+ n = get_child_rcu(n, index);
if (unlikely(!n))
goto backtrace;
}
@@ -1361,7 +1330,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 2: Sort out leaves and begin backtracing for longest prefix */
for (;;) {
/* record the pointer where our next node pointer is stored */
- struct tnode __rcu **cptr = n->child;
+ struct key_vector __rcu **cptr = n->tnode;
/* This test verifies that none of the bits that differ
* between the key and the prefix exist in the region of
@@ -1393,13 +1362,17 @@ backtrace:
while (!cindex) {
t_key pkey = pn->key;
- pn = node_parent_rcu(pn);
- if (unlikely(!pn))
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn))
return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
/* Get Child's index */
+ pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn);
}
@@ -1407,138 +1380,140 @@ backtrace:
cindex &= cindex - 1;
/* grab pointer for next child node */
- cptr = &pn->child[cindex];
+ cptr = &pn->tnode[cindex];
}
}
found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
+
/* Step 3: Process the leaf, if that fails fall back to backtracing */
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
- if ((key ^ n->key) & li->mask_plen)
+ if ((index >= (1ul << fa->fa_slen)) &&
+ ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+ struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- struct fib_info *fi = fa->fa_info;
- int nhsel, err;
-
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (nh->nh_flags & RTNH_F_DEAD)
continue;
- if (fi->fib_dead)
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN &&
+ !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
- if (fa->fa_info->fib_scope < flp->flowi4_scope)
- continue;
- fib_alias_accessed(fa);
- err = fib_props[fa->fa_type].error;
- if (unlikely(err < 0)) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
-#endif
- return err;
- }
- if (fi->fib_flags & RTNH_F_DEAD)
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
- continue;
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&fi->fib_clntref);
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
- res->prefixlen = li->plen;
- res->nh_sel = nhsel;
- res->type = fa->fa_type;
- res->scope = fi->fib_scope;
- res->fi = fi;
- res->table = tb;
- res->fa_head = &li->falh;
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- return err;
- }
+ return err;
}
-
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_miss);
+ this_cpu_inc(stats->semantic_match_miss);
#endif
- }
goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
-/*
- * Remove the leaf and return parent.
- */
-static void trie_leaf_remove(struct trie *t, struct tnode *l)
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
{
- struct tnode *tp = node_parent(l);
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
- pr_debug("entering trie_leaf_remove(%p)\n", l);
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
- if (tp) {
- put_child(tp, get_index(l->key, tp), NULL);
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->leaf)) {
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
trie_rebalance(t, tp);
- } else {
- RCU_INIT_POINTER(t->trie, NULL);
+ return;
}
- node_free(l);
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
+
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ leaf_pull_suffix(tp, l);
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
- u32 key, mask;
- int plen = cfg->fc_dst_len;
- u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
- struct list_head *fa_head;
- struct tnode *l;
- struct leaf_info *li;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
- mask = ntohl(inet_make_mask(plen));
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
- l = fib_find_node(t, key);
-
+ l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
- li = find_leaf_info(l, plen);
-
- if (!li)
- return -ESRCH;
-
- fa_head = &li->falh;
- fa = fib_find_alias(fa_head, tos, 0);
-
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
if (!fa)
return -ESRCH;
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
+ hlist_for_each_entry_from(fa, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (fa->fa_tos != tos)
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -1557,240 +1532,391 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
- &cfg->fc_nlinfo, 0);
+ switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
- list_del_rcu(&fa->fa_list);
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
if (!plen)
tb->tb_num_default--;
- if (list_empty(fa_head)) {
- remove_leaf_info(l, li);
- free_leaf_info(li);
- }
-
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, l);
+ fib_remove_alias(t, tp, l, fa_to_delete);
- if (fa->fa_state & FA_S_ACCESSED)
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
return 0;
}
-static int trie_flush_list(struct list_head *head)
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
- struct fib_alias *fa, *fa_node;
- int found = 0;
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
- list_for_each_entry_safe(fa, fa_node, head, fa_list) {
- struct fib_info *fi = fa->fa_info;
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = key ? get_index(key, pn) : 0;
- if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
- found++;
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
}
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
}
- return found;
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
}
-static int trie_flush_leaf(struct tnode *l)
+static void fib_trie_free(struct fib_table *tb)
{
- int found = 0;
- struct hlist_head *lih = &l->list;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
struct hlist_node *tmp;
- struct leaf_info *li = NULL;
- unsigned char plen = KEYLENGTH;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
- hlist_for_each_entry_safe(li, tmp, lih, hlist) {
- found += trie_flush_list(&li->falh);
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
+
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
- if (list_empty(&li->falh)) {
- hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
continue;
}
- plen = li->plen;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- l->slen = KEYLENGTH - plen;
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- return found;
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ }
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
}
-/*
- * Scan for the next right leaf starting at node p->child[idx]
- * Since we have back pointer, no recursion necessary.
- */
-static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
- do {
- unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
+
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
- while (idx < tnode_child_length(p)) {
- c = tnode_get_child_rcu(p, idx++);
- if (!c)
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
continue;
- if (IS_LEAF(c))
- return c;
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ memcpy(new_fa, fa, sizeof(*fa));
- /* Rescan start scanning in new node */
- p = c;
- idx = 0;
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key))
+ goto out;
}
- /* Node empty, walk back up to parent */
- c = p;
- } while ((p = node_parent_rcu(c)) != NULL);
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
+ }
- return NULL; /* Root of trie */
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
+
+ return NULL;
}
-static struct tnode *trie_firstleaf(struct trie *t)
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
- if (!n)
- return NULL;
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- if (IS_LEAF(n)) /* trie is just a leaf */
- return n;
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- return leaf_walk_rcu(n, NULL);
-}
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
-static struct tnode *trie_nextleaf(struct tnode *l)
-{
- struct tnode *p = node_parent_rcu(l);
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
- if (!p)
- return NULL; /* trie with just one leaf */
+ continue;
+ }
- return leaf_walk_rcu(p, l);
-}
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
-static struct tnode *trie_leafindex(struct trie *t, int index)
-{
- struct tnode *l = trie_firstleaf(t);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
- while (l && index-- > 0)
- l = trie_nextleaf(l);
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
- return l;
-}
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+ /* record local slen */
+ slen = fa->fa_slen;
-/*
- * Caller must hold RTNL.
- */
+ if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
+ continue;
+
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+ }
+}
+
+/* Caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
- struct trie *t = (struct trie *) tb->tb_data;
- struct tnode *l, *ll = NULL;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
int found = 0;
- for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
- found += trie_flush_leaf(l);
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
}
- ll = l;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
}
pr_debug("trie_flush found=%d\n", found);
return found;
}
-void fib_free_table(struct fib_table *tb)
+static void __trie_free_rcu(struct rcu_head *head)
{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie *t = (struct trie *)tb->tb_data;
- free_percpu(t->stats);
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
- struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+void fib_free_table(struct fib_table *tb)
{
- int i, s_i;
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ __be32 xkey = htonl(l->key);
struct fib_alias *fa;
- __be32 xkey = htonl(key);
+ int i, s_i;
- s_i = cb->args[5];
+ s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
-
- list_for_each_entry_rcu(fa, fah, fa_list) {
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (i < s_i) {
i++;
continue;
}
+ if (tb->tb_id != fa->tb_id) {
+ i++;
+ continue;
+ }
+
if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE,
tb->tb_id,
fa->fa_type,
xkey,
- plen,
+ KEYLENGTH - fa->fa_slen,
fa->fa_tos,
fa->fa_info, NLM_F_MULTI) < 0) {
- cb->args[5] = i;
- return -1;
- }
- i++;
- }
- cb->args[5] = i;
- return skb->len;
-}
-
-static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct leaf_info *li;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
-
- /* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- if (i < s_i) {
- i++;
- continue;
- }
-
- if (i > s_i)
- cb->args[5] = 0;
-
- if (list_empty(&li->falh))
- continue;
-
- if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
cb->args[4] = i;
return -1;
}
@@ -1801,44 +1927,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
return skb->len;
}
+/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct tnode *l;
- struct trie *t = (struct trie *) tb->tb_data;
- t_key key = cb->args[2];
- int count = cb->args[3];
-
- rcu_read_lock();
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
- if (count == 0)
- l = trie_firstleaf(t);
- else {
- /* Normally, continue from last key, but if that is missing
- * fallback to using slow rescan
- */
- l = fib_find_node(t, key);
- if (!l)
- l = trie_leafindex(t, count);
- }
+ int count = cb->args[2];
+ t_key key = cb->args[3];
- while (l) {
- cb->args[2] = l->key;
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
- cb->args[3] = count;
- rcu_read_unlock();
+ cb->args[3] = key;
+ cb->args[2] = count;
return -1;
}
++count;
- l = trie_nextleaf(l);
+ key = l->key + 1;
+
memset(&cb->args[4], 0,
sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
}
- cb->args[3] = count;
- rcu_read_unlock();
+
+ cb->args[3] = key;
+ cb->args[2] = count;
return skb->len;
}
@@ -1850,28 +1970,33 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
- max(sizeof(struct tnode),
- sizeof(struct leaf_info)),
+ LEAF_SIZE,
0, SLAB_PANIC, NULL);
}
-
-struct fib_table *fib_trie_table(u32 id)
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
struct trie *t;
+ size_t sz = sizeof(*tb);
+
+ if (!alias)
+ sz += sizeof(struct trie);
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
- GFP_KERNEL);
- if (tb == NULL)
+ tb = kzalloc(sz, GFP_KERNEL);
+ if (!tb)
return NULL;
tb->tb_id = id;
- tb->tb_default = -1;
tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
+
+ if (alias)
+ return tb;
t = (struct trie *) tb->tb_data;
- RCU_INIT_POINTER(t->trie, NULL);
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats = alloc_percpu(struct trie_use_stats);
if (!t->stats) {
@@ -1888,65 +2013,64 @@ struct fib_table *fib_trie_table(u32 id)
struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
- struct tnode *tnode;
+ struct key_vector *tnode;
unsigned int index;
unsigned int depth;
};
-static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
unsigned long cindex = iter->index;
- struct tnode *tn = iter->tnode;
- struct tnode *p;
-
- /* A single entry routing table */
- if (!tn)
- return NULL;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
-rescan:
- while (cindex < tnode_child_length(tn)) {
- struct tnode *n = tnode_get_child_rcu(tn, cindex);
- if (n) {
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
if (IS_LEAF(n)) {
- iter->tnode = tn;
- iter->index = cindex + 1;
+ iter->tnode = pn;
+ iter->index = cindex;
} else {
/* push down one level */
iter->tnode = n;
iter->index = 0;
++iter->depth;
}
+
return n;
}
- ++cindex;
- }
-
- /* Current node exhausted, pop back up */
- p = node_parent_rcu(tn);
- if (p) {
- cindex = get_index(tn->key, p) + 1;
- tn = p;
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
--iter->depth;
- goto rescan;
}
- /* got root? */
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
return NULL;
}
-static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
- struct trie *t)
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- struct tnode *n;
+ struct key_vector *n, *pn;
if (!t)
return NULL;
- n = rcu_dereference(t->trie);
+ pn = t->kv;
+ n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
@@ -1955,7 +2079,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
iter->index = 0;
iter->depth = 1;
} else {
- iter->tnode = NULL;
+ iter->tnode = pn;
iter->index = 0;
iter->depth = 0;
}
@@ -1965,7 +2089,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct tnode *n;
+ struct key_vector *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -1973,20 +2097,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
- struct leaf_info *li;
+ struct fib_alias *fa;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, &n->list, hlist)
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
++s->prefixes;
} else {
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
- s->nullpointers += n->empty_children;
+ s->nullpointers += tn_info(n)->empty_children;
}
}
rcu_read_unlock();
@@ -2009,13 +2133,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- bytes = sizeof(struct tnode) * stat->leaves;
+ bytes = LEAF_SIZE * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
- bytes += sizeof(struct leaf_info) * stat->prefixes;
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
+ bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0)
@@ -2030,7 +2154,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct tnode *) * pointers;
+ bytes += sizeof(struct key_vector *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2084,7 +2208,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct tnode), sizeof(struct tnode));
+ LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2123,7 +2247,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2135,7 +2259,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- struct tnode *n;
+ struct key_vector *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2164,7 +2288,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct tnode *n;
+ struct key_vector *n;
++*pos;
/* next node in same table */
@@ -2250,9 +2374,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct tnode *n = v;
+ struct key_vector *n = v;
- if (!node_parent_rcu(n))
+ if (IS_TRIE(node_parent_rcu(n)))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
@@ -2261,30 +2385,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
&prf, KEYLENGTH - n->pos - n->bits, n->bits,
- n->full_children, n->empty_children);
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
} else {
- struct leaf_info *li;
__be32 val = htonl(n->key);
+ struct fib_alias *fa;
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- char buf1[32], buf2[32];
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", li->plen,
- rtn_scope(buf1, sizeof(buf1),
- fa->fa_info->fib_scope),
- rtn_type(buf2, sizeof(buf2),
- fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
- seq_putc(seq, '\n');
- }
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
@@ -2314,31 +2436,47 @@ static const struct file_operations fib_trie_fops = {
struct fib_route_iter {
struct seq_net_private p;
- struct trie *main_trie;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
loff_t pos;
t_key key;
};
-static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
{
- struct tnode *l = NULL;
- struct trie *t = iter->main_trie;
+ struct fib_table *tb = iter->main_tb;
+ struct key_vector *l, **tp = &iter->tnode;
+ struct trie *t;
+ t_key key;
- /* use cache location of last found key */
- if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+ /* use cache location of next-to-find key */
+ if (iter->pos > 0 && pos >= iter->pos) {
pos -= iter->pos;
- else {
+ key = iter->key;
+ } else {
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
iter->pos = 0;
- l = trie_firstleaf(t);
+ key = 0;
}
- while (l && pos-- > 0) {
+ while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+
+ if (pos-- <= 0)
+ break;
+
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
}
if (l)
- iter->key = pos; /* remember it */
+ iter->key = key; /* remember it */
else
iter->pos = 0; /* forget it */
@@ -2350,37 +2488,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct fib_table *tb;
+ struct trie *t;
rcu_read_lock();
+
tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
if (!tb)
return NULL;
- iter->main_trie = (struct trie *) tb->tb_data;
- if (*pos == 0)
- return SEQ_START_TOKEN;
- else
- return fib_route_get_idx(iter, *pos - 1);
+ iter->main_tb = tb;
+
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ iter->key = 0;
+
+ return SEQ_START_TOKEN;
}
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
- struct tnode *l = v;
+ struct key_vector *l = NULL;
+ t_key key = iter->key;
++*pos;
- if (v == SEQ_START_TOKEN) {
- iter->pos = 0;
- l = trie_firstleaf(iter->main_trie);
- } else {
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+ } else {
+ iter->pos = 0;
}
- if (l)
- iter->key = l->key;
- else
- iter->pos = 0;
return l;
}
@@ -2412,8 +2559,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- struct tnode *l = v;
- struct leaf_info *li;
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2422,45 +2572,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- struct fib_alias *fa;
- __be32 mask, prefix;
+ prefix = htonl(l->key);
- mask = inet_make_mask(li->plen);
- prefix = htonl(l->key);
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- const struct fib_info *fi = fa->fa_info;
- unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
+ continue;
- if (fa->fa_type == RTN_BROADCAST
- || fa->fa_type == RTN_MULTICAST)
- continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
- seq_setwidth(seq, 127);
+ seq_setwidth(seq, 127);
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->nh_gw, flags, 0, 0,
- fi->fib_priority,
- mask,
- (fi->fib_advmss ?
- fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0);
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
- seq_pad(seq, '\n');
- }
+ seq_pad(seq, '\n');
}
return 0;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index ff069f6597ac..34968cd5c146 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -16,14 +16,12 @@
#include <uapi/linux/fou.h>
#include <uapi/linux/genetlink.h>
-static DEFINE_SPINLOCK(fou_lock);
-static LIST_HEAD(fou_list);
-
struct fou {
struct socket *sock;
u8 protocol;
u8 flags;
- u16 port;
+ __be16 port;
+ u16 type;
struct udp_offload udp_offloads;
struct list_head list;
};
@@ -37,6 +35,13 @@ struct fou_cfg {
struct udp_port_cfg udp_config;
};
+static unsigned int fou_net_id;
+
+struct fou_net {
+ struct list_head fou_list;
+ struct mutex fou_lock;
+};
+
static inline struct fou *fou_from_sock(struct sock *sk)
{
return sk->sk_user_data;
@@ -387,20 +392,21 @@ out_unlock:
return err;
}
-static int fou_add_to_port_list(struct fou *fou)
+static int fou_add_to_port_list(struct net *net, struct fou *fou)
{
+ struct fou_net *fn = net_generic(net, fou_net_id);
struct fou *fout;
- spin_lock(&fou_lock);
- list_for_each_entry(fout, &fou_list, list) {
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
if (fou->port == fout->port) {
- spin_unlock(&fou_lock);
+ mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
}
- list_add(&fou->list, &fou_list);
- spin_unlock(&fou_lock);
+ list_add(&fou->list, &fn->fou_list);
+ mutex_unlock(&fn->fou_lock);
return 0;
}
@@ -410,14 +416,10 @@ static void fou_release(struct fou *fou)
struct socket *sock = fou->sock;
struct sock *sk = sock->sk;
- udp_del_offload(&fou->udp_offloads);
-
+ if (sk->sk_family == AF_INET)
+ udp_del_offload(&fou->udp_offloads);
list_del(&fou->list);
-
- /* Remove hooks into tunnel socket */
- sk->sk_user_data = NULL;
-
- sock_release(sock);
+ udp_tunnel_sock_release(sock);
kfree(fou);
}
@@ -447,10 +449,10 @@ static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
static int fou_create(struct net *net, struct fou_cfg *cfg,
struct socket **sockp)
{
- struct fou *fou = NULL;
- int err;
struct socket *sock = NULL;
+ struct fou *fou = NULL;
struct sock *sk;
+ int err;
/* Open UDP socket */
err = udp_sock_create(net, &cfg->udp_config, &sock);
@@ -486,6 +488,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
goto error;
}
+ fou->type = cfg->type;
+
udp_sk(sk)->encap_type = 1;
udp_encap_enable();
@@ -502,7 +506,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
goto error;
}
- err = fou_add_to_port_list(fou);
+ err = fou_add_to_port_list(net, fou);
if (err)
goto error;
@@ -514,27 +518,27 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
error:
kfree(fou);
if (sock)
- sock_release(sock);
+ udp_tunnel_sock_release(sock);
return err;
}
static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
- struct fou *fou;
- u16 port = cfg->udp_config.local_udp_port;
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ __be16 port = cfg->udp_config.local_udp_port;
int err = -EINVAL;
+ struct fou *fou;
- spin_lock(&fou_lock);
- list_for_each_entry(fou, &fou_list, list) {
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fou, &fn->fou_list, list) {
if (fou->port == port) {
- udp_del_offload(&fou->udp_offloads);
fou_release(fou);
err = 0;
break;
}
}
- spin_unlock(&fou_lock);
+ mutex_unlock(&fn->fou_lock);
return err;
}
@@ -573,7 +577,7 @@ static int parse_nl_config(struct genl_info *info,
}
if (info->attrs[FOU_ATTR_PORT]) {
- u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]);
+ __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
cfg->udp_config.local_udp_port = port;
}
@@ -592,6 +596,7 @@ static int parse_nl_config(struct genl_info *info,
static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
{
+ struct net *net = genl_info_net(info);
struct fou_cfg cfg;
int err;
@@ -599,16 +604,119 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- return fou_create(&init_net, &cfg, NULL);
+ return fou_create(net, &cfg, NULL);
}
static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_destroy(net, &cfg);
+}
+
+static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
+{
+ if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
+ nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+ nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
+ nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
+ return -1;
+
+ if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
+ if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
+ return -1;
+ return 0;
+}
+
+static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (fou_fill_info(fou, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct sk_buff *msg;
struct fou_cfg cfg;
+ struct fou *fout;
+ __be16 port;
+ int ret;
+
+ ret = parse_nl_config(info, &cfg);
+ if (ret)
+ return ret;
+ port = cfg.udp_config.local_udp_port;
+ if (port == 0)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = -ESRCH;
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (port == fout->port) {
+ ret = fou_dump_info(fout, info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+ int idx = 0, ret;
- parse_nl_config(info, &cfg);
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, FOU_CMD_GET);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fn->fou_lock);
- return fou_destroy(&init_net, &cfg);
+ cb->args[0] = idx;
+ return skb->len;
}
static const struct genl_ops fou_nl_ops[] = {
@@ -624,6 +732,12 @@ static const struct genl_ops fou_nl_ops[] = {
.policy = fou_nl_policy,
.flags = GENL_ADMIN_PERM,
},
+ {
+ .cmd = FOU_CMD_GET,
+ .doit = fou_nl_cmd_get_port,
+ .dumpit = fou_nl_dump,
+ .policy = fou_nl_policy,
+ },
};
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
@@ -771,12 +885,12 @@ EXPORT_SYMBOL(gue_build_header);
#ifdef CONFIG_NET_FOU_IP_TUNNELS
-static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = {
+static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
};
-static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = {
+static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
};
@@ -820,38 +934,63 @@ static void ip_tunnel_encap_del_fou_ops(void)
#endif
+static __net_init int fou_init_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+
+ INIT_LIST_HEAD(&fn->fou_list);
+ mutex_init(&fn->fou_lock);
+ return 0;
+}
+
+static __net_exit void fou_exit_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fou, *next;
+
+ /* Close all the FOU sockets */
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry_safe(fou, next, &fn->fou_list, list)
+ fou_release(fou);
+ mutex_unlock(&fn->fou_lock);
+}
+
+static struct pernet_operations fou_net_ops = {
+ .init = fou_init_net,
+ .exit = fou_exit_net,
+ .id = &fou_net_id,
+ .size = sizeof(struct fou_net),
+};
+
static int __init fou_init(void)
{
int ret;
+ ret = register_pernet_device(&fou_net_ops);
+ if (ret)
+ goto exit;
+
ret = genl_register_family_with_ops(&fou_nl_family,
fou_nl_ops);
-
if (ret < 0)
- goto exit;
+ goto unregister;
ret = ip_tunnel_encap_add_fou_ops();
- if (ret < 0)
- genl_unregister_family(&fou_nl_family);
+ if (ret == 0)
+ return 0;
+ genl_unregister_family(&fou_nl_family);
+unregister:
+ unregister_pernet_device(&fou_net_ops);
exit:
return ret;
}
static void __exit fou_fini(void)
{
- struct fou *fou, *next;
-
ip_tunnel_encap_del_fou_ops();
-
genl_unregister_family(&fou_nl_family);
-
- /* Close all the FOU sockets */
-
- spin_lock(&fou_lock);
- list_for_each_entry_safe(fou, next, &fou_list, list)
- fou_release(fou);
- spin_unlock(&fou_lock);
+ unregister_pernet_device(&fou_net_ops);
}
module_init(fou_init);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve_core.c
index 5a4828ba05ad..311a4ba6950a 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve_core.c
@@ -60,11 +60,6 @@ struct geneve_net {
static int geneve_net_id;
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
- return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
static struct geneve_sock *geneve_find_sock(struct net *net,
sa_family_t family, __be16 port)
{
@@ -113,10 +108,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
int min_headroom;
int err;
- skb = udp_tunnel_handle_offloads(skb, csum);
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
@@ -131,12 +122,16 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
if (unlikely(!skb))
return -ENOMEM;
+ skb = udp_tunnel_handle_offloads(skb, csum);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
skb_set_inner_protocol(skb, htons(ETH_P_TEB));
- return udp_tunnel_xmit_skb(rt, skb, src, dst,
+ return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
tos, ttl, df, src_port, dst_port, xnet,
!csum);
}
@@ -196,7 +191,7 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
rcu_read_lock();
ptype = gro_find_receive_by_type(type);
- if (ptype == NULL) {
+ if (!ptype) {
flush = 1;
goto out_unlock;
}
@@ -230,7 +225,7 @@ static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
rcu_read_lock();
ptype = gro_find_complete_by_type(type);
- if (ptype != NULL)
+ if (ptype)
err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
rcu_read_unlock();
@@ -435,7 +430,7 @@ static int __init geneve_init_module(void)
if (rc)
return rc;
- pr_info("Geneve driver\n");
+ pr_info("Geneve core logic\n");
return 0;
}
@@ -449,5 +444,4 @@ module_exit(geneve_cleanup_module);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
-MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
-MODULE_ALIAS_RTNL_LINK("geneve");
+MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 51973ddc05a6..5aa46d4b44ef 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -149,7 +149,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
rcu_read_lock();
ptype = gro_find_receive_by_type(type);
- if (ptype == NULL)
+ if (!ptype)
goto out_unlock;
grehlen = GRE_HEADER_SECTION;
@@ -243,7 +243,7 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
rcu_read_lock();
ptype = gro_find_complete_by_type(type);
- if (ptype != NULL)
+ if (ptype)
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
rcu_read_unlock();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5e564014a0b7..f5203fba6236 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -399,7 +399,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
return;
sk = icmp_xmit_lock(net);
- if (sk == NULL)
+ if (!sk)
return;
inet = inet_sk(sk);
@@ -609,7 +609,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
skb_in->data,
sizeof(_inner_type),
&_inner_type);
- if (itp == NULL)
+ if (!itp)
goto out;
/*
@@ -627,7 +627,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
return;
sk = icmp_xmit_lock(net);
- if (sk == NULL)
+ if (!sk)
goto out_free;
/*
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 666cf364df86..651cdf648ec4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -97,6 +97,7 @@
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
@@ -369,7 +370,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(skb, NULL);
+ ip_select_ident(net, skb, NULL);
((u8 *)&pip[1])[0] = IPOPT_RA;
((u8 *)&pip[1])[1] = 4;
((u8 *)&pip[1])[2] = 0;
@@ -691,7 +692,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
- if (skb == NULL) {
+ if (!skb) {
ip_rt_put(rt);
return -1;
}
@@ -713,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(skb, NULL);
+ ip_select_ident(net, skb, NULL);
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
@@ -980,7 +981,7 @@ int igmp_rcv(struct sk_buff *skb)
int len = skb->len;
bool dropped = true;
- if (in_dev == NULL)
+ if (!in_dev)
goto drop;
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
@@ -1338,6 +1339,168 @@ out:
}
EXPORT_SYMBOL(ip_mc_inc_group);
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+ return -EINVAL;
+
+ offset += ip_hdrlen(skb) - sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ return -EINVAL;
+
+ len = skb_network_offset(skb) + ntohs(iph->tot_len);
+ if (skb->len < len || len < offset)
+ return -EINVAL;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmpv3_report);
+
+ return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmphdr);
+ if (skb->len < len)
+ return -EINVAL;
+
+ /* IGMPv{1,2}? */
+ if (skb->len != len) {
+ /* or IGMPv3? */
+ len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
+ if (skb->len < len || !pskb_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+ * all-systems destination addresses (224.0.0.1) for general queries
+ */
+ if (!igmp_hdr(skb)->group &&
+ ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ /* fall through */
+ return 0;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return ip_mc_check_igmp_reportv3(skb);
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ return ip_mc_check_igmp_query(skb);
+ default:
+ return -ENOMSG;
+ }
+}
+
+static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_simple_validate(skb);
+}
+
+static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+
+{
+ struct sk_buff *skb_chk;
+ unsigned int transport_len;
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+ int ret;
+
+ transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+
+ skb_get(skb);
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ip_mc_validate_checksum);
+ if (!skb_chk)
+ return -EINVAL;
+
+ if (!pskb_may_pull(skb_chk, len)) {
+ kfree_skb(skb_chk);
+ return -EINVAL;
+ }
+
+ ret = ip_mc_check_igmp_msg(skb_chk);
+ if (ret) {
+ kfree_skb(skb_chk);
+ return ret;
+ }
+
+ if (skb_trimmed)
+ *skb_trimmed = skb_chk;
+ else
+ kfree_skb(skb_chk);
+
+ return 0;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb network and transport headers accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an IGMP packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * The caller needs to release a reference count from any returned skb_trimmed.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+ int ret = ip_mc_check_iphdr(skb);
+
+ if (ret < 0)
+ return ret;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return -ENOMSG;
+
+ return __ip_mc_check_igmp(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
/*
* Resend IGMP JOIN report; used by netdev notifier.
*/
@@ -1849,30 +2012,28 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
pmc->sfcount[MCAST_EXCLUDE] = 1;
}
-
-/*
- * Join a multicast group
+/* Join a multicast group
*/
-int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
- int err;
__be32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml = NULL, *i;
+ struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
int ifindex;
int count = 0;
+ int err;
+
+ ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
- iml = NULL;
err = -ENODEV;
goto done;
}
@@ -1889,7 +2050,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
if (count >= sysctl_igmp_max_memberships)
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
- if (iml == NULL)
+ if (!iml)
goto done;
memcpy(&iml->multi, imr, sizeof(*imr));
@@ -1900,7 +2061,6 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
ip_mc_inc_group(in_dev, addr);
err = 0;
done:
- rtnl_unlock();
return err;
}
EXPORT_SYMBOL(ip_mc_join_group);
@@ -1911,7 +2071,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
int err;
- if (psf == NULL) {
+ if (!psf) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
@@ -1925,10 +2085,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
return err;
}
-/*
- * Ask a socket to leave a group.
- */
-
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1940,7 +2096,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
u32 ifindex;
int ret = -EADDRNOTAVAIL;
- rtnl_lock();
+ ASSERT_RTNL();
+
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
ret = -ENODEV;
@@ -1964,14 +2121,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
ip_mc_dec_group(in_dev, group);
- rtnl_unlock();
+
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
out:
- rtnl_unlock();
return ret;
}
EXPORT_SYMBOL(ip_mc_leave_group);
@@ -1993,7 +2149,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
@@ -2107,9 +2263,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
done:
- rtnl_unlock();
if (leavegroup)
- return ip_mc_leave_group(sk, &imr);
+ err = ip_mc_leave_group(sk, &imr);
return err;
}
@@ -2131,7 +2286,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
msf->imsf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
@@ -2193,7 +2348,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
- rtnl_unlock();
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
@@ -2368,7 +2522,7 @@ void ip_mc_drop_socket(struct sock *sk)
struct ip_mc_socklist *iml;
struct net *net = sock_net(sk);
- if (inet->mc_list == NULL)
+ if (!inet->mc_list)
return;
rtnl_lock();
@@ -2378,7 +2532,7 @@ void ip_mc_drop_socket(struct sock *sk)
inet->mc_list = iml->next_rcu;
in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
- if (in_dev != NULL)
+ if (in_dev)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2595,13 +2749,13 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
idev = __in_dev_get_rcu(state->dev);
- if (unlikely(idev == NULL))
+ if (unlikely(!idev))
continue;
im = rcu_dereference(idev->mc_list);
- if (likely(im != NULL)) {
+ if (likely(im)) {
spin_lock_bh(&im->lock);
psf = im->sources;
- if (likely(psf != NULL)) {
+ if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
@@ -2671,7 +2825,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
__releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
- if (likely(state->im != NULL)) {
+ if (likely(state->im)) {
spin_unlock_bh(&state->im->lock);
state->im = NULL;
}
@@ -2724,6 +2878,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
+ int err;
pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
if (!pde)
@@ -2732,8 +2887,18 @@ static int __net_init igmp_net_init(struct net *net)
&igmp_mcf_seq_fops);
if (!pde)
goto out_mcfilter;
+ err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+ SOCK_DGRAM, 0, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+ err);
+ goto out_sock;
+ }
+
return 0;
+out_sock:
+ remove_proc_entry("mcfilter", net->proc_net);
out_mcfilter:
remove_proc_entry("igmp", net->proc_net);
out_igmp:
@@ -2744,6 +2909,7 @@ static void __net_exit igmp_net_exit(struct net *net)
{
remove_proc_entry("mcfilter", net->proc_net);
remove_proc_entry("igmp", net->proc_net);
+ inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}
static struct pernet_operations igmp_net_ops = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3e44b9b0b78e..60021d0d9326 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/tcp.h>
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -98,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
+ int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
local_bh_disable();
if (!snum) {
@@ -105,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
again:
inet_get_local_port_range(net, &low, &high);
+ if (attempt_half) {
+ int half = low + ((high - low) >> 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
remaining = (high - low) + 1;
smallest_rover = rover = prandom_u32() % remaining + low;
@@ -126,11 +136,6 @@ again:
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
- !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
- snum = smallest_rover;
- goto tb_found;
- }
}
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
@@ -158,6 +163,11 @@ again:
snum = smallest_rover;
goto have_snum;
}
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto again;
+ }
goto fail;
}
/* OK, here is the one we will use. HEAD is
@@ -294,8 +304,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct sock *newsk;
struct request_sock *req;
+ struct sock *newsk;
int error;
lock_sock(sk);
@@ -324,9 +334,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
newsk = req->sk;
sk_acceptq_removed(sk);
- if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_rsk(req)->tfo_listener &&
+ queue->fastopenq) {
spin_lock_bh(&queue->fastopenq->lock);
- if (tcp_rsk(req)->listener) {
+ if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
* NULL to signify that the child socket is taken
@@ -341,7 +353,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
out:
release_sock(sk);
if (req)
- __reqsk_free(req);
+ reqsk_put(req);
return newsk;
out_err:
newsk = NULL;
@@ -400,18 +412,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
{
- struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options_rcu *opt = inet_rsk(req)->opt;
- struct net *net = sock_net(sk);
- int flags = inet_sk_flowi_flags(sk);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct ip_options_rcu *opt = ireq->opt;
+ struct rtable *rt;
- flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
- sk->sk_protocol,
- flags,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -433,9 +444,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
const struct request_sock *req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
struct inet_sock *newinet = inet_sk(newsk);
struct ip_options_rcu *opt;
- struct net *net = sock_net(sk);
struct flowi4 *fl4;
struct rtable *rt;
@@ -443,11 +454,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
rcu_read_lock();
opt = rcu_dereference(newinet->inet_opt);
- flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -475,33 +487,37 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
-#define AF_INET_FAMILY(fam) 1
+#define AF_INET_FAMILY(fam) true
#endif
-struct request_sock *inet_csk_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- const __be16 rport, const __be32 raddr,
+/* Note: this is temporary :
+ * req sock will no longer be in listener hash table
+*/
+struct request_sock *inet_csk_search_req(struct sock *sk,
+ const __be16 rport,
+ const __be32 raddr,
const __be32 laddr)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
+ struct request_sock *req;
+ u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries);
- for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
+ spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+ for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq->ir_rmt_port == rport &&
ireq->ir_rmt_addr == raddr &&
ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
+ atomic_inc(&req->rsk_refcnt);
WARN_ON(req->sk);
- *prevp = prev;
break;
}
}
+ spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return req;
}
@@ -557,23 +573,58 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-void inet_csk_reqsk_queue_prune(struct sock *parent,
- const unsigned long interval,
- const unsigned long timeout,
- const unsigned long max_rto)
+/* return true if req was found in the syn_table[] */
+static bool reqsk_queue_unlink(struct request_sock_queue *queue,
+ struct request_sock *req)
{
- struct inet_connection_sock *icsk = inet_csk(parent);
+ struct listen_sock *lopt = queue->listen_opt;
+ struct request_sock **prev;
+ bool found = false;
+
+ spin_lock(&queue->syn_wait_lock);
+
+ for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
+ prev = &(*prev)->dl_next) {
+ if (*prev == req) {
+ *prev = req->dl_next;
+ found = true;
+ break;
+ }
+ }
+
+ spin_unlock(&queue->syn_wait_lock);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ return found;
+}
+
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ reqsk_put(req);
+ }
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
+static void reqsk_timer_handler(unsigned long data)
+{
+ struct request_sock *req = (struct request_sock *)data;
+ struct sock *sk_listener = req->rsk_listener;
+ struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
- int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct request_sock **reqp, *req;
- int i, budget;
+ int qlen, expire = 0, resend = 0;
+ int max_retries, thresh;
+ u8 defer_accept;
- if (lopt == NULL || lopt->qlen == 0)
+ if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
+ reqsk_put(req);
return;
+ }
+ max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ thresh = max_retries;
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
* If synack was not acknowledged for 1 second, it means
@@ -591,67 +642,65 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
+ qlen = listen_sock_qlen(lopt);
+ if (qlen >> (lopt->max_qlen_log - 1)) {
+ int young = listen_sock_young(lopt) << 1;
while (thresh > 2) {
- if (lopt->qlen < young)
+ if (qlen < young)
break;
thresh--;
young <<= 1;
}
}
+ defer_accept = READ_ONCE(queue->rskq_defer_accept);
+ if (defer_accept)
+ max_retries = defer_accept;
+ syn_ack_recalc(req, thresh, max_retries, defer_accept,
+ &expire, &resend);
+ req->rsk_ops->syn_ack_timeout(req);
+ if (!expire &&
+ (!resend ||
+ !inet_rtx_syn_ack(sk_listener, req) ||
+ inet_rsk(req)->acked)) {
+ unsigned long timeo;
- if (queue->rskq_defer_accept)
- max_retries = queue->rskq_defer_accept;
-
- budget = 2 * (lopt->nr_table_entries / (timeout / interval));
- i = lopt->clock_hand;
-
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- int expire = 0, resend = 0;
-
- syn_ack_recalc(req, thresh, max_retries,
- queue->rskq_defer_accept,
- &expire, &resend);
- req->rsk_ops->syn_ack_timeout(parent, req);
- if (!expire &&
- (!resend ||
- !inet_rtx_syn_ack(parent, req) ||
- inet_rsk(req)->acked)) {
- unsigned long timeo;
-
- if (req->num_timeout++ == 0)
- lopt->qlen_young--;
- timeo = min(timeout << req->num_timeout,
- max_rto);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
+ if (req->num_timeout++ == 0)
+ atomic_inc(&lopt->young_dec);
+ timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+ return;
+ }
+ inet_csk_reqsk_queue_drop(sk_listener, req);
+ reqsk_put(req);
+}
- /* Drop this request */
- inet_csk_reqsk_queue_unlink(parent, req, reqp);
- reqsk_queue_removed(queue, req);
- reqsk_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
+void reqsk_queue_hash_req(struct request_sock_queue *queue,
+ u32 hash, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct listen_sock *lopt = queue->listen_opt;
- i = (i + 1) & (lopt->nr_table_entries - 1);
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
- } while (--budget > 0);
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+ smp_wmb();
+ atomic_set(&req->rsk_refcnt, 2);
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ req->rsk_hash = hash;
- lopt->clock_hand = i;
+ spin_lock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
+ lopt->syn_table[hash] = req;
+ spin_unlock(&queue->syn_wait_lock);
- if (lopt->qlen)
- inet_csk_reset_keepalive_timer(parent, interval);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+EXPORT_SYMBOL(reqsk_queue_hash_req);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -667,7 +716,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
{
struct sock *newsk = sk_clone_lock(sk, priority);
- if (newsk != NULL) {
+ if (newsk) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
newsk->sk_state = TCP_SYN_RECV;
@@ -679,6 +728,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newsk->sk_write_space = sk_stream_write_space;
newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
@@ -785,8 +836,6 @@ void inet_csk_listen_stop(struct sock *sk)
struct request_sock *acc_req;
struct request_sock *req;
- inet_csk_delete_keepalive_timer(sk);
-
/* make all the listen_opt local to us */
acc_req = reqsk_queue_yank_acceptq(queue);
@@ -816,9 +865,9 @@ void inet_csk_listen_stop(struct sock *sk)
percpu_counter_inc(sk->sk_prot->orphan_count);
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != tcp_rsk(req)->listener);
+ BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
* an inbound pkt destined for child is
@@ -827,7 +876,6 @@ void inet_csk_listen_stop(struct sock *sk)
* tcp_v4_destroy_sock().
*/
tcp_sk(child)->fastopen_rsk = NULL;
- sock_put(sk);
}
inet_csk_destroy_sock(child);
@@ -836,9 +884,9 @@ void inet_csk_listen_stop(struct sock *sk)
sock_put(child);
sk_acceptq_removed(sk);
- __reqsk_free(req);
+ reqsk_put(req);
}
- if (queue->fastopenq != NULL) {
+ if (queue->fastopenq) {
/* Free all the reqs queued in rskq_rst_head. */
spin_lock_bh(&queue->fastopenq->lock);
acc_req = queue->fastopenq->rskq_rst_head;
@@ -846,7 +894,7 @@ void inet_csk_listen_stop(struct sock *sk)
spin_unlock_bh(&queue->fastopenq->lock);
while ((req = acc_req) != NULL) {
acc_req = req->dl_next;
- __reqsk_free(req);
+ reqsk_put(req);
}
}
WARN_ON(sk->sk_ack_backlog);
@@ -870,7 +918,7 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_af_ops->compat_getsockopt != NULL)
+ if (icsk->icsk_af_ops->compat_getsockopt)
return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
optval, optlen);
return icsk->icsk_af_ops->getsockopt(sk, level, optname,
@@ -883,7 +931,7 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_af_ops->compat_setsockopt != NULL)
+ if (icsk->icsk_af_ops->compat_setsockopt)
return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
optval, optlen);
return icsk->icsk_af_ops->setsockopt(sk, level, optname,
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 592aff37366b..c3b1f3a0f4cf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -38,16 +38,12 @@
static const struct inet_diag_handler **inet_diag_table;
struct inet_diag_entry {
- __be32 *saddr;
- __be32 *daddr;
+ const __be32 *saddr;
+ const __be32 *daddr;
u16 sport;
u16 dport;
u16 family;
u16 userlocks;
-#if IS_ENABLED(CONFIG_IPV6)
- struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
- struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
-#endif
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -65,12 +61,35 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
return inet_diag_table[proto];
}
-static inline void inet_diag_unlock_handler(
- const struct inet_diag_handler *handler)
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
mutex_unlock(&inet_diag_table_mutex);
}
+static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+ r->idiag_family = sk->sk_family;
+
+ r->id.idiag_sport = htons(sk->sk_num);
+ r->id.idiag_dport = sk->sk_dport;
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = sk->sk_daddr;
+ }
+}
+
static size_t inet_sk_attr_size(void)
{
return nla_total_size(sizeof(struct tcp_info))
@@ -86,21 +105,22 @@ static size_t inet_sk_attr_size(void)
}
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ struct sk_buff *skb, const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
{
const struct inet_sock *inet = inet_sk(sk);
+ const struct tcp_congestion_ops *ca_ops;
+ const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
- const struct inet_diag_handler *handler;
- int ext = req->idiag_ext;
handler = inet_diag_table[req->sdiag_protocol];
- BUG_ON(handler == NULL);
+ BUG_ON(!handler);
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags);
@@ -108,25 +128,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
return -EMSGSIZE;
r = nlmsg_data(nlh);
- BUG_ON(sk->sk_state == TCP_TIME_WAIT);
+ BUG_ON(!sk_fullsock(sk));
- r->idiag_family = sk->sk_family;
+ inet_diag_msg_common_fill(r, sk);
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
- r->id.idiag_if = sk->sk_bound_dev_if;
- sock_diag_save_cookie(sk, r->id.idiag_cookie);
-
- r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = inet->inet_dport;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = inet->inet_rcv_saddr;
- r->id.idiag_dst[0] = inet->inet_daddr;
-
if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
goto errout;
@@ -139,14 +147,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
-
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
-
if (ext & (1 << (INET_DIAG_TCLASS - 1)))
if (nla_put_u8(skb, INET_DIAG_TCLASS,
inet6_sk(sk)->tclass) < 0)
goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
}
#endif
@@ -169,7 +177,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
goto errout;
- if (icsk == NULL) {
+ if (!icsk) {
handler->idiag_get_info(sk, r, NULL);
goto out;
}
@@ -196,25 +204,42 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
#undef EXPIRES_IN_MS
- if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
attr = nla_reserve(skb, INET_DIAG_INFO,
- sizeof(struct tcp_info));
+ handler->idiag_info_size);
if (!attr)
goto errout;
info = nla_data(attr);
}
- if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
- if (nla_put_string(skb, INET_DIAG_CONG,
- icsk->icsk_ca_ops->name) < 0)
+ if (ext & (1 << (INET_DIAG_CONG - 1))) {
+ int err = 0;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops)
+ err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+ rcu_read_unlock();
+ if (err < 0)
goto errout;
+ }
handler->idiag_get_info(sk, r, info);
- if (sk->sk_state < TCP_TIME_WAIT &&
- icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
- icsk->icsk_ca_ops->get_info(sk, ext, skb);
+ if (sk->sk_state < TCP_TIME_WAIT) {
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ext, &attr, &info);
+ rcu_read_unlock();
+ if (sz && nla_put(skb, attr, sz, &info) < 0)
+ goto errout;
+ }
out:
nlmsg_end(skb, nlh);
@@ -227,23 +252,25 @@ errout:
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ struct sk_buff *skb,
+ const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- return inet_sk_diag_fill(sk, inet_csk(sk),
- skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
+ user_ns, portid, seq, nlmsg_flags, unlh);
}
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
+static int inet_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- s32 tmo;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
+ long tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags);
@@ -253,25 +280,13 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_ttd - inet_tw_time_stamp();
+ tmo = tw->tw_timer.expires - jiffies;
if (tmo < 0)
tmo = 0;
- r->idiag_family = tw->tw_family;
+ inet_diag_msg_common_fill(r, sk);
r->idiag_retrans = 0;
- r->id.idiag_if = tw->tw_bound_dev_if;
- sock_diag_save_cookie(tw, r->id.idiag_cookie);
-
- r->id.idiag_sport = tw->tw_sport;
- r->id.idiag_dport = tw->tw_dport;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = tw->tw_rcv_saddr;
- r->id.idiag_dst[0] = tw->tw_daddr;
-
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
r->idiag_expires = jiffies_to_msecs(tmo);
@@ -279,61 +294,91 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
- }
-#endif
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+ r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
nlmsg_end(skb, nlh);
return 0;
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+ return inet_twsk_diag_fill(sk, skb, portid, seq,
nlmsg_flags, unlh);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return inet_req_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
nlmsg_flags, unlh);
}
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
{
- int err;
- struct sock *sk;
- struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
err = -EINVAL;
- if (req->sdiag_family == AF_INET) {
+ if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
- }
#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
+ else if (req->sdiag_family == AF_INET6)
sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
req->id.idiag_if);
- }
#endif
- else {
+ else
goto out_nosk;
- }
err = -ENOENT;
- if (sk == NULL)
+ if (!sk)
goto out_nosk;
err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
@@ -371,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
static int inet_diag_get_exact(struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
const struct inet_diag_handler *handler;
int err;
@@ -412,9 +457,8 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
return 1;
}
-
static int inet_diag_bc_run(const struct nlattr *_bc,
- const struct inet_diag_entry *entry)
+ const struct inet_diag_entry *entry)
{
const void *bc = nla_data(_bc);
int len = nla_len(_bc);
@@ -446,10 +490,10 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
break;
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND: {
- struct inet_diag_hostcond *cond;
- __be32 *addr;
+ const struct inet_diag_hostcond *cond;
+ const __be32 *addr;
- cond = (struct inet_diag_hostcond *)(op + 1);
+ cond = (const struct inet_diag_hostcond *)(op + 1);
if (cond->port != -1 &&
cond->port != (op->code == INET_DIAG_BC_S_COND ?
entry->sport : entry->dport)) {
@@ -498,29 +542,36 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
return len == 0;
}
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry->daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry->saddr = &sk->sk_rcv_saddr;
+ entry->daddr = &sk->sk_daddr;
+ }
+}
+
int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
{
- struct inet_diag_entry entry;
struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
- if (bc == NULL)
+ if (!bc)
return 1;
entry.family = sk->sk_family;
-#if IS_ENABLED(CONFIG_IPV6)
- if (entry.family == AF_INET6) {
-
- entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
- entry.daddr = sk->sk_v6_daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->inet_rcv_saddr;
- entry.daddr = &inet->inet_daddr;
- }
+ entry_fill_addrs(&entry, sk);
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
- entry.userlocks = sk->sk_userlocks;
+ entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
return inet_diag_bc_run(bc, &entry);
}
@@ -547,8 +598,8 @@ static int valid_cc(const void *bc, int len, int cc)
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
int *min_len)
{
- int addr_len;
struct inet_diag_hostcond *cond;
+ int addr_len;
/* Check hostcond space. */
*min_len += sizeof(struct inet_diag_hostcond);
@@ -582,8 +633,8 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
}
/* Validate a port comparison operator. */
-static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
- int len, int *min_len)
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
{
/* Port comparisons put the port in a follow-on inet_diag_bc_op. */
*min_len += sizeof(struct inet_diag_bc_op);
@@ -598,10 +649,9 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
int len = bytecode_len;
while (len > 0) {
- const struct inet_diag_bc_op *op = bc;
int min_len = sizeof(struct inet_diag_bc_op);
+ const struct inet_diag_bc_op *op = bc;
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
@@ -642,7 +692,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
if (!inet_diag_bc_sk(bc, sk))
@@ -654,139 +704,42 @@ static int inet_csk_diag_dump(struct sock *sk,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+static void twsk_build_assert(void)
{
- struct inet_timewait_sock *tw = inet_twsk(sk);
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
- if (bc != NULL) {
- struct inet_diag_entry entry;
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
- entry.family = tw->tw_family;
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == AF_INET6) {
- entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw->tw_v6_daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &tw->tw_rcv_saddr;
- entry.daddr = &tw->tw_daddr;
- }
- entry.sport = tw->tw_num;
- entry.dport = ntohs(tw->tw_dport);
- entry.userlocks = 0;
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
- if (!inet_diag_bc_run(bc, &entry))
- return 0;
- }
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
- return inet_twsk_diag_fill(tw, skb, r,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
-}
-
-/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
- * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
- */
-static inline void inet_diag_req_addrs(const struct sock *sk,
- const struct request_sock *req,
- struct inet_diag_entry *entry)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- if (req->rsk_ops->family == AF_INET6) {
- entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
- entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
- } else if (req->rsk_ops->family == AF_INET) {
- ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
- &entry->saddr_storage);
- ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
- &entry->daddr_storage);
- entry->saddr = entry->saddr_storage.s6_addr32;
- entry->daddr = entry->daddr_storage.s6_addr32;
- }
- } else
-#endif
- {
- entry->saddr = &ireq->ir_loc_addr;
- entry->daddr = &ireq->ir_rmt_addr;
- }
-}
-
-static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq,
- const struct nlmsghdr *unlh)
-{
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- NLM_F_MULTI);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- r->idiag_family = sk->sk_family;
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = req->num_retrans;
-
- r->id.idiag_if = sk->sk_bound_dev_if;
- sock_diag_save_cookie(req, r->id.idiag_cookie);
-
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = ireq->ir_rmt_port;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = ireq->ir_loc_addr;
- r->id.idiag_dst[0] = ireq->ir_rmt_addr;
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
- r->idiag_expires = jiffies_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
- r->idiag_inode = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- if (r->idiag_family == AF_INET6) {
- struct inet_diag_entry entry;
- inet_diag_req_addrs(sk, req, &entry);
- memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
- memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
- }
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
#endif
-
- nlmsg_end(skb, nlh);
- return 0;
}
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
- struct inet_diag_entry entry;
struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt;
struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
+ struct inet_diag_entry entry;
+ int j, s_j, reqnum, s_reqnum;
+ struct listen_sock *lopt;
int err = 0;
s_j = cb->args[3];
@@ -797,13 +750,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
entry.family = sk->sk_family;
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !lopt->qlen)
+ if (!lopt || !listen_sock_qlen(lopt))
goto out;
- if (bc != NULL) {
+ if (bc) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
@@ -822,17 +775,18 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
continue;
if (bc) {
- inet_diag_req_addrs(sk, req, &entry);
+ /* Note: entry.sport and entry.userlocks are already set */
+ entry_fill_addrs(&entry, req_to_sk(req));
entry.dport = ntohs(ireq->ir_rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
}
- err = inet_diag_fill_req(skb, sk, req,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, cb->nlh);
+ err = inet_req_diag_fill(req_to_sk(req), skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb->nlh);
if (err < 0) {
cb->args[3] = j + 1;
cb->args[4] = reqnum;
@@ -844,17 +798,17 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
}
out:
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return err;
}
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
- int i, num;
- int s_i, s_num;
struct net *net = sock_net(skb->sk);
+ int i, num, s_i, s_num;
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -864,9 +818,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_nulls_node *node;
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
@@ -883,7 +837,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
}
if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
+ sk->sk_family != r->sdiag_family)
goto next_listen;
if (r->id.idiag_sport != inet->inet_sport &&
@@ -931,8 +885,8 @@ skip_listen_ht:
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct sock *sk;
struct hlist_nulls_node *node;
+ struct sock *sk;
num = 0;
@@ -944,8 +898,7 @@ skip_listen_ht:
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int res;
- int state;
+ int state, res;
if (!net_eq(sock_net(sk), net))
continue;
@@ -964,10 +917,16 @@ skip_listen_ht:
if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
- if (sk->sk_state == TCP_TIME_WAIT)
- res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
- else
- res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_normal;
+
+ res = sk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
@@ -988,7 +947,8 @@ out:
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
{
const struct inet_diag_handler *handler;
int err = 0;
@@ -1005,8 +965,8 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct nlattr *bc = NULL;
if (nlmsg_attrlen(cb->nlh, hdrlen))
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
@@ -1014,7 +974,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
}
-static inline int inet_diag_type2proto(int type)
+static int inet_diag_type2proto(int type)
{
switch (type) {
case TCPDIAG_GETSOCK:
@@ -1026,12 +986,13 @@ static inline int inet_diag_type2proto(int type)
}
}
-static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_diag_dump_compat(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req;
struct nlattr *bc = NULL;
- int hdrlen = sizeof(struct inet_diag_req);
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@@ -1046,7 +1007,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh)
+ const struct nlmsghdr *nlh)
{
struct inet_diag_req *rc = nlmsg_data(nlh);
struct inet_diag_req_v2 req;
@@ -1075,7 +1036,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
attr = nlmsg_find_attr(nlh, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
+ if (!attr ||
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
@@ -1102,9 +1063,10 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
+
attr = nlmsg_find_attr(h, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
+ if (!attr ||
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
@@ -1120,14 +1082,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
+static
+int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
+{
+ const struct inet_diag_handler *handler;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ struct inet_diag_msg *r;
+ void *info = NULL;
+ int err = 0;
+
+ nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
+ if (!nlh)
+ return -ENOMEM;
+
+ r = nlmsg_data(nlh);
+ memset(r, 0, sizeof(*r));
+ inet_diag_msg_common_fill(r, sk);
+ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
+ r->id.idiag_sport = inet_sk(sk)->inet_sport;
+ r->idiag_state = sk->sk_state;
+
+ if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ handler = inet_diag_lock_handler(sk->sk_protocol);
+ if (IS_ERR(handler)) {
+ inet_diag_unlock_handler(handler);
+ nlmsg_cancel(skb, nlh);
+ return PTR_ERR(handler);
+ }
+
+ attr = handler->idiag_info_size
+ ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
+ : NULL;
+ if (attr)
+ info = nla_data(attr);
+
+ handler->idiag_get_info(sk, r, info);
+ inet_diag_unlock_handler(handler);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
int inet_diag_register(const struct inet_diag_handler *h)
@@ -1140,7 +1150,7 @@ int inet_diag_register(const struct inet_diag_handler *h)
mutex_lock(&inet_diag_table_mutex);
err = -EEXIST;
- if (inet_diag_table[type] == NULL) {
+ if (!inet_diag_table[type]) {
inet_diag_table[type] = h;
err = 0;
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e7920352646a..d0a7c0319e3d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -131,34 +131,22 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
unsigned int evicted = 0;
HLIST_HEAD(expired);
-evict_again:
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;
- if (!del_timer(&fq->timer)) {
- /* q expiring right now thus increment its refcount so
- * it won't be freed under us and wait until the timer
- * has finished executing then destroy it
- */
- atomic_inc(&fq->refcnt);
- spin_unlock(&hb->chain_lock);
- del_timer_sync(&fq->timer);
- inet_frag_put(fq, f);
- goto evict_again;
- }
+ if (!del_timer(&fq->timer))
+ continue;
- fq->flags |= INET_FRAG_EVICTED;
- hlist_del(&fq->list);
- hlist_add_head(&fq->list, &expired);
+ hlist_add_head(&fq->list_evictor, &expired);
++evicted;
}
spin_unlock(&hb->chain_lock);
- hlist_for_each_entry_safe(fq, n, &expired, list)
+ hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
f->frag_expire((unsigned long) fq);
return evicted;
@@ -240,18 +228,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
int i;
nf->low_thresh = 0;
- local_bh_disable();
evict_again:
+ local_bh_disable();
seq = read_seqbegin(&f->rnd_seqlock);
for (i = 0; i < INETFRAGS_HASHSZ ; i++)
inet_evict_bucket(f, &f->hash[i]);
- if (read_seqretry(&f->rnd_seqlock, seq))
- goto evict_again;
-
local_bh_enable();
+ cond_resched();
+
+ if (read_seqretry(&f->rnd_seqlock, seq) ||
+ percpu_counter_sum(&nf->mem))
+ goto evict_again;
percpu_counter_destroy(&nf->mem);
}
@@ -284,8 +274,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
struct inet_frag_bucket *hb;
hb = get_frag_bucket_locked(fq, f);
- if (!(fq->flags & INET_FRAG_EVICTED))
- hlist_del(&fq->list);
+ hlist_del(&fq->list);
+ fq->flags |= INET_FRAG_COMPLETE;
spin_unlock(&hb->chain_lock);
}
@@ -297,7 +287,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->flags |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -330,11 +319,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
fp = xp;
}
sum = sum_truesize + f->qsize;
- sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
+
+ sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
@@ -385,12 +375,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
}
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
- if (q == NULL)
+ if (!q)
return NULL;
q->net = nf;
f->constructor(q, arg);
- add_frag_mem_limit(q, f->qsize);
+ add_frag_mem_limit(nf, f->qsize);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
@@ -406,7 +396,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frag_queue *q;
q = inet_frag_alloc(nf, f, arg);
- if (q == NULL)
+ if (!q)
return NULL;
return inet_frag_intern(nf, q, f, arg);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9111a4e22155..0cb9165421d4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -18,15 +18,16 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
+#include <linux/vmalloc.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
-static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
static u32 inet_ehash_secret __read_mostly;
@@ -36,17 +37,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
inet_ehash_secret + net_hash_mix(net));
}
-
-static unsigned int inet_sk_ehashfn(const struct sock *sk)
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+u32 sk_ehashfn(const struct sock *sk)
{
- const struct inet_sock *inet = inet_sk(sk);
- const __be32 laddr = inet->inet_rcv_saddr;
- const __u16 lport = inet->inet_num;
- const __be32 faddr = inet->inet_daddr;
- const __be16 fport = inet->inet_dport;
- struct net *net = sock_net(sk);
-
- return inet_ehashfn(net, laddr, lport, faddr, fport);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return inet6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+ return inet_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
}
/*
@@ -60,8 +65,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
- if (tb != NULL) {
- write_pnet(&tb->ib_net, hold_net(net));
+ if (tb) {
+ write_pnet(&tb->ib_net, net);
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -79,7 +84,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
- release_net(ib_net(tb));
kmem_cache_free(cachep, tb);
}
}
@@ -87,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
- atomic_inc(&hashinfo->bsockets);
-
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
tb->num_owners++;
@@ -108,8 +108,6 @@ static void __inet_put_port(struct sock *sk)
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
- atomic_dec(&hashinfo->bsockets);
-
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
@@ -263,11 +261,19 @@ void sock_gen_put(struct sock *sk)
if (sk->sk_state == TCP_TIME_WAIT)
inet_twsk_free(inet_twsk(sk));
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk_free(inet_reqsk(sk));
else
sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);
+void sock_edemux(struct sk_buff *skb)
+{
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
@@ -377,7 +383,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, death_row);
+ inet_twsk_deschedule(tw);
inet_twsk_put(tw);
}
@@ -388,9 +394,10 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u32 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
+
return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
inet->inet_daddr,
inet->inet_dport);
@@ -400,13 +407,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
- spinlock_t *lock;
struct inet_ehash_bucket *head;
+ spinlock_t *lock;
int twrefcnt = 0;
WARN_ON(!sk_unhashed(sk));
- sk->sk_hash = inet_sk_ehashfn(sk);
+ sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -423,15 +430,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-static void __inet_hash(struct sock *sk)
+int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN) {
- __inet_hash_nolisten(sk, NULL);
- return;
- }
+ if (sk->sk_state != TCP_LISTEN)
+ return __inet_hash_nolisten(sk, tw);
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -440,13 +445,15 @@ static void __inet_hash(struct sock *sk)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
+ return 0;
}
+EXPORT_SYMBOL(__inet_hash);
void inet_hash(struct sock *sk)
{
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk);
+ __inet_hash(sk, NULL);
local_bh_enable();
}
}
@@ -477,8 +484,7 @@ EXPORT_SYMBOL_GPL(inet_unhash);
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
- struct sock *, __u16, struct inet_timewait_sock **),
- int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
+ struct sock *, __u16, struct inet_timewait_sock **))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
const unsigned short snum = inet_sk(sk)->inet_num;
@@ -497,8 +503,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
+ /* By starting with offset being an even number,
+ * we tend to leave about 50% of ports for other uses,
+ * like bind(0).
+ */
+ offset &= ~1;
+
local_bh_disable();
- for (i = 1; i <= remaining; i++) {
+ for (i = 0; i < remaining; i++) {
port = low + (i + offset) % remaining;
if (inet_is_local_reserved_port(net, port))
continue;
@@ -542,20 +554,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL;
ok:
- hint += i;
+ hint += (i + 2) & ~1;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += hash(sk, tw);
+ twrefcnt += __inet_hash_nolisten(sk, tw);
}
if (tw)
twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
if (tw) {
- inet_twsk_deschedule(tw, death_row);
+ inet_twsk_deschedule(tw);
while (twrefcnt) {
twrefcnt--;
inet_twsk_put(tw);
@@ -570,7 +582,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- hash(sk, NULL);
+ __inet_hash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -589,8 +601,12 @@ out:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
- __inet_check_established, __inet_hash_nolisten);
+ u32 port_offset = 0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet_sk_port_offset(sk);
+ return __inet_hash_connect(death_row, sk, port_offset,
+ __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
@@ -598,7 +614,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
{
int i;
- atomic_set(&h->bsockets, 0);
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
@@ -606,3 +621,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+ unsigned int locksz = sizeof(spinlock_t);
+ unsigned int i, nblocks = 1;
+
+ if (locksz != 0) {
+ /* allocate 2 cache lines or at least one spinlock per cpu */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+ nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+ /* no more locks than number of hash buckets */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!hashinfo->ehash_locks)
+ hashinfo->ehash_locks = vmalloc(nblocks * locksz);
+
+ if (!hashinfo->ehash_locks)
+ return -ENOMEM;
+
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&hashinfo->ehash_locks[i]);
+ }
+ hashinfo->ehash_locks_mask = nblocks - 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8555fb..2ffbd16b79e0 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
}
/* Must be called with locally disabled BHs. */
-static void __inet_twsk_kill(struct inet_timewait_sock *tw,
- struct inet_hashinfo *hashinfo)
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
struct inet_bind_hashbucket *bhead;
int refcnt;
/* Unlink from established hashes. */
@@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
atomic_sub(refcnt, &tw->tw_refcnt);
+ atomic_dec(&tw->tw_dr->tw_count);
+ inet_twsk_put(tw);
}
void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -98,7 +100,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw)
#ifdef SOCK_REFCNT_DEBUG
pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
#endif
- release_net(twsk_net(tw));
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
@@ -169,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
-struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+static void tw_timer_handler(unsigned long data)
{
- struct inet_timewait_sock *tw =
- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
- GFP_ATOMIC);
- if (tw != NULL) {
+ struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+
+ if (tw->tw_kill)
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+ else
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+ inet_twsk_kill(tw);
+}
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+ struct inet_timewait_death_row *dr,
+ const int state)
+{
+ struct inet_timewait_sock *tw;
+
+ if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ return NULL;
+
+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ GFP_ATOMIC);
+ if (tw) {
const struct inet_sock *inet = inet_sk(sk);
kmemcheck_annotate_bitfield(tw, flags);
+ tw->tw_dr = dr;
/* Give us an identity. */
tw->tw_daddr = inet->inet_daddr;
tw->tw_rcv_saddr = inet->inet_rcv_saddr;
@@ -195,14 +214,16 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_ipv6only = 0;
tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
- twsk_net_set(tw, hold_net(sock_net(sk)));
+ atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+ twsk_net_set(tw, sock_net(sk));
+ setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
* timewait socket.
*/
atomic_set(&tw->tw_refcnt, 0);
- inet_twsk_dead_node_init(tw);
+
__module_get(tw->tw_prot->owner);
}
@@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
}
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
-/* Returns non-zero if quota exceeded. */
-static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
- const int slot)
-{
- struct inet_timewait_sock *tw;
- unsigned int killed;
- int ret;
-
- /* NOTE: compare this to previous version where lock
- * was released after detaching chain. It was racy,
- * because tw buckets are scheduled in not serialized context
- * in 2.3 (with netfilter), and with softnet it is common, because
- * soft irqs are not sequenced.
- */
- killed = 0;
- ret = 0;
-rescan:
- inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
- __inet_twsk_del_dead_node(tw);
- spin_unlock(&twdr->death_lock);
- __inet_twsk_kill(tw, twdr->hashinfo);
-#ifdef CONFIG_NET_NS
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
-#endif
- inet_twsk_put(tw);
- killed++;
- spin_lock(&twdr->death_lock);
- if (killed > INET_TWDR_TWKILL_QUOTA) {
- ret = 1;
- break;
- }
-
- /* While we dropped twdr->death_lock, another cpu may have
- * killed off the next TW bucket in the list, therefore
- * do a fresh re-read of the hlist head node with the
- * lock reacquired. We still use the hlist traversal
- * macro in order to get the prefetches.
- */
- goto rescan;
- }
-
- twdr->tw_count -= killed;
-#ifndef CONFIG_NET_NS
- NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
-#endif
- return ret;
-}
-
-void inet_twdr_hangman(unsigned long data)
-{
- struct inet_timewait_death_row *twdr;
- unsigned int need_timer;
-
- twdr = (struct inet_timewait_death_row *)data;
- spin_lock(&twdr->death_lock);
-
- if (twdr->tw_count == 0)
- goto out;
-
- need_timer = 0;
- if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
- twdr->thread_slots |= (1 << twdr->slot);
- schedule_work(&twdr->twkill_work);
- need_timer = 1;
- } else {
- /* We purged the entire slot, anything left? */
- if (twdr->tw_count)
- need_timer = 1;
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
- }
- if (need_timer)
- mod_timer(&twdr->tw_timer, jiffies + twdr->period);
-out:
- spin_unlock(&twdr->death_lock);
-}
-EXPORT_SYMBOL_GPL(inet_twdr_hangman);
-
-void inet_twdr_twkill_work(struct work_struct *work)
-{
- struct inet_timewait_death_row *twdr =
- container_of(work, struct inet_timewait_death_row, twkill_work);
- int i;
-
- BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
- (sizeof(twdr->thread_slots) * 8));
-
- while (twdr->thread_slots) {
- spin_lock_bh(&twdr->death_lock);
- for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
- if (!(twdr->thread_slots & (1 << i)))
- continue;
-
- while (inet_twdr_do_twkill_work(twdr, i) != 0) {
- if (need_resched()) {
- spin_unlock_bh(&twdr->death_lock);
- schedule();
- spin_lock_bh(&twdr->death_lock);
- }
- }
-
- twdr->thread_slots &= ~(1 << i);
- }
- spin_unlock_bh(&twdr->death_lock);
- }
-}
-EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
-
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
*/
/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw,
- struct inet_timewait_death_row *twdr)
+void inet_twsk_deschedule(struct inet_timewait_sock *tw)
{
- spin_lock(&twdr->death_lock);
- if (inet_twsk_del_dead_node(tw)) {
- inet_twsk_put(tw);
- if (--twdr->tw_count == 0)
- del_timer(&twdr->tw_timer);
- }
- spin_unlock(&twdr->death_lock);
- __inet_twsk_kill(tw, twdr->hashinfo);
+ if (del_timer_sync(&tw->tw_timer))
+ inet_twsk_kill(tw);
}
EXPORT_SYMBOL(inet_twsk_deschedule);
-void inet_twsk_schedule(struct inet_timewait_sock *tw,
- struct inet_timewait_death_row *twdr,
- const int timeo, const int timewait_len)
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
{
- struct hlist_head *list;
- int slot;
-
/* timeout := RTO * 3.5
*
* 3.5 = 1+2+0.5 to wait for two retransmits.
@@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
* is greater than TS tick!) and detect old duplicates with help
* of PAWS.
*/
- slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
- spin_lock(&twdr->death_lock);
-
- /* Unlink it, if it was scheduled */
- if (inet_twsk_del_dead_node(tw))
- twdr->tw_count--;
- else
+ tw->tw_kill = timeo <= 4*HZ;
+ if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
atomic_inc(&tw->tw_refcnt);
-
- if (slot >= INET_TWDR_RECYCLE_SLOTS) {
- /* Schedule to slow timer */
- if (timeo >= timewait_len) {
- slot = INET_TWDR_TWKILL_SLOTS - 1;
- } else {
- slot = DIV_ROUND_UP(timeo, twdr->period);
- if (slot >= INET_TWDR_TWKILL_SLOTS)
- slot = INET_TWDR_TWKILL_SLOTS - 1;
- }
- tw->tw_ttd = inet_tw_time_stamp() + timeo;
- slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
- list = &twdr->cells[slot];
- } else {
- tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
-
- if (twdr->twcal_hand < 0) {
- twdr->twcal_hand = 0;
- twdr->twcal_jiffie = jiffies;
- twdr->twcal_timer.expires = twdr->twcal_jiffie +
- (slot << INET_TWDR_RECYCLE_TICK);
- add_timer(&twdr->twcal_timer);
- } else {
- if (time_after(twdr->twcal_timer.expires,
- jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
- mod_timer(&twdr->twcal_timer,
- jiffies + (slot << INET_TWDR_RECYCLE_TICK));
- slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
- }
- list = &twdr->twcal_row[slot];
+ atomic_inc(&tw->tw_dr->tw_count);
}
-
- hlist_add_head(&tw->tw_death_node, list);
-
- if (twdr->tw_count++ == 0)
- mod_timer(&twdr->tw_timer, jiffies + twdr->period);
- spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twsk_schedule);
-void inet_twdr_twcal_tick(unsigned long data)
-{
- struct inet_timewait_death_row *twdr;
- int n, slot;
- unsigned long j;
- unsigned long now = jiffies;
- int killed = 0;
- int adv = 0;
-
- twdr = (struct inet_timewait_death_row *)data;
-
- spin_lock(&twdr->death_lock);
- if (twdr->twcal_hand < 0)
- goto out;
-
- slot = twdr->twcal_hand;
- j = twdr->twcal_jiffie;
-
- for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
- if (time_before_eq(j, now)) {
- struct hlist_node *safe;
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each_inmate_safe(tw, safe,
- &twdr->twcal_row[slot]) {
- __inet_twsk_del_dead_node(tw);
- __inet_twsk_kill(tw, twdr->hashinfo);
-#ifdef CONFIG_NET_NS
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
-#endif
- inet_twsk_put(tw);
- killed++;
- }
- } else {
- if (!adv) {
- adv = 1;
- twdr->twcal_jiffie = j;
- twdr->twcal_hand = slot;
- }
-
- if (!hlist_empty(&twdr->twcal_row[slot])) {
- mod_timer(&twdr->twcal_timer, j);
- goto out;
- }
- }
- j += 1 << INET_TWDR_RECYCLE_TICK;
- slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
- }
- twdr->twcal_hand = -1;
-
-out:
- if ((twdr->tw_count -= killed) == 0)
- del_timer(&twdr->tw_timer);
-#ifndef CONFIG_NET_NS
- NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
-#endif
- spin_unlock(&twdr->death_lock);
-}
-EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
-
void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
{
@@ -487,6 +289,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
restart_rcu:
+ cond_resched();
rcu_read_lock();
restart:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
@@ -508,7 +311,7 @@ restart:
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule(tw, twdr);
+ inet_twsk_deschedule(tw);
local_bh_enable();
inet_twsk_put(tw);
goto restart_rcu;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index d9bc28ac5d1b..2d3aa408fbdc 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,17 +39,21 @@
#include <net/route.h>
#include <net/xfrm.h>
-static bool ip_may_fragment(const struct sk_buff *skb)
-{
- return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
- skb->ignore_df;
-}
-
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
+ if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0))
+ return false;
+
+ /* original fragment exceeds mtu and DF is set */
+ if (unlikely(IPCB(skb)->frag_max_size > mtu))
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
return false;
@@ -57,7 +61,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
}
-static int ip_forward_finish(struct sk_buff *skb)
+static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
@@ -68,7 +72,7 @@ static int ip_forward_finish(struct sk_buff *skb)
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output(skb);
+ return dst_output_sk(sk, skb);
}
int ip_forward(struct sk_buff *skb)
@@ -82,6 +86,9 @@ int ip_forward(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
goto drop;
+ if (unlikely(skb->sk))
+ goto drop;
+
if (skb_warn_if_lro(skb))
goto drop;
@@ -111,7 +118,7 @@ int ip_forward(struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
+ if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
@@ -136,8 +143,8 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
- rt->dst.dev, ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, rt->dst.dev, ip_forward_finish);
sr_failed:
/*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 145a50c4d566..921138f6c97c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -75,6 +75,7 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
unsigned int rid;
struct inet_peer *peer;
@@ -173,6 +174,15 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
+static bool frag_expire_skip_icmp(u32 user)
+{
+ return user == IP_DEFRAG_AF_PACKET ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+ __IP_DEFRAG_CONNTRACK_IN_END) ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
+ __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
+}
+
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -192,7 +202,7 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+ if (!inet_frag_evicting(&qp->q)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
@@ -217,10 +227,8 @@ static void ip_expire(unsigned long arg)
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_AF_PACKET ||
- ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
- (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
- (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ if (frag_expire_skip_icmp(qp->user) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
@@ -301,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp)
kfree_skb(fp);
fp = xp;
} while (fp);
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
@@ -319,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
+ unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
@@ -342,7 +351,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
ihl = ip_hdrlen(skb);
/* Determine the position of this fragment. */
- end = offset + skb->len - ihl;
+ end = offset + skb->len - skb_network_offset(skb) - ihl;
err = -EINVAL;
/* Is this the final fragment? */
@@ -372,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
goto err;
err = -ENOMEM;
- if (pskb_pull(skb, ihl) == NULL)
+ if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
goto err;
err = pskb_trim_rcsum(skb, end - offset);
@@ -446,7 +455,7 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- sub_frag_mem_limit(&qp->q, free_it->truesize);
+ sub_frag_mem_limit(qp->q.net, free_it->truesize);
kfree_skb(free_it);
}
}
@@ -470,13 +479,18 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(&qp->q, skb->truesize);
+ add_frag_mem_limit(qp->q.net, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
+ fragsize = skb->len + ihl;
+
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len + ihl > qp->q.max_size)
- qp->q.max_size = skb->len + ihl;
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
@@ -537,7 +551,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
qp->q.fragments = head;
}
- WARN_ON(head == NULL);
+ WARN_ON(!head);
WARN_ON(FRAG_CB(head)->offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -559,7 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct sk_buff *clone;
int i, plen = 0;
- if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
goto out_nomem;
clone->next = head->next;
head->next = clone;
@@ -572,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(&qp->q, clone->truesize);
+ add_frag_mem_limit(qp->q.net, clone->truesize);
}
skb_push(head, head->data - skb_network_header(head));
@@ -600,18 +615,34 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
fp = next;
}
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = qp->q.max_size;
+ IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(head);
- /* max_size != 0 implies at least one fragment had IP_DF set */
- iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
+
+ /* When we set IP_DF on a refragmented skb we must also force a
+ * call to ip_fragment to avoid forwarding a DF-skb of size s while
+ * original sender only sent fragments of size f (where f < s).
+ *
+ * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+ * frag seen to avoid sending tiny DF-fragments in case skb was built
+ * from one very small df-fragment and one large non-df frag.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
+ }
+
+ ip_send_check(iph);
+
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
@@ -638,7 +669,8 @@ int ip_defrag(struct sk_buff *skb, u32 user)
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
- if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
+ qp = ip_find(net, ip_hdr(skb), user);
+ if (qp) {
int ret;
spin_lock(&qp->q.lock);
@@ -754,7 +786,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table = ip4_frags_ns_ctl_table;
if (!net_eq(net, &init_net)) {
table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
- if (table == NULL)
+ if (!table)
goto err_alloc;
table[0].data = &net->ipv4.frags.high_thresh;
@@ -770,7 +802,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
}
hdr = register_net_sysctl(net, "net/ipv4", table);
- if (hdr == NULL)
+ if (!hdr)
goto err_reg;
net->ipv4.frags_hdr = hdr;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6207275fc749..5fd706473c73 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -182,7 +182,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->daddr, iph->saddr, tpi->key);
- if (t == NULL)
+ if (!t)
return PACKET_REJECT;
if (t->parms.iph.daddr == 0 ||
@@ -423,7 +423,7 @@ static int ipgre_open(struct net_device *dev)
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
ip_rt_put(rt);
- if (__in_dev_get_rtnl(dev) == NULL)
+ if (!__in_dev_get_rtnl(dev))
return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
@@ -456,6 +456,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
};
#define GRE_FEATURES (NETIF_F_SG | \
@@ -621,10 +622,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
if (data[IFLA_GRE_LOCAL])
- parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
if (data[IFLA_GRE_REMOTE])
- parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
if (data[IFLA_GRE_TTL])
parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
@@ -686,6 +687,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -776,8 +778,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
- nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
- nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
nla_put_u8(skb, IFLA_GRE_PMTUDISC,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3d4da2c16b6a..2db4c8773c1b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -187,7 +187,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct sk_buff *skb)
+static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
@@ -203,7 +203,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
raw = raw_local_deliver(skb, protocol);
ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot != NULL) {
+ if (ipprot) {
int ret;
if (!ipprot->no_policy) {
@@ -253,7 +253,8 @@ int ip_local_deliver(struct sk_buff *skb)
return 0;
}
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
+ skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -309,12 +310,12 @@ drop:
int sysctl_ip_early_demux __read_mostly = 1;
EXPORT_SYMBOL(sysctl_ip_early_demux);
-static int ip_rcv_finish(struct sk_buff *skb)
+static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+ if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
@@ -387,7 +388,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -450,7 +452,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ dev, NULL,
ip_rcv_finish);
csum_error:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 5b3d91be2db0..bd246792360b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -264,7 +264,7 @@ int ip_options_compile(struct net *net,
unsigned char *iph;
int optlen, l;
- if (skb != NULL) {
+ if (skb) {
rt = skb_rtable(skb);
optptr = (unsigned char *)&(ip_hdr(skb)[1]);
} else
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a7aea2048a0d..6bf89a6312bc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,6 +83,10 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct sock *, struct sk_buff *));
+
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
@@ -91,14 +95,19 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-int __ip_local_out(struct sk_buff *skb)
+static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
- skb_dst(skb)->dev, dst_output);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
+ skb_dst(skb)->dev, dst_output_sk);
+}
+
+int __ip_local_out(struct sk_buff *skb)
+{
+ return __ip_local_out_sk(skb->sk, skb);
}
int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
@@ -148,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(skb, sk);
+ ip_select_ident(sock_net(sk), skb, sk);
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -163,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static inline int ip_finish_output2(struct sk_buff *skb)
+static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
@@ -182,7 +191,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
- if (skb2 == NULL) {
+ if (!skb2) {
kfree_skb(skb);
return -ENOMEM;
}
@@ -211,7 +220,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sk_buff *skb)
+static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -219,8 +229,8 @@ static int ip_finish_output_gso(struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
- return ip_finish_output2(skb);
+ skb_gso_network_seglen(skb) <= mtu)
+ return ip_finish_output2(sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -243,7 +253,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
int err;
segs->next = NULL;
- err = ip_fragment(segs, ip_finish_output2);
+ err = ip_fragment(sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -253,22 +263,25 @@ static int ip_finish_output_gso(struct sk_buff *skb)
return ret;
}
-static int ip_finish_output(struct sk_buff *skb)
+static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
- if (skb_dst(skb)->xfrm != NULL) {
+ if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
+ return dst_output_sk(sk, skb);
}
#endif
+ mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(skb);
+ return ip_finish_output_gso(sk, skb, mtu);
- if (skb->len > ip_skb_dst_mtu(skb))
- return ip_fragment(skb, ip_finish_output2);
+ if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ return ip_fragment(sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(skb);
+ return ip_finish_output2(sk, skb);
}
int ip_mc_output(struct sock *sk, struct sk_buff *skb)
@@ -307,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- newskb, NULL, newskb->dev,
+ sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
}
@@ -322,11 +335,11 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
NULL, newskb->dev, dev_loopback_xmit);
}
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
skb->dev, ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -340,7 +353,8 @@ int ip_output(struct sock *sk, struct sk_buff *skb)
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
+ NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -376,12 +390,12 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
rt = skb_rtable(skb);
- if (rt != NULL)
+ if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
- if (rt == NULL) {
+ if (!rt) {
__be32 daddr;
/* Use correct destination address if we have options. */
@@ -430,7 +444,8 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
+ ip_select_ident_segs(sock_net(sk), skb, sk,
+ skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
@@ -448,7 +463,6 @@ no_route:
}
EXPORT_SYMBOL(ip_queue_xmit);
-
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
to->pkt_type = from->pkt_type;
@@ -472,6 +486,31 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(sk, skb, output);
+}
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -479,7 +518,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -500,15 +540,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
@@ -516,10 +549,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge)
- mtu -= nf_bridge_mtu_reduction(skb);
-#endif
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
@@ -586,13 +615,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
ip_options_fragment(frag);
offset += skb->len - hlen;
iph->frag_off = htons(offset>>3);
- if (frag->next != NULL)
+ if (frag->next)
iph->frag_off |= htons(IP_MF);
/* Ready, complete checksum */
ip_send_check(iph);
}
- err = output(skb);
+ err = output(sk, skb);
if (!err)
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
@@ -636,10 +665,7 @@ slow_path:
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
- /* for bridged IP traffic encapsulated inside f.e. a vlan header,
- * we need to make room for the encapsulating header
- */
- ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
+ ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
/*
* Fragment the datagram.
@@ -707,6 +733,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -732,7 +761,7 @@ slow_path:
ip_send_check(iph);
- err = output(skb2);
+ err = output(sk, skb2);
if (err)
goto fail;
@@ -747,7 +776,7 @@ fail:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
@@ -792,12 +821,13 @@ static inline int ip_ufo_append_data(struct sock *sk,
* device, so create one single skb packet containing complete
* udp datagram
*/
- if ((skb = skb_peek_tail(queue)) == NULL) {
+ skb = skb_peek_tail(queue);
+ if (!skb) {
skb = sock_alloc_send_skb(sk,
hh_len + fragheaderlen + transhdrlen + 20,
(flags & MSG_DONTWAIT), &err);
- if (skb == NULL)
+ if (!skb)
return err;
/* reserve space for Hardware header */
@@ -814,7 +844,6 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
-
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -963,10 +992,10 @@ alloc_new_skb:
skb = sock_wmalloc(sk,
alloclen + hh_len + 15, 1,
sk->sk_allocation);
- if (unlikely(skb == NULL))
+ if (unlikely(!skb))
err = -ENOBUFS;
}
- if (skb == NULL)
+ if (!skb)
goto error;
/*
@@ -1090,10 +1119,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
*/
opt = ipc->opt;
if (opt) {
- if (cork->opt == NULL) {
+ if (!cork->opt) {
cork->opt = kmalloc(sizeof(struct ip_options) + 40,
sk->sk_allocation);
- if (unlikely(cork->opt == NULL))
+ if (unlikely(!cork->opt))
return -ENOBUFS;
}
memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
@@ -1200,7 +1229,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EMSGSIZE;
}
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ skb = skb_peek_tail(&sk->sk_write_queue);
+ if (!skb)
return -EINVAL;
cork->length += size;
@@ -1211,13 +1241,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
}
-
while (size > 0) {
- int i;
-
- if (skb_is_gso(skb))
+ if (skb_is_gso(skb)) {
len = size;
- else {
+ } else {
/* Check if the remaining data fits into current packet. */
len = mtu - skb->len;
@@ -1269,15 +1296,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
continue;
}
- i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, len);
- } else {
+
+ if (skb_append_pagefrags(skb, page, offset, len)) {
err = -EMSGSIZE;
goto error;
}
@@ -1331,7 +1353,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
__be16 df = 0;
__u8 ttl;
- if ((skb = __skb_dequeue(queue)) == NULL)
+ skb = __skb_dequeue(queue);
+ if (!skb)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
@@ -1382,7 +1405,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
- ip_select_ident(skb, sk);
+ ip_select_ident(net, skb, sk);
if (opt) {
iph->ihl += opt->optlen>>2;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5cd99271d3a6..c3c359ad66e3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -351,7 +351,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0;
}
}
- if (new_ra == NULL) {
+ if (!new_ra) {
spin_unlock_bh(&ip_ra_lock);
return -ENOBUFS;
}
@@ -387,7 +387,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
skb_network_header(skb);
serr->port = port;
- if (skb_pull(skb, payload - skb->data) != NULL) {
+ if (skb_pull(skb, payload - skb->data)) {
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb) == 0)
return;
@@ -432,6 +432,15 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
kfree_skb(skb);
}
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
/* IPv4 supports cmsg on all imcp errors and some timestamps
*
* Timestamp code paths do not initialize the fields expected by cmsg:
@@ -482,7 +491,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
err = -EAGAIN;
skb = sock_dequeue_err_skb(sk);
- if (skb == NULL)
+ if (!skb)
goto out;
copied = skb->len;
@@ -498,7 +507,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
serr = SKB_EXT_ERR(skb);
- if (sin && serr->port) {
+ if (sin && ipv4_datagram_support_addr(serr)) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
serr->addr_offset);
@@ -536,12 +545,34 @@ out:
* Socket option code for IP. This is the end of the line after any
* TCP,UDP etc options on an IP socket.
*/
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_MSFILTER:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_UNBLOCK_SOURCE:
+ return true;
+ }
+ return false;
+}
static int do_ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
int val = 0, err;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
switch (optname) {
case IP_PKTINFO:
@@ -560,6 +591,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TRANSPARENT:
case IP_MINTTL:
case IP_NODEFRAG:
+ case IP_BIND_ADDRESS_NO_PORT:
case IP_UNICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_ALL:
@@ -584,6 +616,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
return ip_mroute_setsockopt(sk, optname, optval, optlen);
err = 0;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -708,6 +742,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
inet->nodefrag = val ? 1 : 0;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet->bind_address_no_port = val ? 1 : 0;
+ break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
@@ -1118,10 +1155,14 @@ mc_msf_out:
break;
}
release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return err;
e_inval:
release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return -EINVAL;
}
@@ -1296,6 +1337,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_NODEFRAG:
val = inet->nodefrag;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet->bind_address_no_port;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 2cd08280c77b..626d9e56a6bd 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -389,7 +389,6 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
hlen = tdev->hard_header_len + tdev->needed_headroom;
mtu = tdev->mtu;
}
- dev->iflink = tunnel->parms.link;
dev->needed_headroom = t_hlen + hlen;
mtu -= (dev->hard_header_len + t_hlen);
@@ -587,7 +586,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
EXPORT_SYMBOL(ip_tunnel_encap);
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
- struct rtable *rt, __be16 df)
+ struct rtable *rt, __be16 df,
+ const struct iphdr *inner_iph)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
@@ -604,7 +604,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
if (skb->protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) &&
- (df & htons(IP_DF)) && mtu < pkt_size) {
+ (inner_iph->frag_off & htons(IP_DF)) &&
+ mtu < pkt_size) {
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
return -E2BIG;
@@ -655,7 +656,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (dst == 0) {
/* NBMA tunnel */
- if (skb_dst(skb) == NULL) {
+ if (!skb_dst(skb)) {
dev->stats.tx_fifo_errors++;
goto tx_error;
}
@@ -673,7 +674,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
neigh = dst_neigh_lookup(skb_dst(skb),
&ipv6_hdr(skb)->daddr);
- if (neigh == NULL)
+ if (!neigh)
goto tx_error;
addr6 = (const struct in6_addr *)&neigh->primary_key;
@@ -738,7 +739,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -783,7 +784,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
return;
}
- err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
+ err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
@@ -844,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
case SIOCGETTUNNEL:
if (dev == itn->fb_tunnel_dev) {
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
- if (t == NULL)
+ if (!t)
t = netdev_priv(dev);
}
memcpy(p, &t->parms, sizeof(*p));
@@ -877,7 +878,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
break;
}
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
+ if (t) {
if (t->dev != dev) {
err = -EEXIST;
break;
@@ -915,7 +916,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
if (dev == itn->fb_tunnel_dev) {
err = -ENOENT;
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
- if (t == NULL)
+ if (!t)
goto done;
err = -EPERM;
if (t == netdev_priv(itn->fb_tunnel_dev))
@@ -980,6 +981,14 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev)
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
+int ip_tunnel_get_iflink(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return tunnel->parms.link;
+}
+EXPORT_SYMBOL(ip_tunnel_get_iflink);
+
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 88c386cf7d85..6a51a71a6c67 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -74,7 +74,8 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(dev_net(rt->dst.dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
err = ip_local_out_sk(sk, skb);
if (unlikely(net_xmit_eval(err)))
@@ -97,7 +98,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
return -ENOMEM;
eh = (struct ethhdr *)skb->data;
- if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ if (likely(eth_proto_is_802_3(eh->h_proto)))
skb->protocol = eh->h_proto;
else
skb->protocol = htons(ETH_P_802_2);
@@ -164,6 +165,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
{
int i;
+ netdev_stats_to_stats64(tot, &dev->stats);
+
for_each_possible_cpu(i) {
const struct pcpu_sw_netstats *tstats =
per_cpu_ptr(dev->tstats, i);
@@ -184,22 +187,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->tx_bytes += tx_bytes;
}
- tot->multicast = dev->stats.multicast;
-
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- tot->collisions = dev->stats.collisions;
-
return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 94efe148181c..0c152087ca15 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -60,12 +60,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->saddr, iph->daddr, 0);
- if (tunnel != NULL) {
+ if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
- skb->mark = be32_to_cpu(tunnel->parms.i_key);
return xfrm_input(skb, nexthdr, spi, encap_type);
}
@@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+ u32 orig_mark = skb->mark;
+ int ret;
if (!tunnel)
return 1;
@@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
x = xfrm_input_state(skb);
family = x->inner_mode->afinfo->family;
- if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+ skb->mark = be32_to_cpu(tunnel->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
return -EPERM;
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
@@ -216,8 +221,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
memset(&fl, 0, sizeof(fl));
- skb->mark = be32_to_cpu(tunnel->parms.o_key);
-
switch (skb->protocol) {
case htons(ETH_P_IP):
xfrm_decode_session(skb, &fl, AF_INET);
@@ -233,6 +236,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
+
return vti_xmit(skb, dev, &fl);
}
@@ -341,6 +347,7 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_do_ioctl = vti_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
};
static void vti_tunnel_setup(struct net_device *dev)
@@ -361,7 +368,6 @@ static int vti_tunnel_init(struct net_device *dev)
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
dev->mtu = ETH_DATA_LEN;
dev->flags = IFF_NOARP;
- dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
netif_keep_dst(dev);
@@ -456,10 +462,10 @@ static void vti_netlink_parms(struct nlattr *data[],
parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
if (data[IFLA_VTI_LOCAL])
- parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
if (data[IFLA_VTI_REMOTE])
- parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
}
@@ -505,8 +511,8 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u32(skb, IFLA_VTI_LINK, p->link);
nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
- nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
- nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+ nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr);
+ nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr);
return 0;
}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index c0855d50a3fa..d97f4f2787f5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -63,7 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
struct xfrm_state *t;
t = xfrm_state_alloc(net);
- if (t == NULL)
+ if (!t)
goto out;
t->id.proto = IPPROTO_IPIP;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b26376ef87f6..8e7328c6a390 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -504,7 +504,8 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (!net_eq(dev_net(dev), &init_net))
goto drop;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
if (!pskb_may_pull(skb, sizeof(struct arphdr)))
@@ -958,7 +959,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
if (!pskb_may_pull(skb,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 915d215a7d14..254238daf58b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -144,7 +144,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
err = -ENOENT;
t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->daddr, iph->saddr, 0);
- if (t == NULL)
+ if (!t)
goto out;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
@@ -251,7 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EINVAL;
}
- p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
+ p.i_key = p.o_key = 0;
+ p.i_flags = p.o_flags = 0;
if (p.iph.ttl)
p.iph.frag_off |= htons(IP_DF);
@@ -272,6 +273,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
};
#define IPIP_FEATURES (NETIF_F_SG | \
@@ -286,7 +288,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->type = ARPHRD_TUNNEL;
dev->flags = IFF_NOARP;
- dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
netif_keep_dst(dev);
@@ -325,10 +326,10 @@ static void ipip_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
if (data[IFLA_IPTUN_LOCAL])
- parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
if (data[IFLA_IPTUN_REMOTE])
- parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
if (data[IFLA_IPTUN_TTL]) {
parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
@@ -450,8 +451,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel_parm *parm = &tunnel->parms;
if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
- nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
- nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fe54eba6d00d..3a2c0162c3ba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -73,9 +73,7 @@
struct mr_table {
struct list_head list;
-#ifdef CONFIG_NET_NS
- struct net *net;
-#endif
+ possible_net_t net;
u32 id;
struct sock __rcu *mroute_sk;
struct timer_list ipmr_expire_timer;
@@ -191,7 +189,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
}
mrt = ipmr_get_table(rule->fr_net, rule->table);
- if (mrt == NULL)
+ if (!mrt)
return -EAGAIN;
res->mrt = mrt;
return 0;
@@ -255,7 +253,7 @@ static int __net_init ipmr_rules_init(struct net *net)
INIT_LIST_HEAD(&net->ipv4.mr_tables);
mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
- if (mrt == NULL) {
+ if (!mrt) {
err = -ENOMEM;
goto err1;
}
@@ -323,11 +321,11 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
unsigned int i;
mrt = ipmr_get_table(net, id);
- if (mrt != NULL)
+ if (mrt)
return mrt;
mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
- if (mrt == NULL)
+ if (!mrt)
return NULL;
write_pnet(&mrt->net, net);
mrt->id = id;
@@ -429,7 +427,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
dev->flags |= IFF_MULTICAST;
in_dev = __in_dev_get_rtnl(dev);
- if (in_dev == NULL)
+ if (!in_dev)
goto failure;
ipv4_devconf_setall(in_dev);
@@ -480,8 +478,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+ return 0;
+}
+
static const struct net_device_ops reg_vif_netdev_ops = {
.ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
};
static void reg_vif_setup(struct net_device *dev)
@@ -507,7 +511,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
- if (dev == NULL)
+ if (!dev)
return NULL;
dev_net_set(dev, net);
@@ -516,7 +520,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
free_netdev(dev);
return NULL;
}
- dev->iflink = 0;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -764,7 +767,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
- if (dev && __in_dev_get_rtnl(dev) == NULL) {
+ if (dev && !__in_dev_get_rtnl(dev)) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
@@ -808,7 +811,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->pkt_out = 0;
v->link = dev->ifindex;
if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
- v->link = dev->iflink;
+ v->link = dev_get_iflink(dev);
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
@@ -1010,7 +1013,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
rcu_read_lock();
mroute_sk = rcu_dereference(mrt->mroute_sk);
- if (mroute_sk == NULL) {
+ if (!mroute_sk) {
rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
@@ -1163,7 +1166,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
return -EINVAL;
c = ipmr_cache_alloc();
- if (c == NULL)
+ if (!c)
return -ENOMEM;
c->mfc_origin = mfc->mfcc_origin.s_addr;
@@ -1285,7 +1288,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -EOPNOTSUPP;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return -ENOENT;
if (optname != MRT_INIT) {
@@ -1448,7 +1451,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
return -EOPNOTSUPP;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return -ENOENT;
if (optname != MRT_VERSION &&
@@ -1494,7 +1497,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
struct mr_table *mrt;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return -ENOENT;
switch (cmd) {
@@ -1568,7 +1571,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
struct mr_table *mrt;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return -ENOENT;
switch (cmd) {
@@ -1649,7 +1652,8 @@ static struct notifier_block ip_mr_notifier = {
* important for multicast video.
*/
-static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+static void ip_encap(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr)
{
struct iphdr *iph;
const struct iphdr *old_iph = ip_hdr(skb);
@@ -1668,14 +1672,14 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- ip_select_ident(skb, NULL);
+ ip_select_ident(net, skb, NULL);
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
nf_reset(skb);
}
-static inline int ipmr_forward_finish(struct sk_buff *skb)
+static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
@@ -1685,7 +1689,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output(skb);
+ return dst_output_sk(sk, skb);
}
/*
@@ -1702,7 +1706,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
struct flowi4 fl4;
int encap = 0;
- if (vif->dev == NULL)
+ if (!vif->dev)
goto out_free;
#ifdef CONFIG_IP_PIMSM
@@ -1765,7 +1769,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* What do we do with netfilter? -- RR
*/
if (vif->flags & VIFF_TUNNEL) {
- ip_encap(skb, vif->local, vif->remote);
+ ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
vif->dev->stats.tx_packets++;
vif->dev->stats.tx_bytes += skb->len;
@@ -1784,7 +1788,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, dev,
ipmr_forward_finish);
return;
@@ -1993,7 +1998,7 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
- if (cache == NULL) {
+ if (!cache) {
int vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0)
@@ -2004,13 +2009,13 @@ int ip_mr_input(struct sk_buff *skb)
/*
* No usable cache entry
*/
- if (cache == NULL) {
+ if (!cache) {
int vif;
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOBUFS;
skb = skb2;
}
@@ -2069,7 +2074,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
read_unlock(&mrt_lock);
- if (reg_dev == NULL)
+ if (!reg_dev)
return 1;
skb->mac_header = skb->network_header;
@@ -2199,18 +2204,18 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
int err;
mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return -ENOENT;
rcu_read_lock();
cache = ipmr_cache_find(mrt, saddr, daddr);
- if (cache == NULL && skb->dev) {
+ if (!cache && skb->dev) {
int vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0)
cache = ipmr_cache_find_any(mrt, daddr, vif);
}
- if (cache == NULL) {
+ if (!cache) {
struct sk_buff *skb2;
struct iphdr *iph;
struct net_device *dev;
@@ -2268,7 +2273,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int err;
nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
rtm = nlmsg_data(nlh);
@@ -2287,8 +2292,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
rtm->rtm_protocol = RTPROT_MROUTED;
rtm->rtm_flags = 0;
- if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
- nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
+ if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
goto nla_put_failure;
err = __ipmr_fill_mroute(mrt, skb, c, rtm);
/* do not break the dump if cache is unresolved */
@@ -2333,7 +2338,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
@@ -2448,7 +2453,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
struct mr_table *mrt;
mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return ERR_PTR(-ENOENT);
iter->mrt = mrt;
@@ -2567,7 +2572,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
struct mr_table *mrt;
mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
- if (mrt == NULL)
+ if (!mrt)
return ERR_PTR(-ENOENT);
it->mrt = mrt;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 7ebd6e37875c..61eafc9b4545 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -94,7 +94,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
{
struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
- if (entry->hook == NF_INET_LOCAL_OUT) {
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
rt_info->tos = iph->tos;
@@ -109,7 +109,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
- if (entry->hook == NF_INET_LOCAL_OUT) {
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
if (!(iph->tos == rt_info->tos &&
@@ -197,11 +197,4 @@ static int __init ipv4_netfilter_init(void)
{
return nf_register_afinfo(&nf_ip_afinfo);
}
-
-static void __exit ipv4_netfilter_fini(void)
-{
- nf_unregister_afinfo(&nf_ip_afinfo);
-}
-
-module_init(ipv4_netfilter_init);
-module_exit(ipv4_netfilter_fini);
+subsys_initcall(ipv4_netfilter_init);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 59f883d9cadf..2199a5db25e6 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,24 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config NF_LOG_ARP
- tristate "ARP packet logging"
- default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
-
-config NF_LOG_IPV4
- tristate "IPv4 packet logging"
- default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
+if NF_TABLES
config NF_TABLES_IPV4
- depends on NF_TABLES
tristate "IPv4 nf_tables support"
help
This option enables the IPv4 support for nf_tables.
+if NF_TABLES_IPV4
+
config NFT_CHAIN_ROUTE_IPV4
- depends on NF_TABLES_IPV4
tristate "IPv4 nf_tables route chain support"
help
This option enables the "route" chain for IPv4 in nf_tables. This
@@ -61,22 +53,34 @@ config NFT_CHAIN_ROUTE_IPV4
fields such as the source, destination, type of service and
the packet mark.
-config NF_REJECT_IPV4
- tristate "IPv4 packet rejection"
- default m if NETFILTER_ADVANCED=n
-
config NFT_REJECT_IPV4
- depends on NF_TABLES_IPV4
select NF_REJECT_IPV4
default NFT_REJECT
tristate
+endif # NF_TABLES_IPV4
+
config NF_TABLES_ARP
- depends on NF_TABLES
tristate "ARP nf_tables support"
help
This option enables the ARP support for nf_tables.
+endif # NF_TABLES
+
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
config NF_NAT_IPV4
tristate "IPv4 NAT"
depends on NF_CONNTRACK_IPV4
@@ -191,7 +195,8 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE || IP_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f95b6f93814b..92305a1a021a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -248,16 +248,16 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
unsigned int arpt_do_table(struct sk_buff *skb,
unsigned int hook,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct xt_table *table)
{
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
const struct arphdr *arp;
- struct arpt_entry *e, *back;
+ struct arpt_entry *e, **jumpstack;
const char *indev, *outdev;
- void *table_base;
+ const void *table_base;
+ unsigned int cpu, stackidx = 0;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
@@ -265,24 +265,25 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP;
- indev = in ? in->name : nulldevname;
- outdev = out ? out->name : nulldevname;
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ cpu = smp_processor_id();
/*
* Ensure we load private-> members after we've fetched the base
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries;
+ jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
e = get_entry(table_base, private->hook_entry[hook]);
- back = get_entry(table_base, private->underflow[hook]);
- acpar.in = in;
- acpar.out = out;
+ acpar.in = state->in;
+ acpar.out = state->out;
acpar.hooknum = hook;
acpar.family = NFPROTO_ARP;
acpar.hotdrop = false;
@@ -290,13 +291,15 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
const struct xt_entry_target *t;
+ struct xt_counters *counter;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
continue;
}
- ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
t = arpt_get_target_c(e);
@@ -311,18 +314,23 @@ unsigned int arpt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- e = back;
- back = get_entry(table_base, back->comefrom);
+ if (stackidx == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ } else {
+ e = jumpstack[--stackidx];
+ e = arpt_next_entry(e);
+ }
continue;
}
if (table_base + v
!= arpt_next_entry(e)) {
- /* Save old back ptr in next entry */
- struct arpt_entry *next = arpt_next_entry(e);
- next->comefrom = (void *)back - table_base;
- /* set back pointer to next entry */
- back = next;
+ if (stackidx >= private->stacksize) {
+ verdict = NF_DROP;
+ break;
+ }
+ jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
@@ -522,6 +530,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
@@ -539,6 +551,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -615,6 +629,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
@@ -703,12 +718,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -723,14 +732,16 @@ static void get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -775,7 +786,7 @@ static int copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
/* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
@@ -864,16 +875,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct arpt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1038,7 +1049,7 @@ static int __do_replace(struct net *net, const char *name,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter);
@@ -1076,14 +1087,16 @@ static int do_replace(struct net *net, const void __user *user,
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1113,7 +1126,7 @@ static int do_replace(struct net *net, const void __user *user,
static int do_add_counters(struct net *net, const void __user *user,
unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1123,7 +1136,6 @@ static int do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct arpt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1179,12 +1191,13 @@ static int do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
+
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1394,7 +1407,7 @@ static int translate_compat_table(const char *name,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1414,9 +1427,17 @@ static int translate_compat_table(const char *name,
i = 0;
xt_entry_foreach(iter1, entry1, newinfo->size) {
+ iter1->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(iter1->counters.pcnt)) {
+ ret = -ENOMEM;
+ break;
+ }
+
ret = check_target(iter1, name);
- if (ret != 0)
+ if (ret != 0) {
+ xt_percpu_counter_free(iter1->counters.pcnt);
break;
+ }
++i;
if (strcmp(arpt_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
@@ -1446,11 +1467,6 @@ static int translate_compat_table(const char *name,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1500,14 +1516,16 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1604,7 +1622,6 @@ static int compat_copy_entries_to_user(unsigned int total_size,
void __user *pos;
unsigned int size;
int ret = 0;
- void *loc_cpu_entry;
unsigned int i = 0;
struct arpt_entry *iter;
@@ -1612,11 +1629,9 @@ static int compat_copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy on our node/cpu */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -1785,8 +1800,7 @@ struct xt_table *arpt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1817,7 +1831,7 @@ void arpt_unregister_table(struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter);
if (private->number > private->initial_entries)
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 802ddecb30b8..93876d03120c 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -28,12 +28,11 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- const struct net *net = dev_net((in != NULL) ? in : out);
+ const struct net *net = dev_net(state->in ? state->in : state->out);
- return arpt_do_table(skb, ops->hooknum, in, out,
+ return arpt_do_table(skb, ops->hooknum, state,
net->ipv4.arptable_filter);
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cf5e82f39d3b..6c72fbb7b49e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb,
const struct xt_table_info *private,
const struct ipt_entry *e)
{
- const void *table_base;
const struct ipt_entry *root;
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
- root = get_entry(table_base, private->hook_entry[hook]);
+ root = get_entry(private->entries, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
comment = comments[NF_IP_TRACE_COMMENT_RULE];
@@ -288,8 +286,7 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
unsigned int
ipt_do_table(struct sk_buff *skb,
unsigned int hook,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct xt_table *table)
{
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
@@ -306,8 +303,8 @@ ipt_do_table(struct sk_buff *skb,
/* Initialization */
ip = ip_hdr(skb);
- indev = in ? in->name : nulldevname;
- outdev = out ? out->name : nulldevname;
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
@@ -317,8 +314,8 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
- acpar.in = in;
- acpar.out = out;
+ acpar.in = state->in;
+ acpar.out = state->out;
acpar.family = NFPROTO_IPV4;
acpar.hooknum = hook;
@@ -332,7 +329,7 @@ ipt_do_table(struct sk_buff *skb,
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[cpu];
+ table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
@@ -346,6 +343,7 @@ ipt_do_table(struct sk_buff *skb,
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
IP_NF_ASSERT(e);
if (!ip_packet_match(ip, indev, outdev,
@@ -362,7 +360,8 @@ ipt_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -370,7 +369,7 @@ ipt_do_table(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
- trace_packet(skb, hook, in, out,
+ trace_packet(skb, hook, state->in, state->out,
table->name, private, e);
#endif
/* Standard target? */
@@ -666,6 +665,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -692,6 +695,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
ret = check_target(e, net, name);
if (ret)
goto err;
+
return 0;
err:
module_put(t->u.kernel.target->me);
@@ -701,6 +705,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -785,6 +792,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
@@ -867,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -888,14 +890,16 @@ get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -940,11 +944,7 @@ copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1052,16 +1052,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct ipt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1182,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table *t;
struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_old_entry;
struct ipt_entry *iter;
ret = 0;
@@ -1225,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
@@ -1263,14 +1261,16 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1301,7 +1301,7 @@ static int
do_add_counters(struct net *net, const void __user *user,
unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1311,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct ipt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1367,12 +1366,12 @@ do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1442,7 +1441,6 @@ static int
compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
- unsigned int hookmask,
int *size)
{
struct xt_match *match;
@@ -1511,8 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
entry_offset = (void *)e - (void *)base;
j = 0;
xt_ematch_foreach(ematch, e) {
- ret = compat_find_calc_match(ematch, name,
- &e->ip, e->comefrom, &off);
+ ret = compat_find_calc_match(ematch, name, &e->ip, &off);
if (ret != 0)
goto release_matches;
++j;
@@ -1608,6 +1605,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
unsigned int j;
int ret = 0;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -1632,6 +1633,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -1716,7 +1720,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1768,11 +1772,6 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1810,14 +1809,16 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1888,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *pos;
unsigned int size;
int ret = 0;
- const void *loc_cpu_entry;
unsigned int i = 0;
struct ipt_entry *iter;
@@ -1896,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -2078,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2110,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e90f83a3415b..45cb16a6a4a3 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
struct clusterip_config *config;
int ret;
+ if (par->nft_compat) {
+ pr_err("cannot use CLUSTERIP target from nftables compat\n");
+ return -EOPNOTSUPP;
+ }
+
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
@@ -418,6 +423,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
+
+ if (!par->net->xt.clusterip_deprecated_warning) {
+ pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
+ "use xt_cluster instead\n");
+ par->net->xt.clusterip_deprecated_warning = true;
+ }
+
return ret;
}
@@ -497,14 +509,12 @@ static void arp_print(struct arp_payload *payload)
static unsigned int
arp_mangle(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
- struct net *net = dev_net(in ? in : out);
+ struct net *net = dev_net(state->in ? state->in : state->out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -529,10 +539,10 @@ arp_mangle(const struct nf_hook_ops *ops,
* addresses on different interfacs. However, in the CLUSTERIP case
* this wouldn't work, since we didn't subscribe the mcast group on
* other interfaces */
- if (c->dev != out) {
+ if (c->dev != state->out) {
pr_debug("not mangling arp reply on different "
"interface: cip'%s'-skb'%s'\n",
- c->dev->name, out->name);
+ c->dev->name, state->out->name);
clusterip_config_put(c);
return NF_ACCEPT;
}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 8f48f5517e33..87907d4bd259 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,31 +34,32 @@ static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
+ int hook = par->hooknum;
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
- nf_send_unreach(skb, ICMP_NET_UNREACH);
+ nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
break;
case IPT_ICMP_HOST_UNREACHABLE:
- nf_send_unreach(skb, ICMP_HOST_UNREACH);
+ nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
break;
case IPT_ICMP_PROT_UNREACHABLE:
- nf_send_unreach(skb, ICMP_PROT_UNREACH);
+ nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
break;
case IPT_ICMP_PORT_UNREACHABLE:
- nf_send_unreach(skb, ICMP_PORT_UNREACH);
+ nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
break;
case IPT_ICMP_NET_PROHIBITED:
- nf_send_unreach(skb, ICMP_NET_ANO);
+ nf_send_unreach(skb, ICMP_NET_ANO, hook);
break;
case IPT_ICMP_HOST_PROHIBITED:
- nf_send_unreach(skb, ICMP_HOST_ANO);
+ nf_send_unreach(skb, ICMP_HOST_ANO, hook);
break;
case IPT_ICMP_ADMIN_PROHIBITED:
- nf_send_unreach(skb, ICMP_PKT_FILTERED);
+ nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, par->hooknum);
+ nf_send_reset(skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index a313c3fbeb46..fe8cc183411e 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -18,7 +18,7 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct iphdr *
-synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct iphdr *iph;
@@ -220,7 +220,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
- nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
@@ -300,11 +300,9 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
+ struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 4bfaedf9b34e..8618fd150c96 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
struct net *net = dev_net(dev);
int ret __maybe_unused;
- if (fib_lookup(net, fl4, &res))
+ if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
if (res.type != RTN_UNICAST) {
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index e08a74a243a8..a0f3beca52d2 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -34,8 +34,7 @@ static const struct xt_table packet_filter = {
static unsigned int
iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
const struct net *net;
@@ -45,9 +44,8 @@ iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out,
- net->ipv4.iptable_filter);
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 6a5079c34bb3..62cbb8c5f4a8 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -37,8 +37,9 @@ static const struct xt_table packet_mangler = {
};
static unsigned int
-ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
+ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct net_device *out = state->out;
unsigned int ret;
const struct iphdr *iph;
u_int8_t tos;
@@ -58,7 +59,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
+ ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN) {
@@ -81,18 +82,16 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
static unsigned int
iptable_mangle_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
if (ops->hooknum == NF_INET_LOCAL_OUT)
- return ipt_mangle_out(skb, out);
+ return ipt_mangle_out(skb, state);
if (ops->hooknum == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, ops->hooknum, in, out,
- dev_net(out)->ipv4.iptable_mangle);
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->out)->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, ops->hooknum, in, out,
- dev_net(in)->ipv4.iptable_mangle);
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->in)->ipv4.iptable_mangle);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 6b67d7e9a75d..0d4d9cdf98a4 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -30,49 +30,40 @@ static const struct xt_table nf_nat_ipv4_table = {
static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
- return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
}
static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain);
+ return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
}
static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain);
+ return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
}
static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain);
+ return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
}
static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain);
+ return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index b2f7e8f98316..0356e6da4bb7 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -21,8 +21,7 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
const struct net *net;
@@ -32,8 +31,8 @@ iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c86647ed2078..4bce3980ccd9 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -38,9 +38,7 @@ static const struct xt_table security_table = {
static unsigned int
iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
const struct net *net;
@@ -50,8 +48,8 @@ iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out,
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state,
net->ipv4.iptable_security);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5c61328b7704..30ad9554b5e9 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -94,9 +94,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -123,9 +121,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -149,24 +145,20 @@ out:
static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
}
static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -322,8 +314,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
- nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
goto nla_put_failure;
return 0;
@@ -342,8 +334,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
return -EINVAL;
- t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]);
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
return 0;
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index a460a87e14f8..f0dfe92a00d6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -300,7 +300,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
__nf_ct_l3proto_find(exp->tuple.src.l3num),
__nf_ct_l4proto_find(exp->tuple.src.l3num,
exp->tuple.dst.protonum));
- return seq_putc(s, '\n');
+ seq_putc(s, '\n');
+
+ return 0;
}
static const struct seq_operations exp_seq_ops = {
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 7e5ca6f2d0cd..c88b7d434718 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -63,9 +63,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index d059182c1466..e7ad950cf9ef 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -10,8 +10,10 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
@@ -27,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
- .level = 5,
+ .level = LOGLEVEL_NOTICE,
.logflags = NF_LOG_MASK,
},
},
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 75101980eeee..076aadda0473 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -5,8 +5,10 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
@@ -26,7 +28,7 @@ static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
- .level = 5,
+ .level = LOGLEVEL_NOTICE,
.logflags = NF_LOG_MASK,
},
},
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index fc37711e11f3..e59cc05c09e9 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -256,11 +256,10 @@ EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
unsigned int
nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
+ const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct))
{
struct nf_conn *ct;
@@ -309,7 +308,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = do_chain(ops, skb, in, out, ct);
+ ret = do_chain(ops, skb, state, ct);
if (ret != NF_ACCEPT)
return ret;
@@ -323,7 +322,8 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ state->out))
goto oif_changed;
}
break;
@@ -332,7 +332,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
goto oif_changed;
}
@@ -346,17 +346,16 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
unsigned int
nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
+ const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct))
{
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -367,11 +366,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
unsigned int
nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
+ const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct))
{
#ifdef CONFIG_XFRM
@@ -386,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -410,11 +408,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
unsigned int
nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
+ const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct))
{
const struct nf_conn *ct;
@@ -427,7 +424,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 536da7bc598a..3262e41ff76f 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -13,6 +13,7 @@
#include <net/dst.h>
#include <net/netfilter/ipv4/nf_reject.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
#include <net/netfilter/ipv4/nf_reject.h>
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
@@ -43,7 +44,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
const struct sk_buff *oldskb,
- __be16 protocol, int ttl)
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -146,7 +147,8 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
*/
if (oldskb->nf_bridge) {
struct ethhdr *oeth = eth_hdr(oldskb);
- nskb->dev = oldskb->nf_bridge->physindev;
+
+ nskb->dev = nf_bridge_get_physindev(oldskb);
niph->tot_len = htons(nskb->len);
ip_send_check(niph);
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
@@ -164,4 +166,27 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
}
EXPORT_SYMBOL_GPL(nf_send_reset);
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+ struct iphdr *iph = ip_hdr(skb_in);
+ u8 proto;
+
+ if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ if (skb_csum_unnecessary(skb_in)) {
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+ return;
+ }
+
+ if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
+ proto = iph->protocol;
+ else
+ proto = 0;
+
+ if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 19412a4063fb..8412268bbad1 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -17,13 +17,11 @@
static unsigned int
nft_do_chain_arp(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, ops, skb, in, out);
+ nft_set_pktinfo(&pkt, ops, skb, state);
return nft_do_chain(&pkt, ops);
}
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 6820c8c40842..aa180d3a69a5 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -20,22 +20,18 @@
static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
return nft_do_chain(&pkt, ops);
}
static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
if (unlikely(skb->len < sizeof(struct iphdr) ||
ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
@@ -45,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
- return nft_do_chain_ipv4(ops, skb, in, out, okfn);
+ return nft_do_chain_ipv4(ops, skb, state);
}
struct nft_af_info nft_af_ipv4 __read_mostly = {
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index df547bf50078..bf5c30ae14e4 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -28,51 +28,42 @@
static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
+ const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
return nft_do_chain(&pkt, ops);
}
static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain);
+ return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
}
static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain);
+ return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
}
static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain);
+ return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
}
static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain);
+ return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv4 = {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 125b66766c0a..e335b0afdaf3 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -23,9 +23,7 @@
static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ const struct nf_hook_state *state)
{
unsigned int ret;
struct nft_pktinfo pkt;
@@ -39,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
mark = skb->mark;
iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 665de06561cd..40e414c4ca56 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -17,20 +17,17 @@
#include <net/netfilter/ipv4/nf_nat_masquerade.h>
static void nft_masq_ipv4_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
+ struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
struct nft_masq *priv = nft_expr_priv(expr);
struct nf_nat_range range;
- unsigned int verdict;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
- verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
- &range, pkt->out);
-
- data[NFT_REG_VERDICT].verdict = verdict;
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ &range, pkt->out);
}
static struct nft_expr_type nft_masq_ipv4_type;
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index 6ecfce63201a..d8d795df9c13 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -18,26 +18,25 @@
#include <net/netfilter/nft_redir.h>
static void nft_redir_ipv4_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
+ struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
struct nft_redir *priv = nft_expr_priv(expr);
struct nf_nat_ipv4_multi_range_compat mr;
- unsigned int verdict;
memset(&mr, 0, sizeof(mr));
if (priv->sreg_proto_min) {
mr.range[0].min.all =
- *(__be16 *)&data[priv->sreg_proto_min].data[0];
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
mr.range[0].max.all =
- *(__be16 *)&data[priv->sreg_proto_max].data[0];
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
mr.range[0].flags |= priv->flags;
- verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum);
- data[NFT_REG_VERDICT].verdict = verdict;
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
+ pkt->ops->hooknum);
}
static struct nft_expr_type nft_redir_ipv4_type;
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index d729542bd1b7..b07e58b51158 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -20,21 +20,24 @@
#include <net/netfilter/nft_reject.h>
static void nft_reject_ipv4_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
+ struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code);
+ nf_send_unreach(pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
break;
case NFT_REJECT_TCP_RST:
nf_send_reset(pkt->skb, pkt->ops->hooknum);
break;
+ default:
+ break;
}
- data[NFT_REG_VERDICT].verdict = NF_DROP;
+ regs->verdict.code = NF_DROP;
}
static struct nft_expr_type nft_reject_ipv4_type;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 208d5439e59b..05ff44b758df 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops);
static u16 ping_port_rover;
-static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
- int res = (num + net_hash_mix(net)) & mask;
+ u32 res = (num + net_hash_mix(net)) & mask;
- pr_debug("hash(%d) = %d\n", num, res);
+ pr_debug("hash(%u) = %u\n", num, res);
return res;
}
EXPORT_SYMBOL_GPL(ping_hash);
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
if (sk_hashed(sk)) {
write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
+ sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
@@ -516,7 +517,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
ntohs(icmph->un.echo.sequence));
sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
- if (sk == NULL) {
+ if (!sk) {
pr_debug("no socket, dropping\n");
return; /* No socket for error */
}
@@ -692,8 +693,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
}
EXPORT_SYMBOL_GPL(ping_common_sendmsg);
-static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct net *net = sock_net(sk);
struct flowi4 fl4;
@@ -849,8 +849,8 @@ do_confirm:
goto out;
}
-int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
@@ -972,7 +972,7 @@ bool ping_rcv(struct sk_buff *skb)
skb_push(skb, skb->data - (u8 *)icmph);
sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
- if (sk != NULL) {
+ if (sk) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
pr_debug("rcv on socket %p\n", sk);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index d8953ef0770c..da5d483e236a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- tcp_death_row.tw_count, sockets,
+ atomic_read(&tcp_death_row.tw_count), sockets,
proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
@@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+ SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f027a708b7e0..561cd4b8fc6e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -46,7 +46,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/errno.h>
-#include <linux/aio.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -293,7 +292,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_lock(&raw_v4_hashinfo.lock);
raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
- if (raw_sk != NULL) {
+ if (raw_sk) {
iph = (const struct iphdr *)skb->data;
net = dev_net(skb->dev);
@@ -363,7 +362,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb = sock_alloc_send_skb(sk,
length + hlen + tlen + 15,
flags & MSG_DONTWAIT, &err);
- if (skb == NULL)
+ if (!skb)
goto error;
skb_reserve(skb, hlen);
@@ -404,7 +403,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(skb, NULL);
+ ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
@@ -412,8 +411,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
icmp_out_count(net, ((struct icmphdr *)
skb_transport_header(skb))->type);
- err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
- rt->dst.dev, dst_output);
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, rt->dst.dev, dst_output_sk);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -481,8 +480,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
}
-static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipcm_cookie ipc;
@@ -709,8 +707,8 @@ out: return ret;
* we return it, otherwise we block.
*/
-static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
@@ -873,7 +871,7 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL)
+ if (skb)
amount = skb->len;
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ad5064362c5c..e681b852ced1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -152,7 +152,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
@@ -458,12 +457,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
}
#define IP_IDENTS_SZ 2048u
-struct ip_ident_bucket {
- atomic_t id;
- u32 stamp32;
-};
-static struct ip_ident_bucket *ip_idents __read_mostly;
+static atomic_t *ip_idents __read_mostly;
+static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
* if one generator is seldom used. This makes hard for an attacker
@@ -471,19 +467,20 @@ static struct ip_ident_bucket *ip_idents __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
+ atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
u32 delta = 0;
- if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, &bucket->id) - segs;
+ return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
-void __ip_select_ident(struct iphdr *iph, int segs)
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
static u32 ip_idents_hashrnd __read_mostly;
u32 hash, id;
@@ -492,7 +489,7 @@ void __ip_select_ident(struct iphdr *iph, int segs)
hash = jhash_3words((__force u32)iph->daddr,
(__force u32)iph->saddr,
- iph->protocol,
+ iph->protocol ^ net_hash_mix(net),
ip_idents_hashrnd);
id = ip_idents_reserve(hash, segs);
iph->id = htons(id);
@@ -750,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
- if (fib_lookup(net, fl4, &res) == 0) {
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, new_gw,
@@ -903,6 +900,10 @@ static int ip_error(struct sk_buff *skb)
bool send;
int code;
+ /* IP on this device is disabled. */
+ if (!in_dev)
+ goto out;
+
net = dev_net(rt->dst.dev);
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
@@ -963,10 +964,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (dst_metric_locked(dst, RTAX_MTU))
return;
- if (dst->dev->mtu < mtu)
- return;
-
- if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+ if (ipv4_mtu(dst) < mtu)
return;
if (mtu < ip_rt_min_pmtu)
@@ -977,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1057,7 +1055,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
- if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
+ if (odst->obsolete && !odst->ops->check(odst, 0)) {
rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
if (IS_ERR(rt))
goto out;
@@ -1188,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
fl4.flowi4_mark = skb->mark;
rcu_read_lock();
- if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
else
src = inet_select_addr(rt->dst.dev,
@@ -1451,7 +1449,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/* Primary sanity checks. */
- if (in_dev == NULL)
+ if (!in_dev)
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
@@ -1554,7 +1552,7 @@ static int __mkroute_input(struct sk_buff *skb,
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
- if (out_dev == NULL) {
+ if (!out_dev) {
net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
@@ -1592,7 +1590,7 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe != NULL)
+ if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
else
rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
@@ -1718,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.daddr = daddr;
fl4.saddr = saddr;
- err = fib_lookup(net, &fl4, &res);
+ err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
err = -EHOSTUNREACH;
@@ -2055,7 +2053,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
ipv4_is_lbcast(fl4->daddr))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = __ip_dev_find(net, fl4->saddr, false);
- if (dev_out == NULL)
+ if (!dev_out)
goto out;
/* Special hack: user can direct multicasts
@@ -2088,7 +2086,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (fl4->flowi4_oif) {
dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
rth = ERR_PTR(-ENODEV);
- if (dev_out == NULL)
+ if (!dev_out)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
@@ -2097,7 +2095,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto out;
}
if (ipv4_is_local_multicast(fl4->daddr) ||
- ipv4_is_lbcast(fl4->daddr)) {
+ ipv4_is_lbcast(fl4->daddr) ||
+ fl4->flowi4_proto == IPPROTO_IGMP) {
if (!fl4->saddr)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2124,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
- if (fib_lookup(net, fl4, &res)) {
+ if (fib_lookup(net, fl4, &res, 0)) {
res.fi = NULL;
res.table = NULL;
if (fl4->flowi4_oif) {
@@ -2177,7 +2176,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (!res.prefixlen &&
res.table->tb_num_default > 1 &&
res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(&res);
+ fib_select_default(fl4, &res);
if (!fl4->saddr)
fl4->saddr = FIB_RES_PREFSRC(net, res);
@@ -2225,7 +2224,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.check = ipv4_blackhole_dst_check,
.mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
@@ -2301,7 +2299,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
u32 metrics[RTAX_MAX];
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
r = nlmsg_data(nlh);
@@ -2321,11 +2319,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
r->rtm_flags |= RTCF_DOREDIRECT;
- if (nla_put_be32(skb, RTA_DST, dst))
+ if (nla_put_in_addr(skb, RTA_DST, dst))
goto nla_put_failure;
if (src) {
r->rtm_src_len = 32;
- if (nla_put_be32(skb, RTA_SRC, src))
+ if (nla_put_in_addr(skb, RTA_SRC, src))
goto nla_put_failure;
}
if (rt->dst.dev &&
@@ -2338,11 +2336,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
#endif
if (!rt_is_input_route(rt) &&
fl4->saddr != src) {
- if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
+ if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
if (rt->rt_uses_gateway &&
- nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
goto nla_put_failure;
expires = rt->dst.expires;
@@ -2423,7 +2421,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
rtm = nlmsg_data(nlh);
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (skb == NULL) {
+ if (!skb) {
err = -ENOBUFS;
goto errout;
}
@@ -2438,8 +2436,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
ip_hdr(skb)->protocol = IPPROTO_ICMP;
skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
- src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
- dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
+ src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+ dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
@@ -2454,7 +2452,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
struct net_device *dev;
dev = __dev_get_by_index(net, iif);
- if (dev == NULL) {
+ if (!dev) {
err = -ENODEV;
goto errout_free;
}
@@ -2653,7 +2651,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
tbl = ipv4_route_flush_table;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
- if (tbl == NULL)
+ if (!tbl)
goto err_dup;
/* Don't export sysctls to unprivileged users */
@@ -2663,7 +2661,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
tbl[0].extra1 = net;
net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
- if (net->ipv4.route_hdr == NULL)
+ if (!net->ipv4.route_hdr)
goto err_reg;
return 0;
@@ -2743,6 +2741,10 @@ int __init ip_rt_init(void)
prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
+ if (!ip_tstamps)
+ panic("IP: failed to allocate ip_tstamps\n");
+
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c5238e..d70b1f603692 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -219,22 +219,23 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
- if (child)
+ if (child) {
+ atomic_set(&req->rsk_refcnt, 1);
inet_csk_reqsk_queue_add(sk, req, child);
- else
+ } else {
reqsk_free(req);
-
+ }
return child;
}
-
+EXPORT_SYMBOL(tcp_get_cookie_sock);
/*
* when syncookies are in effect and tcp timestamps are enabled we stored
@@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
if (!req)
goto out;
@@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
- ireq->ir_loc_addr = ip_hdr(skb)->daddr;
- ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
ireq->ir_mark = inet_request_mark(sk, skb);
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
@@ -345,7 +346,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
- treq->listener = NULL;
+ treq->tfo_listener = false;
+
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -357,7 +360,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
}
- req->expires = 0UL;
req->num_retrans = 0;
/*
@@ -389,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
- ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
* Normal sockets get it right from inet_csk_route_child_sock()
*/
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539da8e6..433231ccfb17 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -41,11 +41,19 @@ static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int min_sndbuf = SOCK_MIN_SNDBUF;
+static int min_rcvbuf = SOCK_MIN_RCVBUF;
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
+ bool same_parity = !((range[0] ^ range[1]) & 1);
+
write_seqlock(&net->ipv4.ip_local_ports.lock);
+ if (same_parity && !net->ipv4.ip_local_ports.warned) {
+ net->ipv4.ip_local_ports.warned = true;
+ pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+ }
net->ipv4.ip_local_ports.range[0] = range[0];
net->ipv4.ip_local_ports.range[1] = range[1];
write_sequnlock(&net->ipv4.ip_local_ports.lock);
@@ -522,7 +530,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_tcp_wmem),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = &min_sndbuf,
},
{
.procname = "tcp_notsent_lowat",
@@ -537,7 +545,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_tcp_rmem),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = &min_rcvbuf,
},
{
.procname = "tcp_app_win",
@@ -702,7 +710,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &gso_max_segs,
},
{
@@ -750,7 +758,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = &min_rcvbuf,
},
{
.procname = "udp_wmem_min",
@@ -758,7 +766,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = &min_sndbuf,
},
{ }
};
@@ -821,6 +829,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_ecn_fallback",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
@@ -883,6 +898,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -895,7 +924,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
int i;
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
- if (table == NULL)
+ if (!table)
goto err_alloc;
/* Update the variables to point into the current struct net */
@@ -904,7 +933,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
- if (net->ipv4.ipv4_hdr == NULL)
+ if (!net->ipv4.ipv4_hdr)
goto err_reg;
net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
@@ -942,7 +971,7 @@ static __init int sysctl_ipv4_init(void)
struct ctl_table_header *hdr;
hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
- if (hdr == NULL)
+ if (!hdr)
return -ENOMEM;
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 995a2259bcfc..45534a5ab430 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -252,6 +252,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
+#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
@@ -401,6 +402,7 @@ void tcp_init_sock(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
+ u64_stats_init(&tp->syncp);
tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
@@ -496,7 +498,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Connected or passive Fast Open socket? */
if (sk->sk_state != TCP_SYN_SENT &&
- (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
+ (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -520,8 +522,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Race breaker. If space is freed after
* wspace test but before the flags are set,
- * IO signal will be lost.
+ * IO signal will be lost. Memory barrier
+ * pairs with the input side.
*/
+ smp_mb__after_atomic();
if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
@@ -691,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
- ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
- tss->flags);
+ ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+ min(rd_desc->count, len), tss->flags,
+ skb_socket_splice);
if (ret > 0)
rd_desc->count -= ret;
return ret;
@@ -775,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
- sk_wait_data(sk, &timeo);
+ sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
break;
@@ -805,16 +810,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
+ if (unlikely(tcp_under_memory_pressure(sk)))
+ sk_mem_reclaim_partial(sk);
+
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
- if (skb) {
- if (sk_wmem_schedule(sk, skb->truesize)) {
+ if (likely(skb)) {
+ bool mem_scheduled;
+
+ if (force_schedule) {
+ mem_scheduled = true;
+ sk_forced_mem_schedule(sk, skb->truesize);
+ } else {
+ mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
+ }
+ if (likely(mem_scheduled)) {
skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
@@ -904,7 +921,8 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -983,6 +1001,9 @@ do_error:
if (copied)
goto out;
out_err:
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
return sk_stream_error(sk, flags, err);
}
@@ -1028,7 +1049,7 @@ static inline int select_size(const struct sock *sk, bool sg)
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
- if (tp->fastopen_req != NULL) {
+ if (tp->fastopen_req) {
kfree(tp->fastopen_req);
tp->fastopen_req = NULL;
}
@@ -1042,12 +1063,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
return -EOPNOTSUPP;
- if (tp->fastopen_req != NULL)
+ if (tp->fastopen_req)
return -EALREADY; /* Another Fast Open is in progress */
tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
sk->sk_allocation);
- if (unlikely(tp->fastopen_req == NULL))
+ if (unlikely(!tp->fastopen_req))
return -ENOBUFS;
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
@@ -1060,8 +1081,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return err;
}
-int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t size)
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1120,7 +1140,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sg = !!(sk->sk_route_caps & NETIF_F_SG);
- while (iov_iter_count(&msg->msg_iter)) {
+ while (msg_data_left(msg)) {
int copy = 0;
int max = size_goal;
@@ -1141,7 +1161,8 @@ new_segment:
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
- sk->sk_allocation);
+ sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -1164,8 +1185,8 @@ new_segment:
}
/* Try to append data to the end of skb. */
- if (copy > iov_iter_count(&msg->msg_iter))
- copy = iov_iter_count(&msg->msg_iter);
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
/* Where to copy to? */
if (skb_availroom(skb) > 0) {
@@ -1222,7 +1243,7 @@ new_segment:
tcp_skb_pcount_set(skb, 0);
copied += copy;
- if (!iov_iter_count(&msg->msg_iter)) {
+ if (!msg_data_left(msg)) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -1272,6 +1293,9 @@ do_error:
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
release_sock(sk);
return err;
}
@@ -1539,8 +1563,8 @@ EXPORT_SYMBOL(tcp_read_sock);
* Probably, code can be easily improved even more.
*/
-int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int nonblock, int flags, int *addr_len)
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
@@ -1551,7 +1575,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
u32 urg_hole = 0;
if (unlikely(flags & MSG_ERRQUEUE))
@@ -1611,7 +1635,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Next get a buffer. */
+ last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
+ last = skb;
/* Now that we have two receive queues this
* shouldn't happen.
*/
@@ -1730,8 +1756,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
- } else
- sk_wait_data(sk, &timeo);
+ } else {
+ sk_wait_data(sk, &timeo, last);
+ }
if (user_recv) {
int chunk;
@@ -1914,18 +1941,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state);
static const unsigned char new_state[16] = {
/* current state: new state: action: */
- /* (Invalid) */ TCP_CLOSE,
- /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
- /* TCP_SYN_SENT */ TCP_CLOSE,
- /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
- /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
- /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
- /* TCP_TIME_WAIT */ TCP_CLOSE,
- /* TCP_CLOSE */ TCP_CLOSE,
- /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
- /* TCP_LAST_ACK */ TCP_LAST_ACK,
- /* TCP_LISTEN */ TCP_CLOSE,
- /* TCP_CLOSING */ TCP_CLOSING,
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE,
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
};
static int tcp_close_state(struct sock *sk)
@@ -2138,7 +2166,7 @@ adjudge_to_death:
* aborted (e.g., closed with unread data) before 3WHS
* finishes.
*/
- if (req != NULL)
+ if (req)
reqsk_fastopen_remove(sk, req, false);
inet_csk_destroy_sock(sk);
}
@@ -2479,6 +2507,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
icsk->icsk_syn_retries = val;
break;
+ case TCP_SAVE_SYN:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->save_syn = val;
+ break;
+
case TCP_LINGER2:
if (val < 0)
tp->linger2 = -1;
@@ -2541,10 +2576,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
- TCPF_LISTEN)))
+ TCPF_LISTEN))) {
+ tcp_fastopen_init_key_once(true);
+
err = fastopen_init_queue(sk, val);
- else
+ } else {
err = -EINVAL;
+ }
break;
case TCP_TIMESTAMP:
if (!tp->repair)
@@ -2590,13 +2628,17 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
/* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(const struct sock *sk, struct tcp_info *info)
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
+ unsigned int start;
+ u32 rate;
memset(info, 0, sizeof(*info));
+ if (sk->sk_type != SOCK_STREAM)
+ return;
info->tcpi_state = sk->sk_state;
info->tcpi_ca_state = icsk->icsk_ca_state;
@@ -2655,10 +2697,19 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
- info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
- sk->sk_pacing_rate : ~0ULL;
- info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
- sk->sk_max_pacing_rate : ~0ULL;
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tp->syncp);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+ info->tcpi_segs_out = tp->segs_out;
+ info->tcpi_segs_in = tp->segs_in;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2730,6 +2781,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
}
+ case TCP_CC_INFO: {
+ const struct tcp_congestion_ops *ca_ops;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ ca_ops = icsk->icsk_ca_ops;
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+ len = min_t(unsigned int, len, sz);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_QUICKACK:
val = !icsk->icsk_ack.pingpong;
break;
@@ -2776,7 +2847,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- if (icsk->icsk_accept_queue.fastopenq != NULL)
+ if (icsk->icsk_accept_queue.fastopenq)
val = icsk->icsk_accept_queue.fastopenq->max_qlen;
else
val = 0;
@@ -2788,6 +2859,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_SAVE_SYN:
+ val = tp->save_syn;
+ break;
+ case TCP_SAVED_SYN: {
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (tp->saved_syn) {
+ if (len < tp->saved_syn[0]) {
+ if (put_user(tp->saved_syn[0], optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ release_sock(sk);
+ return -EINVAL;
+ }
+ len = tp->saved_syn[0];
+ if (put_user(len, optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ tcp_saved_syn_free(tp);
+ release_sock(sk);
+ } else {
+ release_sock(sk);
+ len = 0;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -2960,7 +3067,7 @@ void tcp_done(struct sock *sk)
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
- if (req != NULL)
+ if (req)
reqsk_fastopen_remove(sk, req, false);
sk->sk_shutdown = SHUTDOWN_MASK;
@@ -2992,21 +3099,21 @@ __setup("thash_entries=", set_thash_entries);
static void __init tcp_init_mem(void)
{
- unsigned long limit = nr_free_buffer_pages() / 8;
+ unsigned long limit = nr_free_buffer_pages() / 16;
+
limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
+ sysctl_tcp_mem[1] = limit; /* 6.25 % */
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
void __init tcp_init(void)
{
- struct sk_buff *skb = NULL;
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned int i;
- BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
+ sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
new file mode 100644
index 000000000000..8c6fd3d5e40f
--- /dev/null
+++ b/net/ipv4/tcp_cdg.c
@@ -0,0 +1,433 @@
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
+ *
+ * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
+ * heuristics using parameters use_shadow=0 and use_ineff=0.
+ *
+ * Parameters window, backoff_beta, and backoff_factor are crucial for
+ * throughput and delay. Future work is needed to determine better defaults,
+ * and to provide guidelines for use in different environments/contexts.
+ *
+ * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
+ * Parameter window is only configurable when loading tcp_cdg as a module.
+ *
+ * Notable differences from paper/FreeBSD:
+ * o Using Hybrid Slow start and Proportional Rate Reduction.
+ * o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ * o Add toggle for non-congestion loss tolerance.
+ * o Scaling parameter G is changed to a backoff factor;
+ * conversion is given by: backoff_factor = 1000/(G * window).
+ * o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
+ * o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define HYSTART_ACK_TRAIN 1
+#define HYSTART_DELAY 2
+
+static int window __read_mostly = 8;
+static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
+static unsigned int backoff_factor __read_mostly = 42;
+static unsigned int hystart_detect __read_mostly = 3;
+static unsigned int use_ineff __read_mostly = 5;
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+
+module_param(window, int, 0444);
+MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
+module_param(backoff_beta, uint, 0644);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
+module_param(backoff_factor, uint, 0644);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
+module_param(hystart_detect, uint, 0644);
+MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
+ "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
+module_param(use_ineff, uint, 0644);
+MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+
+struct minmax {
+ union {
+ struct {
+ s32 min;
+ s32 max;
+ };
+ u64 v64;
+ };
+};
+
+enum cdg_state {
+ CDG_UNKNOWN = 0,
+ CDG_NONFULL = 1,
+ CDG_FULL = 2,
+ CDG_BACKOFF = 3,
+};
+
+struct cdg {
+ struct minmax rtt;
+ struct minmax rtt_prev;
+ struct minmax *gradients;
+ struct minmax gsum;
+ bool gfilled;
+ u8 tail;
+ u8 state;
+ u8 delack;
+ u32 rtt_seq;
+ u32 undo_cwnd;
+ u32 shadow_wnd;
+ u16 backoff_cnt;
+ u16 sample_cnt;
+ s32 delay_min;
+ u32 last_ack;
+ u32 round_start;
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+ static const u16 v[] = {
+ /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+ 65535,
+ 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+ 61378, 57484, 50423, 38795, 22965, 8047, 987, 14,
+ };
+ u32 msb = ux >> 8;
+ u32 res;
+ int i;
+
+ /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+ if (msb > U16_MAX)
+ return 0;
+
+ /* Scale first eight bits linearly: */
+ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+ /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+ for (i = 1; msb; i++, msb >>= 1) {
+ u32 y = v[i & -(msb & 1)] + U32_C(1);
+
+ res = ((u64)res * y) >> 16;
+ }
+
+ return res;
+}
+
+/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
+ * tcp_cubic. Differences/experimental changes:
+ * o Using Hayes' delayed ACK filter.
+ * o Using a usec clock for the ACK train.
+ * o Reset ACK train when application limited.
+ * o Invoked at any cwnd (i.e. also when cwnd < 16).
+ * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
+ */
+static void tcp_cdg_hystart_update(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
+ if (ca->delay_min == 0)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+
+ if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
+ ca->last_ack = now_us;
+ ca->round_start = now_us;
+ } else if (before(now_us, ca->last_ack + 3000)) {
+ u32 base_owd = max(ca->delay_min / 2U, 125U);
+
+ ca->last_ack = now_us;
+ if (after(now_us, ca->round_start + base_owd)) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ return;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ if (ca->sample_cnt < 8) {
+ ca->sample_cnt++;
+ } else {
+ s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
+ 125U);
+
+ if (ca->rtt.min > thresh) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+static s32 tcp_cdg_grad(struct cdg *ca)
+{
+ s32 gmin = ca->rtt.min - ca->rtt_prev.min;
+ s32 gmax = ca->rtt.max - ca->rtt_prev.max;
+ s32 grad;
+
+ if (ca->gradients) {
+ ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+ ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+ ca->gradients[ca->tail].min = gmin;
+ ca->gradients[ca->tail].max = gmax;
+ ca->tail = (ca->tail + 1) & (window - 1);
+ gmin = ca->gsum.min;
+ gmax = ca->gsum.max;
+ }
+
+ /* We keep sums to ignore gradients during cwnd reductions;
+ * the paper's smoothed gradients otherwise simplify to:
+ * (rtt_latest - rtt_oldest) / window.
+ *
+ * We also drop division by window here.
+ */
+ grad = gmin > 0 ? gmin : gmax;
+
+ /* Extrapolate missing values in gradient window: */
+ if (!ca->gfilled) {
+ if (!ca->gradients && window > 1)
+ grad *= window; /* Memory allocation failed. */
+ else if (ca->tail == 0)
+ ca->gfilled = true;
+ else
+ grad = (grad * window) / (int)ca->tail;
+ }
+
+ /* Backoff was effectual: */
+ if (gmin <= -32 || gmax <= -32)
+ ca->backoff_cnt = 0;
+
+ if (use_tolerance) {
+ /* Reduce small variations to zero: */
+ gmin = DIV_ROUND_CLOSEST(gmin, 64);
+ gmax = DIV_ROUND_CLOSEST(gmax, 64);
+
+ if (gmin > 0 && gmax <= 0)
+ ca->state = CDG_FULL;
+ else if ((gmin > 0 && gmax > 0) || gmax < 0)
+ ca->state = CDG_NONFULL;
+ }
+ return grad;
+}
+
+static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (prandom_u32() <= nexp_u32(grad * backoff_factor))
+ return false;
+
+ if (use_ineff) {
+ ca->backoff_cnt++;
+ if (ca->backoff_cnt > use_ineff)
+ return false;
+ }
+
+ ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+ ca->state = CDG_BACKOFF;
+ tcp_enter_cwr(sk);
+ return true;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_cwnd;
+ u32 incr;
+
+ if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect)
+ tcp_cdg_hystart_update(sk);
+
+ if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
+ s32 grad = 0;
+
+ if (ca->rtt_prev.v64)
+ grad = tcp_cdg_grad(ca);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ ca->last_ack = 0;
+ ca->sample_cnt = 0;
+
+ if (grad > 0 && tcp_cdg_backoff(sk, grad))
+ return;
+ }
+
+ if (!tcp_is_cwnd_limited(sk)) {
+ ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+ return;
+ }
+
+ prior_snd_cwnd = tp->snd_cwnd;
+ tcp_reno_cong_avoid(sk, ack, acked);
+
+ incr = tp->snd_cwnd - prior_snd_cwnd;
+ ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
+}
+
+static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rtt_us <= 0)
+ return;
+
+ /* A heuristic for filtering delayed ACKs, adapted from:
+ * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+ * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+ */
+ if (tp->sacked_out == 0) {
+ if (num_acked == 1 && ca->delack) {
+ /* A delayed ACK is only used for the minimum if it is
+ * provenly lower than an existing non-zero minimum.
+ */
+ ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->delack--;
+ return;
+ } else if (num_acked > 1 && ca->delack < 5) {
+ ca->delack++;
+ }
+ }
+
+ ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
+ ca->rtt.max = max(ca->rtt.max, rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->undo_cwnd = tp->snd_cwnd;
+
+ if (ca->state == CDG_BACKOFF)
+ return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
+
+ if (ca->state == CDG_NONFULL && use_tolerance)
+ return tp->snd_cwnd;
+
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
+ if (use_shadow)
+ return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
+ return max(2U, tp->snd_cwnd >> 1);
+}
+
+static u32 tcp_cdg_undo_cwnd(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct minmax *gradients;
+
+ switch (ev) {
+ case CA_EVENT_CWND_RESTART:
+ gradients = ca->gradients;
+ if (gradients)
+ memset(gradients, 0, window * sizeof(gradients[0]));
+ memset(ca, 0, sizeof(*ca));
+
+ ca->gradients = gradients;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+ break;
+ case CA_EVENT_COMPLETE_CWR:
+ ca->state = CDG_UNKNOWN;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* We silently fall back to window = 1 if allocation fails. */
+ if (window > 1)
+ ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
+ GFP_NOWAIT | __GFP_NOWARN);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ kfree(ca->gradients);
+}
+
+struct tcp_congestion_ops tcp_cdg __read_mostly = {
+ .cong_avoid = tcp_cdg_cong_avoid,
+ .cwnd_event = tcp_cdg_cwnd_event,
+ .pkts_acked = tcp_cdg_acked,
+ .undo_cwnd = tcp_cdg_undo_cwnd,
+ .ssthresh = tcp_cdg_ssthresh,
+ .release = tcp_cdg_release,
+ .init = tcp_cdg_init,
+ .owner = THIS_MODULE,
+ .name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+ if (backoff_beta > 1024 || window < 1 || window > 256)
+ return -ERANGE;
+ if (!is_power_of_2(window))
+ return -EINVAL;
+
+ BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_cdg);
+ return 0;
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_cdg);
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 62856e185a93..84be008c945c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -83,7 +83,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
- pr_info("%s registered\n", ca->name);
+ pr_debug("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
@@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk,
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;
+ icsk->icsk_ca_setsockopt = 1;
if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
@@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
/* No change asking for existing value */
- if (ca == icsk->icsk_ca_ops)
+ if (ca == icsk->icsk_ca_ops) {
+ icsk->icsk_ca_setsockopt = 1;
goto out;
+ }
if (!ca)
err = -ENOENT;
else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index b504371af742..7092a61c4dc8 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
- /* For avoiding denominator == 1. */
- if (ca->acked_bytes_total == 0)
- ca->acked_bytes_total = 1;
+ u64 bytes_ecn = ca->acked_bytes_ecn;
+ u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
- ca->dctcp_alpha = ca->dctcp_alpha -
- (ca->dctcp_alpha >> dctcp_shift_g) +
- (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
- ca->acked_bytes_total;
- if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
- /* Clamp dctcp_alpha to max. */
- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ alpha -= alpha >> dctcp_shift_g;
+ if (bytes_ecn) {
+ /* If dctcp_shift_g == 1, a 32bit value would overflow
+ * after 8 Mbytes.
+ */
+ bytes_ecn <<= (10 - dctcp_shift_g);
+ do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+ alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+ }
+ /* dctcp_alpha can be read from dctcp_get_info() without
+ * synchro, so we ask compiler to not use dctcp_alpha
+ * as a temporary variable in prior operations.
+ */
+ WRITE_ONCE(ca->dctcp_alpha, alpha);
dctcp_reset(tp, ca);
}
}
@@ -277,7 +283,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
}
}
-static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct dctcp *ca = inet_csk_ca(sk);
@@ -286,19 +293,19 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
*/
if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcp_dctcp_info info;
-
- memset(&info, 0, sizeof(info));
+ memset(info, 0, sizeof(struct tcp_dctcp_info));
if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
- info.dctcp_enabled = 1;
- info.dctcp_ce_state = (u16) ca->ce_state;
- info.dctcp_alpha = ca->dctcp_alpha;
- info.dctcp_ab_ecn = ca->acked_bytes_ecn;
- info.dctcp_ab_tot = ca->acked_bytes_total;
+ info->dctcp.dctcp_enabled = 1;
+ info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+ info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+ info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
}
- nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+ *attr = INET_DIAG_DCTCPINFO;
+ return sizeof(*info);
}
+ return 0;
}
static struct tcp_congestion_ops dctcp __read_mostly = {
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d73f9ddb55b..479f34946177 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -19,28 +19,29 @@
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *_info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_info *info = _info;
if (sk->sk_state == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
- } else {
+ } else if (sk->sk_type == SOCK_STREAM) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
r->idiag_wqueue = tp->write_seq - tp->snd_una;
}
- if (info != NULL)
+ if (info)
tcp_get_info(sk, info);
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
}
static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
@@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
+ .idiag_info_size = sizeof(struct tcp_info),
};
static int __init tcp_diag_init(void)
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ea82fd492c1b..f9c0fb84e435 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -78,8 +78,6 @@ static bool __tcp_fastopen_cookie_gen(const void *path,
struct tcp_fastopen_context *ctx;
bool ok = false;
- tcp_fastopen_init_key_once(true);
-
rcu_read_lock();
ctx = rcu_dereference(tcp_fastopen_ctx);
if (ctx) {
@@ -141,7 +139,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
req->sk = NULL;
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL)
+ if (!child)
return false;
spin_lock(&queue->fastopenq->lock);
@@ -155,12 +153,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tp = tcp_sk(child);
tp->fastopen_rsk = req;
- /* Do a hold on the listner sk so that if the listener is being
- * closed, the child that has been accepted can live on and still
- * access listen_lock.
- */
- sock_hold(sk);
- tcp_rsk(req)->listener = sk;
+ tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
* scaled. So correct it appropriately.
@@ -174,6 +167,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+ atomic_set(&req->rsk_refcnt, 1);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, child);
@@ -210,6 +204,11 @@ static bool tcp_fastopen_create_child(struct sock *sk,
skb_set_owner_r(skb2, child);
__skb_queue_tail(&child->sk_receive_queue, skb2);
tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
} else {
end_seq = TCP_SKB_CB(skb)->seq + 1;
}
@@ -218,10 +217,9 @@ static bool tcp_fastopen_create_child(struct sock *sk,
sk->sk_data_ready(sk);
bh_unlock_sock(child);
sock_put(child);
- WARN_ON(req->sk == NULL);
+ WARN_ON(!req->sk);
return true;
}
-EXPORT_SYMBOL(tcp_fastopen_create_child);
static bool tcp_fastopen_queue_check(struct sock *sk)
{
@@ -238,14 +236,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* temporarily vs a server not supporting Fast Open at all.
*/
fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (fastopenq == NULL || fastopenq->max_qlen == 0)
+ if (!fastopenq || fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
- if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+ if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
spin_unlock(&fastopenq->lock);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
@@ -254,7 +252,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
fastopenq->rskq_rst_head = req1->dl_next;
fastopenq->qlen--;
spin_unlock(&fastopenq->lock);
- reqsk_free(req1);
+ reqsk_put(req1);
}
return true;
}
@@ -308,6 +306,7 @@ fastopen:
} else if (foc->len > 0) /* Client presents an invalid cookie */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ valid_foc.exp = foc->exp;
*foc = valid_foc;
return false;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1d5a30a90adf..f71002e4db0b 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,26 +300,27 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
}
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_illinois_info(struct sock *sk, u32 ext,
- struct sk_buff *skb)
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct illinois *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = 1,
- .tcpv_rttcnt = ca->cnt_rtt,
- .tcpv_minrtt = ca->base_rtt,
- };
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+ info->vegas.tcpv_minrtt = ca->base_rtt;
+ info->vegas.tcpv_rtt = 0;
- if (info.tcpv_rttcnt > 0) {
+ if (info->vegas.tcpv_rttcnt > 0) {
u64 t = ca->sum_rtt;
- do_div(t, info.tcpv_rttcnt);
- info.tcpv_rtt = t;
+ do_div(t, info->vegas.tcpv_rttcnt);
+ info->vegas.tcpv_rtt = t;
}
- nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
+ return 0;
}
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f501ac048366..728f5b3d3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !sk_under_memory_pressure(sk)) {
+ !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !sk_under_memory_pressure(sk) &&
+ !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -866,7 +866,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if ((tp->retransmit_skb_hint == NULL) ||
+ if (!tp->retransmit_skb_hint ||
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
@@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sacktag_state {
int reord;
int fack_count;
- long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ struct skb_mstamp first_sackt;
+ struct skb_mstamp last_sackt;
int flag;
};
@@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
- /* Pick the earliest sequence sacked for RTT */
- if (state->rtt_us < 0) {
- struct skb_mstamp now;
-
- skb_mstamp_get(&now);
- state->rtt_us = skb_mstamp_us_delta(&now,
- xmit_time);
- }
+ if (state->first_sackt.v64 == 0)
+ state->first_sackt = *xmit_time;
+ state->last_sackt = *xmit_time;
}
if (sacked & TCPCB_LOST) {
@@ -1256,7 +1256,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
fack_count += pcount;
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
+ if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
@@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* code can come after this skb later on it's better to keep
* setting gso_size to something.
*/
- if (!skb_shinfo(prev)->gso_size) {
- skb_shinfo(prev)->gso_size = mss;
- skb_shinfo(prev)->gso_type = sk->sk_gso_type;
- }
+ if (!TCP_SKB_CB(prev)->tcp_gso_size)
+ TCP_SKB_CB(prev)->tcp_gso_size = mss;
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (tcp_skb_pcount(skb) <= 1) {
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- }
+ if (tcp_skb_pcount(skb) <= 1)
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1535,7 +1531,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- if ((next_dup != NULL) &&
+ if (next_dup &&
before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
in_sack = tcp_match_skb_to_sack(sk, skb,
next_dup->start_seq,
@@ -1551,7 +1547,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
if (in_sack <= 0) {
tmp = tcp_shift_skb_data(sk, skb, state,
start_seq, end_seq, dup_sack);
- if (tmp != NULL) {
+ if (tmp) {
if (tmp != skb) {
skb = tmp;
continue;
@@ -1614,7 +1610,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
- if (next_dup == NULL)
+ if (!next_dup)
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
@@ -1634,7 +1630,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, long *sack_rtt_us)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1642,7 +1638,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
- struct tcp_sacktag_state state;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
@@ -1650,9 +1645,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int i, j;
int first_sack_index;
- state.flag = 0;
- state.reord = tp->packets_out;
- state.rtt_us = -1L;
+ state->flag = 0;
+ state->reord = tp->packets_out;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1663,7 +1657,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
- state.flag |= FLAG_DSACKING_ACK;
+ state->flag |= FLAG_DSACKING_ACK;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1728,7 +1722,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
skb = tcp_write_queue_head(sk);
- state.fack_count = 0;
+ state->fack_count = 0;
i = 0;
if (!tp->sacked_out) {
@@ -1762,10 +1756,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, &state,
+ skb = tcp_sacktag_skip(skb, sk, state,
start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
- &state,
+ state,
start_seq,
cache->start_seq,
dup_sack);
@@ -1776,21 +1770,21 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
- &state,
+ state,
cache->end_seq);
/* ...tail remains todo... */
if (tcp_highest_sack_seq(tp) == cache->end_seq) {
/* ...but better entrypoint exists! */
skb = tcp_highest_sack(sk);
- if (skb == NULL)
+ if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1798,14 +1792,14 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (!before(start_seq, tcp_highest_sack_seq(tp))) {
skb = tcp_highest_sack(sk);
- if (skb == NULL)
+ if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
}
- skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, start_seq);
walk:
- skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ skb = tcp_sacktag_walk(skb, sk, next_dup, state,
start_seq, end_seq, dup_sack);
advance_sp:
@@ -1820,14 +1814,12 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- tcp_mark_lost_retrans(sk);
-
- tcp_verify_left_out(tp);
-
- if ((state.reord < tp->fackets_out) &&
+ if ((state->reord < tp->fackets_out) &&
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+ tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
+ tcp_mark_lost_retrans(sk);
+ tcp_verify_left_out(tp);
out:
#if FASTRETRANS_DEBUG > 0
@@ -1836,8 +1828,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- *sack_rtt_us = state.rtt_us;
- return state.flag;
+ return state->flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
@@ -1926,14 +1917,13 @@ void tcp_enter_loss(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- bool new_recovery = false;
+ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
- new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2257,7 +2247,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
(oldcnt >= packets))
break;
- mss = skb_shinfo(skb)->gso_size;
+ mss = tcp_skb_mss(skb);
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
mss, GFP_ATOMIC);
if (err < 0)
@@ -2557,6 +2547,7 @@ void tcp_enter_cwr(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
+EXPORT_SYMBOL(tcp_enter_cwr);
static void tcp_try_keep_open(struct sock *sk)
{
@@ -2700,16 +2691,21 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
+ if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ tcp_try_undo_loss(sk, false))
+ return;
+
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
/* Step 3.b. A timeout is spurious if not all data are
* lost, i.e., never-retransmitted data are (s)acked.
*/
- if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
return;
- if (after(tp->snd_nxt, tp->high_seq) &&
- (flag & FLAG_DATA_SACKED || is_dupack)) {
- tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
@@ -2734,8 +2730,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- if (tcp_try_undo_loss(sk, false))
- return;
tcp_xmit_retransmit_queue(sk);
}
@@ -3054,7 +3048,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, long sack_rtt_us)
+ u32 prior_snd_una,
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt, now;
@@ -3062,8 +3057,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
- long ca_seq_rtt_us = -1L;
+ long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
+ long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
bool rtt_update;
@@ -3099,17 +3095,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
- } else {
+ } else if (!(sacked & TCPCB_SACKED_ACKED)) {
last_ackt = skb->skb_mstamp;
WARN_ON_ONCE(last_ackt.v64 == 0);
if (!first_ackt.v64)
first_ackt = last_ackt;
- if (!(sacked & TCPCB_SACKED_ACKED)) {
- reord = min(pkts_acked, reord);
- if (!after(scb->end_seq, tp->high_seq))
- flag |= FLAG_ORIG_SACK_ACKED;
- }
+ reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED)
@@ -3154,15 +3148,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
skb_mstamp_get(&now);
if (likely(first_ackt.v64)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+ if (sack->first_sackt.v64) {
+ sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
if (flag & FLAG_ACKED) {
- const struct tcp_congestion_ops *ca_ops
- = inet_csk(sk)->icsk_ca_ops;
-
tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
@@ -3185,11 +3180,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3199,6 +3189,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
@@ -3239,7 +3232,7 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
- unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
when, TCP_RTO_MAX);
@@ -3282,6 +3275,28 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_acked += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_received += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->rcv_nxt = seq;
+}
+
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3317,11 +3332,41 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
}
}
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
return flag;
}
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
@@ -3415,6 +3460,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sacktag_state sack_state;
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3423,7 +3469,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
- long sack_rtt_us = -1L;
+
+ sack_state.first_sackt.v64 = 0;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3469,7 +3516,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
@@ -3487,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
@@ -3512,7 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
- sack_rtt_us);
+ &sack_state);
acked -= tp->packets_out;
/* Advance cwnd if state allows */
@@ -3564,7 +3611,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -3573,6 +3620,23 @@ old_ack:
return 0;
}
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+ bool syn, struct tcp_fastopen_cookie *foc,
+ bool exp_opt)
+{
+ /* Valid only in SYN or SYN-ACK with an even length. */
+ if (!foc || !syn || len < 0 || (len & 1))
+ return;
+
+ if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+ len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, cookie, len);
+ else if (len != 0)
+ len = -1;
+ foc->len = len;
+ foc->exp = exp_opt;
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3662,21 +3726,22 @@ void tcp_parse_options(const struct sk_buff *skb,
*/
break;
#endif
+ case TCPOPT_FASTOPEN:
+ tcp_parse_fastopen_option(
+ opsize - TCPOLEN_FASTOPEN_BASE,
+ ptr, th->syn, foc, false);
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
- * 16 bits magic number. It's valid only in
- * SYN or SYN-ACK with an even size.
+ * 16 bits magic number.
*/
- if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
- get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
- foc == NULL || !th->syn || (opsize & 1))
- break;
- foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
- if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
- foc->len <= TCP_FASTOPEN_COOKIE_MAX)
- memcpy(foc->val, ptr + 2, foc->len);
- else if (foc->len != 0)
- foc->len = -1;
+ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FASTOPEN_MAGIC)
+ tcp_parse_fastopen_option(opsize -
+ TCPOLEN_EXP_FASTOPEN_BASE,
+ ptr + 2, th->syn, foc, true);
break;
}
@@ -4190,7 +4255,7 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4358,7 +4423,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
__skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
- tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
@@ -4445,13 +4510,15 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
- if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
-
+ if (eaten < 0) {
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4640,7 +4707,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
struct sk_buff *head;
u32 start, end;
- if (skb == NULL)
+ if (!skb)
return;
start = TCP_SKB_CB(skb)->seq;
@@ -4719,7 +4786,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (sk_under_memory_pressure(sk))
+ else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4763,7 +4830,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
@@ -4799,6 +4866,8 @@ static void tcp_check_space(struct sock *sk)
{
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+ /* pairs with tcp_poll() */
+ smp_mb__after_atomic();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_new_space(sk);
@@ -5095,7 +5164,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (unlikely(sk->sk_rx_dst == NULL))
+ if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
@@ -5197,7 +5266,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
@@ -5292,7 +5361,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_ESTABLISHED);
- if (skb != NULL) {
+ if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}
@@ -5330,8 +5399,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
- u16 mss = tp->rx_opt.mss_clamp;
- bool syn_drop;
+ u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+ bool syn_drop = false;
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
@@ -5343,16 +5412,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
mss = opt.mss_clamp;
}
- if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
+ if (!tp->syn_fastopen) {
+ /* Ignore an unsolicited cookie */
cookie->len = -1;
+ } else if (tp->total_retrans) {
+ /* SYN timed out and the SYN-ACK neither has a cookie nor
+ * acknowledges data. Presumably the remote received only
+ * the retransmitted (regular) SYNs: either the original
+ * SYN-data or the corresponding SYN-ACK was dropped.
+ */
+ syn_drop = (cookie->len < 0 && data);
+ } else if (cookie->len < 0 && !tp->syn_data) {
+ /* We requested a cookie but didn't get it. If we did not use
+ * the (old) exp opt format then try so next time (try_exp=1).
+ * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+ */
+ try_exp = tp->syn_fastopen_exp ? 2 : 1;
+ }
- /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
- * the remote receives only the retransmitted (regular) SYNs: either
- * the original SYN-data or the corresponding SYN-ACK is lost.
- */
- syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
-
- tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
tcp_for_write_queue_from(data, sk) {
@@ -5661,11 +5739,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
req = tp->fastopen_rsk;
- if (req != NULL) {
+ if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+ if (!tcp_check_req(sk, skb, req, true))
goto discard;
}
@@ -5751,7 +5829,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* ACK we have received, this would have acknowledged
* our SYNACK so stop the SYNACK timer.
*/
- if (req != NULL) {
+ if (req) {
/* Return RST if ack_seq is invalid.
* Note that RFC793 only says to generate a
* DUPACK for it but for TCP Fast Open it seems
@@ -5913,6 +5991,97 @@ static void tcp_ecn_create_request(struct request_sock *req,
inet_rsk(req)->ecn_ok = 1;
}
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+/*
+ * Return true if a syncookie should be sent
+ */
+static bool tcp_syn_flood_action(struct sock *sk,
+ const struct sk_buff *skb,
+ const char *proto)
+{
+ const char *msg = "Dropping request";
+ bool want_cookie = false;
+ struct listen_sock *lopt;
+
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ msg = "Sending cookies";
+ want_cookie = true;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
+#endif
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+ lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
+ lopt->synflood_warned = 1;
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
+ }
+ return want_cookie;
+}
+
+static void tcp_reqsk_record_syn(const struct sock *sk,
+ struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ if (tcp_sk(sk)->save_syn) {
+ u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+ u32 *copy;
+
+ copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+ if (copy) {
+ copy[0] = len;
+ memcpy(&copy[1], skb_network_header(skb), len);
+ req->saved_syn = copy;
+ }
+ }
+}
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -5950,7 +6119,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops);
+ req = inet_reqsk_alloc(rsk_ops, sk);
if (!req)
goto drop;
@@ -5967,6 +6136,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
af_ops->init_req(req, sk, skb);
if (security_inet_conn_request(sk, skb, req))
@@ -6039,9 +6211,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (err || want_cookie)
goto drop_and_free;
- tcp_rsk(req)->listener = NULL;
+ tcp_rsk(req)->tfo_listener = false;
af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
+ tcp_reqsk_record_syn(sk, req, skb);
return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f1756ee02207..d7d4c2b79cf2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -122,7 +122,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (twp == NULL || (sysctl_tcp_tw_reuse &&
+ (!twp || (sysctl_tcp_tw_reuse &&
get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
@@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr;
- inet->inet_rcv_saddr = inet->inet_saddr;
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
@@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_fetch_timewait_stamp(sk, &rt->dst);
inet->inet_dport = usin->sin_port;
- inet->inet_daddr = daddr;
+ sk_daddr_set(sk, daddr);
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
@@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
dst->ops->redirect(dst, sk, skb);
}
+
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq)
+{
+ struct request_sock *req = inet_reqsk(sk);
+ struct net *net = sock_net(sk);
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ WARN_ON(req->sk);
+
+ if (seq != tcp_rsk(req)->snt_isn) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ reqsk_put(req);
+ } else {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ }
+}
+EXPORT_SYMBOL(tcp_req_err);
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
int err;
struct net *net = dev_net(icmp_skb->dev);
- sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
- iph->saddr, th->source, inet_iif(icmp_skb));
+ sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
+ th->dest, iph->saddr, ntohs(th->source),
+ inet_iif(icmp_skb));
if (!sk) {
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return;
@@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
inet_twsk_put(inet_twsk(sk));
return;
}
+ seq = ntohl(th->seq);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_err(sk, seq);
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
- seq = ntohl(th->seq);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = tp->fastopen_rsk;
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -458,42 +489,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
switch (sk->sk_state) {
- struct request_sock *req, **prev;
- case TCP_LISTEN:
- if (sock_owned_by_user(sk))
- goto out;
-
- req = inet_csk_search_req(sk, &prev, th->dest,
- iph->daddr, iph->saddr);
- if (!req)
- goto out;
-
- /* ICMPs are not backlogged, hence we cannot get
- an established socket here.
- */
- WARN_ON(req->sk);
-
- if (seq != tcp_rsk(req)->snt_isn) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- /*
- * Still in SYN_RECV, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- inet_csk_reqsk_queue_drop(sk, req, prev);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- goto out;
-
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
* is already accepted it is treated as a connected one below.
*/
- if (fastopen && fastopen->sk == NULL)
+ if (fastopen && !fastopen->sk)
break;
if (!sock_owned_by_user(sk)) {
@@ -647,7 +648,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
if (!key)
goto release_sk1;
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
+ genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto release_sk1;
} else {
@@ -855,35 +856,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-/*
- * Return true if a syncookie should be sent
- */
-bool tcp_syn_flood_action(struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
-{
- const char *msg = "Dropping request";
- bool want_cookie = false;
- struct listen_sock *lopt;
-
-#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
- msg = "Sending cookies";
- want_cookie = true;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
- } else
-#endif
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
-
- lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
- lopt->synflood_warned = 1;
- pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
- }
- return want_cookie;
-}
-EXPORT_SYMBOL(tcp_syn_flood_action);
#ifdef CONFIG_TCP_MD5SIG
/*
@@ -897,10 +869,10 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
const union tcp_md5_addr *addr,
int family)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
unsigned int size = sizeof(struct in_addr);
- struct tcp_md5sig_info *md5sig;
+ const struct tcp_md5sig_info *md5sig;
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
@@ -923,24 +895,15 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
- struct sock *addr_sk)
+ const struct sock *addr_sk)
{
- union tcp_md5_addr *addr;
+ const union tcp_md5_addr *addr;
- addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
+ addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
-static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
- struct request_sock *req)
-{
- union tcp_md5_addr *addr;
-
- addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
- return tcp_md5_do_lookup(sk, addr, AF_INET);
-}
-
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
@@ -1101,8 +1064,8 @@ clear_hash_noput:
return 1;
}
-int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
- const struct sock *sk, const struct request_sock *req,
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk,
const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
@@ -1110,12 +1073,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
- if (sk) {
- saddr = inet_sk(sk)->inet_saddr;
- daddr = inet_sk(sk)->inet_daddr;
- } else if (req) {
- saddr = inet_rsk(req)->ir_loc_addr;
- daddr = inet_rsk(req)->ir_rmt_addr;
+ if (sk) { /* valid for establish/request sockets */
+ saddr = sk->sk_rcv_saddr;
+ daddr = sk->sk_daddr;
} else {
const struct iphdr *iph = ip_hdr(skb);
saddr = iph->saddr;
@@ -1152,8 +1112,9 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
- const struct sk_buff *skb)
+/* Called with rcu_read_lock() */
+static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+ const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
@@ -1193,7 +1154,7 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
*/
genhash = tcp_v4_md5_hash_skb(newhash,
hash_expected,
- NULL, NULL, skb);
+ NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
@@ -1205,28 +1166,16 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
}
return false;
}
-
-static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
-{
- bool ret;
-
- rcu_read_lock();
- ret = __tcp_v4_inbound_md5_hash(sk, skb);
- rcu_read_unlock();
-
- return ret;
-}
-
#endif
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
- ireq->ir_loc_addr = ip_hdr(skb)->daddr;
- ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
- ireq->no_srccheck = inet_sk(sk)->transparent;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->no_srccheck = inet_sk(sk_listener)->transparent;
ireq->opt = tcp_v4_save_options(skb);
}
@@ -1259,7 +1208,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
- .md5_lookup = tcp_v4_reqsk_md5_lookup,
+ .req_md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
.init_req = tcp_v4_init_req,
@@ -1318,8 +1267,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->ir_rmt_addr;
- newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ sk_daddr_set(newsk, ireq->ir_rmt_addr);
+ sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
@@ -1356,7 +1305,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Copy over the MD5 key from the original socket */
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
AF_INET);
- if (key != NULL) {
+ if (key) {
/*
* We're using one, so create a matching key
* on the newsk structure. If we fail to get
@@ -1391,15 +1340,18 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
+ struct request_sock *req;
struct sock *nsk;
- struct request_sock **prev;
- /* Find possible connection requests. */
- struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
- iph->saddr, iph->daddr);
- if (req)
- return tcp_check_req(sk, skb, req, prev, false);
+
+ req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
+ if (req) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1439,7 +1391,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
- dst->ops->check(dst, 0) == NULL) {
+ !dst->ops->check(dst, 0)) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
@@ -1448,7 +1400,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
- if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
@@ -1517,7 +1469,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
- if (sk->sk_state != TCP_TIME_WAIT) {
+ if (sk_fullsock(sk)) {
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
@@ -1674,6 +1626,7 @@ process:
skb->dev = NULL;
bh_lock_sock_nested(sk);
+ tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1694,7 +1647,7 @@ no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+ if (tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
@@ -1718,10 +1671,6 @@ do_time_wait:
goto discard_it;
}
- if (skb->len < (th->doff << 2)) {
- inet_twsk_put(inet_twsk(sk));
- goto bad_packet;
- }
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
@@ -1734,7 +1683,7 @@ do_time_wait:
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_deschedule(inet_twsk(sk));
inet_twsk_put(inet_twsk(sk));
sk = sk2;
goto process;
@@ -1846,10 +1795,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- BUG_ON(tp->fastopen_rsk != NULL);
+ BUG_ON(tp->fastopen_rsk);
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
+ tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
sock_release_memcg(sk);
@@ -1904,13 +1854,13 @@ get_req:
}
sk = sk_nulls_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
sk = sk_nulls_next(sk);
}
get_sk:
@@ -1922,7 +1872,7 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
st->uid = sock_i_uid(sk);
@@ -1931,7 +1881,7 @@ start_req:
st->sbucket = 0;
goto get_req;
}
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
@@ -2150,7 +2100,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
case TCP_SEQ_STATE_OPENREQ:
if (v) {
struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
@@ -2204,17 +2154,17 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
}
EXPORT_SYMBOL(tcp_proc_unregister);
-static void get_openreq4(const struct sock *sk, const struct request_sock *req,
+static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i, kuid_t uid)
{
const struct inet_request_sock *ireq = inet_rsk(req);
- long delta = req->expires - jiffies;
+ long delta = req->rsk_timer.expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
ireq->ir_loc_addr,
- ntohs(inet_sk(sk)->inet_sport),
+ ireq->ir_num,
ireq->ir_rmt_addr,
ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
@@ -2225,7 +2175,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
from_kuid_munged(seq_user_ns(f), uid),
0, /* non standard timer */
0, /* open_requests have no inode */
- atomic_read(&sk->sk_refcnt),
+ 0,
req);
}
@@ -2291,9 +2241,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
struct seq_file *f, int i)
{
+ long delta = tw->tw_timer.expires - jiffies;
__be32 dest, src;
__u16 destp, srcp;
- s32 delta = tw->tw_ttd - inet_tw_time_stamp();
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
@@ -2332,7 +2282,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
get_tcp4_sock(v, seq, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
+ get_openreq4(v, seq, st->num, st->uid);
break;
}
out:
@@ -2458,10 +2408,15 @@ static int __net_init tcp_sk_init(struct net *net)
goto fail;
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
+
net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
- return 0;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ return 0;
fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index e5f41bd5ec1b..a51d63a43e33 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -28,7 +28,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
struct tcp_fastopen_metrics {
u16 mss;
- u16 syn_loss:10; /* Recurring Fast Open SYN losses */
+ u16 syn_loss:10, /* Recurring Fast Open SYN losses */
+ try_exp:2; /* Request w/ exp. option (once) */
unsigned long last_syn_loss; /* Last Fast Open SYN loss */
struct tcp_fastopen_cookie cookie;
};
@@ -40,6 +41,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
+ possible_net_t tcpm_net;
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
@@ -52,6 +54,11 @@ struct tcp_metrics_block {
struct rcu_head rcu_head;
};
+static inline struct net *tm_net(struct tcp_metrics_block *tm)
+{
+ return read_pnet(&tm->tcpm_net);
+}
+
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
@@ -74,23 +81,20 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- const struct in6_addr *a6, *b6;
-
if (a->family != b->family)
return false;
if (a->family == AF_INET)
return a->addr.a4 == b->addr.a4;
-
- a6 = (const struct in6_addr *) &a->addr.a6[0];
- b6 = (const struct in6_addr *) &b->addr.a6[0];
-
- return ipv6_addr_equal(a6, b6);
+ return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
}
struct tcpm_hash_bucket {
struct tcp_metrics_block __rcu *chain;
};
+static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
+static unsigned int tcp_metrics_hash_log __read_mostly;
+
static DEFINE_SPINLOCK(tcp_metrics_lock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
@@ -128,6 +132,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
if (fastopen_clear) {
tm->tcpm_fastopen.mss = 0;
tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.try_exp = 0;
+ tm->tcpm_fastopen.cookie.exp = false;
tm->tcpm_fastopen.cookie.len = 0;
}
}
@@ -143,6 +149,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst
#define TCP_METRICS_RECLAIM_DEPTH 5
#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+#define deref_locked(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
struct inetpeer_addr *saddr,
struct inetpeer_addr *daddr,
@@ -171,9 +180,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (unlikely(reclaim)) {
struct tcp_metrics_block *oldest;
- oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
- for (tm = rcu_dereference(oldest->tcpm_next); tm;
- tm = rcu_dereference(tm->tcpm_next)) {
+ oldest = deref_locked(tcp_metrics_hash[hash].chain);
+ for (tm = deref_locked(oldest->tcpm_next); tm;
+ tm = deref_locked(tm->tcpm_next)) {
if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
oldest = tm;
}
@@ -183,14 +192,15 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (!tm)
goto out_unlock;
}
+ write_pnet(&tm->tcpm_net, net);
tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr;
tcpm_suck_dst(tm, dst, true);
if (likely(!reclaim)) {
- tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
- rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+ tm->tcpm_next = tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
}
out_unlock:
@@ -214,10 +224,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
struct tcp_metrics_block *tm;
int depth = 0;
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, saddr) &&
- addr_same(&tm->tcpm_daddr, daddr))
+ addr_same(&tm->tcpm_daddr, daddr) &&
+ net_eq(tm_net(tm), net))
break;
depth++;
}
@@ -242,8 +253,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
- *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
+ saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+ daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
break;
#endif
@@ -252,12 +263,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
}
net = dev_net(dst->dev);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) &&
- addr_same(&tm->tcpm_daddr, &daddr))
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
break;
}
tcpm_check_stamp(tm, dst);
@@ -288,9 +301,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
hash = (__force unsigned int) daddr.addr.a4;
} else {
saddr.family = AF_INET6;
- *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
+ saddr.addr.in6 = tw->tw_v6_rcv_saddr;
daddr.family = AF_INET6;
- *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
+ daddr.addr.in6 = tw->tw_v6_daddr;
hash = ipv6_addr_hash(&tw->tw_v6_daddr);
}
}
@@ -299,12 +312,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
return NULL;
net = twsk_net(tw);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) &&
- addr_same(&tm->tcpm_daddr, &daddr))
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
break;
}
return tm;
@@ -336,9 +351,9 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
hash = (__force unsigned int) daddr.addr.a4;
} else {
saddr.family = AF_INET6;
- *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
+ saddr.addr.in6 = sk->sk_v6_rcv_saddr;
daddr.family = AF_INET6;
- *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
+ daddr.addr.in6 = sk->sk_v6_daddr;
hash = ipv6_addr_hash(&sk->sk_v6_daddr);
}
}
@@ -347,7 +362,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
return NULL;
net = dev_net(dst->dev);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
if (tm == TCP_METRICS_RECLAIM_PTR)
@@ -492,7 +508,7 @@ void tcp_init_metrics(struct sock *sk)
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
- if (dst == NULL)
+ if (!dst)
goto reset;
dst_confirm(dst);
@@ -700,6 +716,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
if (tfom->mss)
*mss = tfom->mss;
*cookie = tfom->cookie;
+ if (cookie->len <= 0 && tfom->try_exp == 1)
+ cookie->exp = true;
*syn_loss = tfom->syn_loss;
*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
} while (read_seqretry(&fastopen_seqlock, seq));
@@ -708,7 +726,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
}
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
- struct tcp_fastopen_cookie *cookie, bool syn_lost)
+ struct tcp_fastopen_cookie *cookie, bool syn_lost,
+ u16 try_exp)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_metrics_block *tm;
@@ -725,6 +744,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
tfom->mss = mss;
if (cookie && cookie->len > 0)
tfom->cookie = *cookie;
+ else if (try_exp > tfom->try_exp &&
+ tfom->cookie.len <= 0 && !tfom->cookie.exp)
+ tfom->try_exp = try_exp;
if (syn_lost) {
++tfom->syn_loss;
tfom->last_syn_loss = jiffies;
@@ -773,19 +795,19 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
switch (tm->tcpm_daddr.family) {
case AF_INET:
- if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
- tm->tcpm_daddr.addr.a4) < 0)
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+ tm->tcpm_daddr.addr.a4) < 0)
goto nla_put_failure;
- if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
- tm->tcpm_saddr.addr.a4) < 0)
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ tm->tcpm_saddr.addr.a4) < 0)
goto nla_put_failure;
break;
case AF_INET6:
- if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
- tm->tcpm_daddr.addr.a6) < 0)
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+ &tm->tcpm_daddr.addr.in6) < 0)
goto nla_put_failure;
- if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
- tm->tcpm_saddr.addr.a6) < 0)
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+ &tm->tcpm_saddr.addr.in6) < 0)
goto nla_put_failure;
break;
default:
@@ -898,17 +920,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
unsigned int row, s_row = cb->args[0];
int s_col = cb->args[1], col = s_col;
for (row = s_row; row < max_rows; row++, s_col = 0) {
struct tcp_metrics_block *tm;
- struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
rcu_read_lock();
for (col = 0, tm = rcu_dereference(hb->chain); tm;
tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (!net_eq(tm_net(tm), net))
+ continue;
if (col < s_col)
continue;
if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
@@ -933,7 +957,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
a = info->attrs[v4];
if (a) {
addr->family = AF_INET;
- addr->addr.a4 = nla_get_be32(a);
+ addr->addr.a4 = nla_get_in_addr(a);
if (hash)
*hash = (__force unsigned int) addr->addr.a4;
return 0;
@@ -943,9 +967,9 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
if (nla_len(a) != sizeof(struct in6_addr))
return -EINVAL;
addr->family = AF_INET6;
- memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
+ addr->addr.in6 = nla_get_in6_addr(a);
if (hash)
- *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
+ *hash = ipv6_addr_hash(&addr->addr.in6);
return 0;
}
return optional ? 1 : -EAFNOSUPPORT;
@@ -994,13 +1018,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (!reply)
goto nla_put_failure;
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
ret = -ESRCH;
rcu_read_lock();
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_daddr, &daddr) &&
- (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
ret = tcp_metrics_fill_info(msg, tm);
break;
}
@@ -1020,34 +1046,27 @@ out_free:
return ret;
}
-#define deref_locked_genl(p) \
- rcu_dereference_protected(p, lockdep_genl_is_held() && \
- lockdep_is_held(&tcp_metrics_lock))
-
-#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
-
-static int tcp_metrics_flush_all(struct net *net)
+static void tcp_metrics_flush_all(struct net *net)
{
- unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
- struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash;
struct tcp_metrics_block *tm;
unsigned int row;
for (row = 0; row < max_rows; row++, hb++) {
+ struct tcp_metrics_block __rcu **pp;
spin_lock_bh(&tcp_metrics_lock);
- tm = deref_locked_genl(hb->chain);
- if (tm)
- hb->chain = NULL;
- spin_unlock_bh(&tcp_metrics_lock);
- while (tm) {
- struct tcp_metrics_block *next;
-
- next = deref_genl(tm->tcpm_next);
- kfree_rcu(tm, rcu_head);
- tm = next;
+ pp = &hb->chain;
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ } else {
+ pp = &tm->tcpm_next;
+ }
}
+ spin_unlock_bh(&tcp_metrics_lock);
}
- return 0;
}
static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
@@ -1064,19 +1083,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
ret = parse_nl_addr(info, &daddr, &hash, 1);
if (ret < 0)
return ret;
- if (ret > 0)
- return tcp_metrics_flush_all(net);
+ if (ret > 0) {
+ tcp_metrics_flush_all(net);
+ return 0;
+ }
ret = parse_nl_saddr(info, &saddr);
if (ret < 0)
src = false;
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
- hb = net->ipv4.tcp_metrics_hash + hash;
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ hb = tcp_metrics_hash + hash;
pp = &hb->chain;
spin_lock_bh(&tcp_metrics_lock);
- for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
if (addr_same(&tm->tcpm_daddr, &daddr) &&
- (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
*pp = tm->tcpm_next;
kfree_rcu(tm, rcu_head);
found = true;
@@ -1126,6 +1149,9 @@ static int __net_init tcp_net_metrics_init(struct net *net)
size_t size;
unsigned int slots;
+ if (!net_eq(net, &init_net))
+ return 0;
+
slots = tcpmhash_entries;
if (!slots) {
if (totalram_pages >= 128 * 1024)
@@ -1134,14 +1160,14 @@ static int __net_init tcp_net_metrics_init(struct net *net)
slots = 8 * 1024;
}
- net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
- size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
+ tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
- net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!net->ipv4.tcp_metrics_hash)
- net->ipv4.tcp_metrics_hash = vzalloc(size);
+ tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!tcp_metrics_hash)
+ tcp_metrics_hash = vzalloc(size);
- if (!net->ipv4.tcp_metrics_hash)
+ if (!tcp_metrics_hash)
return -ENOMEM;
return 0;
@@ -1149,19 +1175,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
static void __net_exit tcp_net_metrics_exit(struct net *net)
{
- unsigned int i;
-
- for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
- struct tcp_metrics_block *tm, *next;
-
- tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
- while (tm) {
- next = rcu_dereference_protected(tm->tcpm_next, 1);
- kfree(tm);
- tm = next;
- }
- }
- kvfree(net->ipv4.tcp_metrics_hash);
+ tcp_metrics_flush_all(net);
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
@@ -1175,16 +1189,10 @@ void __init tcp_metrics_init(void)
ret = register_pernet_subsys(&tcp_net_metrics_ops);
if (ret < 0)
- goto cleanup;
+ panic("Could not allocate the tcp_metrics hash table\n");
+
ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
tcp_metrics_nl_ops);
if (ret < 0)
- goto cleanup_subsys;
- return;
-
-cleanup_subsys:
- unregister_pernet_subsys(&tcp_net_metrics_ops);
-
-cleanup:
- return;
+ panic("Could not register tcp_metrics generic netlink\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dd11ac7798c6..4bc00cb79e60 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly;
struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
- .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
- .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
.hashinfo = &tcp_hashinfo,
- .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
- (unsigned long)&tcp_death_row),
- .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
- inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
- .twcal_hand = -1,
- .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
- (unsigned long)&tcp_death_row),
};
EXPORT_SYMBOL_GPL(tcp_death_row);
@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_deschedule(tw);
inet_twsk_put(tw);
return TCP_TW_RST;
}
@@ -174,11 +163,9 @@ kill_with_rst:
if (tcp_death_row.sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
- inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, tw->tw_timeout);
else
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -211,13 +198,12 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_deschedule(tw);
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
}
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
@@ -267,8 +253,7 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -283,18 +268,17 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
*/
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw;
bool recycle_ok = false;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
- if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
- tw = inet_twsk_alloc(sk, state);
+ tw = inet_twsk_alloc(sk, &tcp_death_row, state);
- if (tw != NULL) {
+ if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
struct inet_sock *inet = inet_sk(sk);
@@ -316,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_daddr = sk->sk_v6_daddr;
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
- tw->tw_flowlabel = np->flow_label >> 12;
+ tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
@@ -332,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct tcp_md5sig_key *key;
tcptw->tw_md5_key = NULL;
key = tp->af_specific->md5_lookup(sk, sk);
- if (key != NULL) {
+ if (key) {
tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
BUG();
@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = TCP_TIMEWAIT_LEN;
}
- inet_twsk_schedule(tw, &tcp_death_row, timeo,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, timeo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
@@ -437,7 +420,10 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
rcu_read_unlock();
}
- if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+ /* If no valid choice made yet, assign current system default ca. */
+ if (!ca_got_dst &&
+ (!icsk->icsk_ca_setsockopt ||
+ !try_module_get(icsk->icsk_ca_ops->owner)))
tcp_assign_congestion_control(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -454,7 +440,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
- if (newsk != NULL) {
+ if (newsk) {
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -465,6 +451,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->segs_in = 0;
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -553,6 +540,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->saved_syn = req->saved_syn;
+ req->saved_syn = NULL;
+
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
@@ -572,7 +562,6 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct request_sock **prev,
bool fastopen)
{
struct tcp_options_received tmp_opt;
@@ -629,9 +618,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time) &&
- !inet_rtx_syn_ack(sk, req))
- req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX) + jiffies;
+ !inet_rtx_syn_ack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer, expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
return NULL;
}
@@ -763,13 +759,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* socket is created, wait for troubles.
*/
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL)
+ if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_unlink(sk, req, prev);
- inet_csk_reqsk_queue_removed(sk, req);
-
+ inet_csk_reqsk_queue_drop(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
return child;
listen_overflow:
@@ -791,7 +788,7 @@ embryonic_reset:
tcp_reset(sk);
}
if (!fastopen) {
- inet_csk_reqsk_queue_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
}
return NULL;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9d7930ba8e0f..9864a2dbadce 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -29,8 +29,8 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
}
}
-struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
@@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
- mss = tcp_skb_mss(skb);
+ mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -242,7 +242,7 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
- mss = tcp_skb_mss(p);
+ mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1db253e36045..b1c218df2c85 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-/* Default TSQ limit of two TSO segments */
-int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+/* Default TSQ limit of four TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
@@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
}
}
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ /* tp->ecn_flags are cleared at a later point in time when
+ * SYN ACK is ultimatively being received.
+ */
+ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
struct sock *sk)
@@ -393,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
@@ -402,8 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -518,17 +523,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
- *ptr++ = htonl((TCPOPT_EXP << 24) |
- ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
- TCPOPT_FASTOPEN_MAGIC);
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
- memcpy(ptr, foc->val, foc->len);
- if ((foc->len & 3) == 2) {
- u8 *align = ((u8 *)ptr) + foc->len;
- align[0] = align[1] = TCPOPT_NOP;
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
}
- ptr += (foc->len + 3) >> 2;
+ ptr += (len + 3) >> 2;
}
}
@@ -565,7 +579,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+ if (likely(sysctl_tcp_timestamps && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
@@ -583,13 +597,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
if (fastopen && fastopen->cookie.len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
opts->fastopen_cookie = &fastopen->cookie;
remaining -= need;
tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
}
}
@@ -601,15 +619,14 @@ static unsigned int tcp_synack_options(struct sock *sk,
struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5,
+ const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
- if (*md5) {
+ if (md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
@@ -620,8 +637,6 @@ static unsigned int tcp_synack_options(struct sock *sk,
*/
ireq->tstamp_ok &= !ireq->sack_ok;
}
-#else
- *md5 = NULL;
#endif
/* We always send an MSS option. */
@@ -645,7 +660,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (foc != NULL && foc->len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
@@ -981,6 +999,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
tcp_ecn_send(sk, skb, tcp_header_size);
@@ -989,7 +1008,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
- md5, sk, NULL, skb);
+ md5, sk, skb);
}
#endif
@@ -1005,8 +1024,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
- /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ tp->segs_out += tcp_skb_pcount(skb);
+ /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+ skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0;
@@ -1043,25 +1064,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
- /* Make sure we own this skb before messing gso_size/gso_segs */
- WARN_ON_ONCE(skb_cloned(skb));
-
if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else {
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- shinfo->gso_size = mss_now;
- shinfo->gso_type = sk->sk_gso_type;
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
}
}
@@ -1150,8 +1163,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, gfp);
- if (buff == NULL)
+ buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
+ if (!buff)
return -ENOMEM; /* We'll just try again later. */
sk->sk_wmem_queued += buff->truesize;
@@ -1193,8 +1206,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* If this packet has been sent out already, we must
* adjust the various packet counters.
@@ -1274,7 +1287,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
return 0;
}
@@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1604,13 +1619,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
@@ -1665,7 +1679,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
@@ -1707,8 +1721,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now, gfp);
- buff = sk_stream_alloc_skb(sk, 0, gfp);
- if (unlikely(buff == NULL))
+ buff = sk_stream_alloc_skb(sk, 0, gfp, true);
+ if (unlikely(!buff))
return -ENOMEM;
sk->sk_wmem_queued += buff->truesize;
@@ -1734,8 +1748,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
__skb_header_release(buff);
@@ -1752,20 +1766,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 send_win, cong_win, limit, in_flight;
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
int win_divisor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
goto send_now;
- /* Defer for less than two clock ticks. */
- if (tp->tso_deferred &&
- (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1824,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
}
- /* Ok, it looks like it is advisable to defer.
- * Do not rearm the timer if already set to not break TCP ACK clocking.
- */
- if (!tp->tso_deferred)
- tp->tso_deferred = 1 | (jiffies << 1);
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
if (cong_win < send_win && cong_win < skb->len)
*is_cwnd_limited = true;
@@ -1819,10 +1839,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
return true;
send_now:
- tp->tso_deferred = 0;
return false;
}
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -1837,11 +1881,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
@@ -1854,12 +1900,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
@@ -1881,7 +1940,8 @@ static int tcp_mtu_probe(struct sock *sk)
}
/* We're allowed to probe. Build it now. */
- if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
+ if (!nskb)
return -1;
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
@@ -1923,7 +1983,7 @@ static int tcp_mtu_probe(struct sock *sk)
skb->len, 0);
} else {
__pskb_trim_head(skb, copy);
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
}
TCP_SKB_CB(skb)->seq += copy;
}
@@ -1933,7 +1993,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (len >= probe_size)
break;
}
- tcp_init_tso_segs(sk, nskb, nskb->len);
+ tcp_init_tso_segs(nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
* be resegmented into mss-sized pieces by tcp_write_xmit().
@@ -1995,7 +2055,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
- tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
@@ -2017,7 +2077,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
- if (tso_segs == 1 || !max_segs) {
+ if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
@@ -2030,7 +2090,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
limit = mss_now;
- if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
@@ -2179,7 +2239,7 @@ void tcp_send_loss_probe(struct sock *sk)
int mss = tcp_current_mss(sk);
int err = -1;
- if (tcp_send_head(sk) != NULL) {
+ if (tcp_send_head(sk)) {
err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
goto rearm_timer;
}
@@ -2331,7 +2391,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
@@ -2549,11 +2609,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (unlikely(oldpcount > 1)) {
if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
}
+ /* RFC3168, section 6.1.1.1. ECN fallback */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+
tcp_retrans_try_collapse(sk, skb, cur_mss);
/* Make a copy, if the first transmission SKB clone we made
@@ -2689,7 +2753,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
/* we could do better than to assign each time */
- if (hole == NULL)
+ if (!hole)
tp->retransmit_skb_hint = skb;
/* Assume this retransmit will generate
@@ -2713,7 +2777,7 @@ begin_fwd:
if (!tcp_can_forward_retransmit(sk))
break;
/* Backtrack if necessary to non-L'ed skb */
- if (hole != NULL) {
+ if (hole) {
skb = hole;
hole = NULL;
}
@@ -2721,7 +2785,7 @@ begin_fwd:
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
@@ -2751,39 +2815,67 @@ begin_fwd:
}
}
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int mss_now;
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk);
-
- if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
} else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
- if (skb)
- break;
- yield();
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
}
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
@@ -2824,14 +2916,14 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
- if (nskb == NULL)
+ if (!nskb)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
__skb_header_release(nskb);
@@ -2866,7 +2958,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
struct sk_buff *skb;
- struct tcp_md5sig_key *md5;
+ struct tcp_md5sig_key *md5 = NULL;
int tcp_header_size;
int mss;
@@ -2879,7 +2971,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb_reserve(skb, MAX_TCP_HEADER);
skb_dst_set(skb, dst);
- security_skb_owned_by(skb, sk);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2892,7 +2983,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
else
#endif
skb_mstamp_get(&skb->skb_mstamp);
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+
+#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
+ md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
@@ -2923,12 +3019,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
- if (md5) {
+ if (md5)
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
- md5, NULL, req, skb);
- }
+ md5, req_to_sk(req), skb);
+ rcu_read_unlock();
#endif
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -2966,7 +3064,7 @@ static void tcp_connect_init(struct sock *sk)
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+ if (tp->af_specific->md5_lookup(sk, sk))
tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
@@ -3082,7 +3180,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
@@ -3148,7 +3246,7 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
@@ -3252,7 +3350,7 @@ void tcp_send_ack(struct sock *sk)
* sock.
*/
buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (buff == NULL) {
+ if (!buff) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3289,14 +3387,14 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (skb == NULL)
+ if (!skb)
return -1;
/* Reserve space for headers and set control bits. */
@@ -3307,6 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
+ NET_INC_STATS_BH(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -3314,12 +3413,12 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}
/* Initiate keepalive or window probe from timer. */
-int tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -3327,8 +3426,8 @@ int tcp_write_wakeup(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return -1;
- if ((skb = tcp_send_head(sk)) != NULL &&
- before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
@@ -3347,7 +3446,7 @@ int tcp_write_wakeup(struct sock *sk)
if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb, mss);
+ tcp_set_skb_tso_segs(skb, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
@@ -3356,8 +3455,8 @@ int tcp_write_wakeup(struct sock *sk)
return err;
} else {
if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
- tcp_xmit_probe_skb(sk, 1);
- return tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 1, mib);
+ return tcp_xmit_probe_skb(sk, 0, mib);
}
}
@@ -3371,7 +3470,7 @@ void tcp_send_probe0(struct sock *sk)
unsigned long probe_max;
int err;
- err = tcp_write_wakeup(sk);
+ err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || !tcp_send_head(sk)) {
/* Cancel probe timer, if it is not required. */
@@ -3397,7 +3496,7 @@ void tcp_send_probe0(struct sock *sk)
probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- inet_csk_rto_backoff(icsk, probe_max),
+ tcp_probe0_when(sk, probe_max),
TCP_RTO_MAX);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b787904e..5b752f58a900 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct net *net = sock_net(sk);
@@ -166,7 +167,7 @@ static int tcp_write_timeout(struct sock *sk)
if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
if (tp->syn_fastopen || tp->syn_data)
- tcp_fastopen_cache_set(sk, 0, NULL, true);
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (tp->syn_data)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
@@ -246,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk)
}
out:
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk);
}
@@ -326,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
struct request_sock *req;
req = tcp_sk(sk)->fastopen_rsk;
- req->rsk_ops->syn_ack_timeout(sk, req);
+ req->rsk_ops->syn_ack_timeout(req);
if (req->num_timeout >= max_retries) {
tcp_write_err(sk);
@@ -538,19 +539,11 @@ static void tcp_write_timer(unsigned long data)
sock_put(sk);
}
-/*
- * Timer for listening sockets
- */
-
-static void tcp_synack_timer(struct sock *sk)
+void tcp_syn_ack_timeout(const struct request_sock *req)
{
- inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
-}
+ struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
-void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
-{
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
}
EXPORT_SYMBOL(tcp_syn_ack_timeout);
@@ -582,7 +575,7 @@ static void tcp_keepalive_timer (unsigned long data)
}
if (sk->sk_state == TCP_LISTEN) {
- tcp_synack_timer(sk);
+ pr_err("Hmm... keepalive on a LISTEN ???\n");
goto out;
}
@@ -623,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data)
tcp_write_err(sk);
goto out;
}
- if (tcp_write_wakeup(sk) <= 0) {
+ if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6afde666ab1..a6cea1d5e20d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,19 +286,21 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
/* Extract info for Tcp socket info provided via netlink. */
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct vegas *ca = inet_csk_ca(sk);
+
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = ca->doing_vegas_now,
- .tcpv_rttcnt = ca->cntRTT,
- .tcpv_rtt = ca->baseRTT,
- .tcpv_minrtt = ca->minRTT,
- };
+ info->vegas.tcpv_enabled = ca->doing_vegas_now,
+ info->vegas.tcpv_rttcnt = ca->cntRTT,
+ info->vegas.tcpv_rtt = ca->baseRTT,
+ info->vegas.tcpv_minrtt = ca->minRTT,
- nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
+ return 0;
}
EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 0531b99d8637..ef9da5306c68 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk);
void tcp_vegas_state(struct sock *sk, u8 ca_state);
void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bb63fba47d47..c10732e39837 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -256,20 +256,21 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
}
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct sock *sk, u32 ext,
- struct sk_buff *skb)
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct westwood *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = 1,
- .tcpv_rtt = jiffies_to_usecs(ca->rtt),
- .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
- };
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = 0;
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
- nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
+ return 0;
}
static struct tcp_congestion_ops tcp_westwood __read_mostly = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 97ef1f8b7be8..83aa604f9273 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -90,6 +90,7 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
@@ -318,8 +319,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
}
-static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
- unsigned int port)
+static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
+ unsigned int port)
{
return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
}
@@ -421,9 +422,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
return score;
}
-static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
static u32 udp_ehash_secret __read_mostly;
@@ -433,7 +434,6 @@ static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-
/* called with read_rcu_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
@@ -633,7 +633,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
iph->saddr, uh->source, skb->dev->ifindex, udptable);
- if (sk == NULL) {
+ if (!sk) {
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
}
@@ -873,8 +873,7 @@ out:
}
EXPORT_SYMBOL(udp_push_pending_frames);
-int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
@@ -1012,7 +1011,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (connected)
rt = (struct rtable *)sk_dst_check(sk, 0);
- if (rt == NULL) {
+ if (!rt) {
struct net *net = sock_net(sk);
fl4 = &fl4_stack;
@@ -1136,7 +1135,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
* sendpage interface can't pass.
* This will succeed only when the socket is connected.
*/
- ret = udp_sendmsg(NULL, sk, &msg, 0);
+ ret = udp_sendmsg(sk, &msg, 0);
if (ret < 0)
return ret;
}
@@ -1172,7 +1171,6 @@ out:
return ret;
}
-
/**
* first_packet_length - return length of first packet in receive queue
* @sk: socket
@@ -1254,8 +1252,8 @@ EXPORT_SYMBOL(udp_ioctl);
* return it, otherwise we block.
*/
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
@@ -1348,15 +1346,12 @@ csum_copy_err:
}
unlock_sock_fast(sk, slow);
- if (noblock)
- return -EAGAIN;
-
- /* starting over for a new packet */
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
-
int udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1523,7 +1518,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* if we're overly short, let UDP handle it */
encap_rcv = ACCESS_ONCE(up->encap_rcv);
- if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
+ if (skb->len > sizeof(struct udphdr) && encap_rcv) {
int ret;
/* Verify checksum before giving to encap */
@@ -1580,7 +1575,6 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
-
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
@@ -1610,7 +1604,6 @@ drop:
return -1;
}
-
static void flush_stack(struct sock **stack, unsigned int count,
struct sk_buff *skb, unsigned int final)
{
@@ -1620,7 +1613,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
for (i = 0; i < count; i++) {
sk = stack[i];
- if (likely(skb1 == NULL))
+ if (likely(!skb1))
skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
if (!skb1) {
@@ -1803,7 +1796,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
saddr, daddr, udptable, proto);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
- if (sk != NULL) {
+ if (sk) {
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
@@ -1968,6 +1961,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
struct sock *sk;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int ours;
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
@@ -1977,14 +1971,24 @@ void udp_v4_early_demux(struct sk_buff *skb)
uh = udp_hdr(skb);
if (skb->pkt_type == PACKET_BROADCAST ||
- skb->pkt_type == PACKET_MULTICAST)
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+ if (!in_dev)
+ return;
+
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
- else if (skb->pkt_type == PACKET_HOST)
+ } else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
- else
+ } else {
return;
+ }
if (!sk)
return;
@@ -2525,6 +2529,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
}
}
+u32 udp_flow_hashrnd(void)
+{
+ static u32 hashrnd __read_mostly;
+
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+ return hashrnd;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
void __init udp_init(void)
{
unsigned long limit;
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4a000f1dd757..6116604bf6e8 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -18,8 +18,9 @@
#include <linux/sock_diag.h>
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb, struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -31,7 +32,8 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
{
int err = -EINVAL;
struct sock *sk;
@@ -56,7 +58,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
goto out_nosk;
err = -ENOENT;
- if (sk == NULL)
+ if (!sk)
goto out_nosk;
err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
@@ -90,8 +92,9 @@ out_nosk:
return err;
}
-static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
@@ -144,13 +147,13 @@ done:
}
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
udp_dump(&udp_table, skb, cb, r, bc);
}
static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return udp_dump_one(&udp_table, in_skb, nlh, req);
}
@@ -167,16 +170,18 @@ static const struct inet_diag_handler udp_diag_handler = {
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
+ .idiag_info_size = 0,
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
{
udp_dump(&udplite_table, skb, cb, r, bc);
}
static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return udp_dump_one(&udplite_table, in_skb, nlh, req);
}
@@ -186,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = {
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
+ .idiag_info_size = 0,
};
static int __init udp_diag_init(void)
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index f3c27899f62b..7e0fe4bdd967 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -21,8 +21,8 @@ int compat_udp_setsockopt(struct sock *sk, int level, int optname,
int compat_udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
#endif
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4915d8284a86..f9386160cbee 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -285,7 +285,7 @@ void udp_del_offload(struct udp_offload *uo)
pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
unlock:
spin_unlock(&udp_offload_lock);
- if (uo_priv != NULL)
+ if (uo_priv)
call_rcu(&uo_priv->rcu, udp_offload_free_routine);
}
EXPORT_SYMBOL(udp_del_offload);
@@ -394,7 +394,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
break;
}
- if (uo_priv != NULL) {
+ if (uo_priv) {
NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
err = uo_priv->offload->callbacks.gro_complete(skb,
nhoff + sizeof(struct udphdr),
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index c83b35485056..933ea903f7b8 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket *sock = NULL;
struct sockaddr_in udp_addr;
- err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
- sk_change_net(sock->sk, net);
-
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
*sockp = NULL;
return err;
@@ -75,7 +73,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
-int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
+int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
bool xnet, bool nocheck)
@@ -92,7 +90,7 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
udp_set_csum(nocheck, skb, src, dst, skb->len);
- return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP,
+ return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
tos, ttl, df, xnet);
}
EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
@@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index aac6197b7a71..60b032f58ccc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -22,9 +22,9 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
-static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
+static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
{
- if (skb_dst(skb) == NULL) {
+ if (!skb_dst(skb)) {
const struct iphdr *iph = ip_hdr(skb);
if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
@@ -52,7 +52,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
xfrm4_rcv_encap_finish);
return 0;
}
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 91771a7c802f..35feda676464 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -63,7 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
- ip_select_ident(skb, NULL);
+ ip_select_ident(dev_net(dst->dev), skb, NULL);
return 0;
}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index dab73813cb92..2878dbfffeb7 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
}
EXPORT_SYMBOL(xfrm4_prepare_output);
-int xfrm4_output_finish(struct sk_buff *skb)
+int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
{
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
@@ -77,26 +77,26 @@ int xfrm4_output_finish(struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
#endif
- return xfrm_output(skb);
+ return xfrm_output(sk, skb);
}
-static int __xfrm4_output(struct sk_buff *skb)
+static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
{
struct xfrm_state *x = skb_dst(skb)->xfrm;
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
+ return dst_output_sk(sk, skb);
}
#endif
- return x->outer_mode->afinfo->output_finish(skb);
+ return x->outer_mode->afinfo->output_finish(sk, skb);
}
int xfrm4_output(struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
NULL, skb_dst(skb)->dev, __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6156f68a1e90..bff69746e05f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -232,7 +232,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm4_dst_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
@@ -299,7 +298,7 @@ static void __net_exit xfrm4_net_exit(struct net *net)
{
struct ctl_table *table;
- if (net->ipv4.xfrm4_hdr == NULL)
+ if (!net->ipv4.xfrm4_hdr)
return;
table = net->ipv4.xfrm4_hdr->ctl_table_arg;