diff options
Diffstat (limited to 'net/ipv4')
96 files changed, 6014 insertions, 4384 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 70491d9035eb..7c3a7d191249 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -166,7 +166,7 @@ config IP_PNP_DHCP If unsure, say Y. Note that if you want to use DHCP, a DHCP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_BOOTP bool "IP: BOOTP support" @@ -181,7 +181,7 @@ config IP_PNP_BOOTP does BOOTP itself, providing all necessary information on the kernel command line, you can say N here. If unsure, say Y. Note that if you want to use BOOTP, a BOOTP server must be operating on your network. - Read <file:Documentation/filesystems/nfsroot.txt> for details. + Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_RARP bool "IP: RARP support" @@ -194,7 +194,7 @@ config IP_PNP_RARP older protocol which is being obsoleted by BOOTP and DHCP), say Y here. Note that if you want to use RARP, a RARP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. # not yet ready.. # bool ' IP: ARP support' CONFIG_IP_PNP_ARP @@ -250,6 +250,20 @@ config IP_MROUTE <file:Documentation/networking/multicast.txt>. If you haven't heard about it, you don't need it. +config IP_MROUTE_MULTIPLE_TABLES + bool "IP: multicast policy routing" + depends on IP_MROUTE && IP_ADVANCED_ROUTER + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + config IP_PIMSM_V1 bool "IP: PIM-SM version 1 support" depends on IP_MROUTE @@ -289,7 +303,7 @@ config ARPD If unsure, say N. config SYN_COOKIES - bool "IP: TCP syncookie support (disabled per default)" + bool "IP: TCP syncookie support" ---help--- Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote @@ -314,13 +328,13 @@ config SYN_COOKIES server is really overloaded. If this happens frequently better turn them off. - If you say Y here, note that SYN cookies aren't enabled by default; - you can enable them by saying Y to "/proc file system support" and + If you say Y here, you can disable SYN cookies at run time by + saying Y to "/proc file system support" and "Sysctl support" below and executing the command - echo 1 >/proc/sys/net/ipv4/tcp_syncookies + echo 0 > /proc/sys/net/ipv4/tcp_syncookies - at boot time after the /proc file system has been mounted. + after the /proc file system has been mounted. If unsure, say N. @@ -587,9 +601,15 @@ choice config DEFAULT_HTCP bool "Htcp" if TCP_CONG_HTCP=y + config DEFAULT_HYBLA + bool "Hybla" if TCP_CONG_HYBLA=y + config DEFAULT_VEGAS bool "Vegas" if TCP_CONG_VEGAS=y + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y @@ -610,8 +630,10 @@ config DEFAULT_TCP_CONG default "bic" if DEFAULT_BIC default "cubic" if DEFAULT_CUBIC default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA default "vegas" if DEFAULT_VEGAS default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "cubic" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 566ea6c4321d..551ce564b035 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -86,6 +86,7 @@ #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -124,7 +125,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); struct ipv4_config ipv4_config; - EXPORT_SYMBOL(ipv4_config); /* New destruction routine */ @@ -139,12 +139,12 @@ void inet_sock_destruct(struct sock *sk) sk_mem_reclaim(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { - printk("Attempt to release TCP socket in state %d %p\n", + pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { - printk("Attempt to release alive inet socket %p\n", sk); + pr_err("Attempt to release alive inet socket %p\n", sk); return; } @@ -154,9 +154,10 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(sk->sk_forward_alloc); kfree(inet->opt); - dst_release(sk->sk_dst_cache); + dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); sk_refcnt_debug_dec(sk); } +EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET @@ -174,12 +175,12 @@ static int inet_autobind(struct sock *sk) /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); - if (!inet->num) { + if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } - inet->sport = htons(inet->num); + inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; @@ -219,6 +220,7 @@ out: release_sock(sk); return err; } +EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); @@ -243,7 +245,7 @@ EXPORT_SYMBOL(build_ehash_secret); static inline int inet_netns_ok(struct net *net, int protocol) { int hash; - struct net_protocol *ipprot; + const struct net_protocol *ipprot; if (net_eq(net, &init_net)) return 1; @@ -261,7 +263,8 @@ static inline int inet_netns_ok(struct net *net, int protocol) * Create an inet socket. */ -static int inet_create(struct net *net, struct socket *sock, int protocol) +static int inet_create(struct net *net, struct socket *sock, int protocol, + int kern) { struct sock *sk; struct inet_protosw *answer; @@ -324,7 +327,7 @@ lookup_protocol: } err = -EPERM; - if (answer->capability > 0 && !capable(answer->capability)) + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) goto out_rcu_unlock; err = -EAFNOSUPPORT; @@ -353,7 +356,7 @@ lookup_protocol: inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; if (SOCK_RAW == sock->type) { - inet->num = protocol; + inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } @@ -363,7 +366,7 @@ lookup_protocol: else inet->pmtudisc = IP_PMTUDISC_WANT; - inet->id = 0; + inet->inet_id = 0; sock_init_data(sock, sk); @@ -380,13 +383,13 @@ lookup_protocol: sk_refcnt_debug_inc(sk); - if (inet->num) { + if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ - inet->sport = htons(inet->num); + inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ sk->sk_prot->hash(sk); } @@ -416,6 +419,8 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + sock_rps_reset_flow(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -435,9 +440,11 @@ int inet_release(struct socket *sock) } return 0; } +EXPORT_SYMBOL(inet_release); /* It is off by default, see below. */ int sysctl_ip_nonlocal_bind __read_mostly; +EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -491,27 +498,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check these errors (active socket, double bind). */ err = -EINVAL; - if (sk->sk_state != TCP_CLOSE || inet->num) + if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) - inet->saddr = 0; /* Use device */ + inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (sk->sk_prot->get_port(sk, snum)) { - inet->saddr = inet->rcv_saddr = 0; + inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; } - if (inet->rcv_saddr) + if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; - inet->sport = htons(inet->num); - inet->daddr = 0; - inet->dport = 0; + inet->inet_sport = htons(inet->inet_num); + inet->inet_daddr = 0; + inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: @@ -519,25 +526,29 @@ out_release_sock: out: return err; } +EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } +EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -550,9 +561,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) lock_sock(sk); if (signal_pending(current) || !timeo) break; - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return timeo; } @@ -567,6 +578,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { @@ -641,6 +655,7 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(inet_stream_connect); /* * Accept a pending connection. The TCP layer now gives BSD semantics. @@ -668,6 +683,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) do_err: return err; } +EXPORT_SYMBOL(inet_accept); /* @@ -678,47 +694,53 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); sin->sin_family = AF_INET; if (peer) { - if (!inet->dport || + if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) return -ENOTCONN; - sin->sin_port = inet->dport; - sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->inet_dport; + sin->sin_addr.s_addr = inet->inet_daddr; } else { - __be32 addr = inet->rcv_saddr; + __be32 addr = inet->inet_rcv_saddr; if (!addr) - addr = inet->saddr; - sin->sin_port = inet->sport; + addr = inet->inet_saddr; + sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *uaddr_len = sizeof(*sin); return 0; } +EXPORT_SYMBOL(inet_getname); int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; + sock_rps_record_flow(sk); + /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->sendmsg(iocb, sk, msg, size); } +EXPORT_SYMBOL(inet_sendmsg); - -static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) { struct sock *sk = sock->sk; + sock_rps_record_flow(sk); + /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; if (sk->sk_prot->sendpage) @@ -726,6 +748,22 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, return sock_no_sendpage(sock, page, offset, size, flags); } +int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + sock_rps_record_flow(sk); + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} +EXPORT_SYMBOL(inet_recvmsg); int inet_shutdown(struct socket *sock, int how) { @@ -780,6 +818,7 @@ int inet_shutdown(struct socket *sock, int how) release_sock(sk); return err; } +EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are @@ -798,44 +837,45 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct net *net = sock_net(sk); switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; - case SIOCADDRT: - case SIOCDELRT: - case SIOCRTMSG: - err = ip_rt_ioctl(net, cmd, (void __user *)arg); - break; - case SIOCDARP: - case SIOCGARP: - case SIOCSARP: - err = arp_ioctl(net, cmd, (void __user *)arg); - break; - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - case SIOCSIFPFLAGS: - case SIOCGIFPFLAGS: - case SIOCSIFFLAGS: - err = devinet_ioctl(net, cmd, (void __user *)arg); - break; - default: - if (sk->sk_prot->ioctl) - err = sk->sk_prot->ioctl(sk, cmd, arg); - else - err = -ENOIOCTLCMD; - break; + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *)arg); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + err = ip_rt_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + err = arp_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + err = devinet_ioctl(net, cmd, (void __user *)arg); + break; + default: + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; + break; } return err; } +EXPORT_SYMBOL(inet_ioctl); const struct proto_ops inet_stream_ops = { .family = PF_INET, @@ -853,7 +893,7 @@ const struct proto_ops inet_stream_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = tcp_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, .splice_read = tcp_splice_read, @@ -862,6 +902,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, #endif }; +EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, @@ -879,7 +920,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT @@ -887,6 +928,7 @@ const struct proto_ops inet_dgram_ops = { .compat_getsockopt = compat_sock_common_getsockopt, #endif }; +EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without @@ -908,7 +950,7 @@ static const struct proto_ops inet_sockraw_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT @@ -917,7 +959,7 @@ static const struct proto_ops inet_sockraw_ops = { #endif }; -static struct net_proto_family inet_family_ops = { +static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, @@ -933,7 +975,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .capability = -1, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, @@ -944,7 +985,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -955,7 +995,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .capability = CAP_NET_RAW, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } @@ -1016,6 +1055,7 @@ out_illegal: p->type); goto out; } +EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { @@ -1031,6 +1071,7 @@ void inet_unregister_protosw(struct inet_protosw *p) synchronize_net(); } } +EXPORT_SYMBOL(inet_unregister_protosw); /* * Shall we try to damage output packets if routing dev changes? @@ -1043,9 +1084,9 @@ static int inet_sk_reselect_saddr(struct sock *sk) struct inet_sock *inet = inet_sk(sk); int err; struct rtable *rt; - __be32 old_saddr = inet->saddr; + __be32 old_saddr = inet->inet_saddr; __be32 new_saddr; - __be32 daddr = inet->daddr; + __be32 daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; @@ -1055,7 +1096,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, - inet->sport, inet->dport, sk, 0); + inet->inet_sport, inet->inet_dport, sk, 0); if (err) return err; @@ -1071,7 +1112,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) __func__, &old_saddr, &new_saddr); } - inet->saddr = inet->rcv_saddr = new_saddr; + inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; /* * XXX The only one ugly spot where we need to @@ -1097,16 +1138,17 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ - daddr = inet->daddr; + daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; { struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = inet->saddr, + .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk), }, }, @@ -1114,8 +1156,8 @@ int inet_sk_rebuild_header(struct sock *sk) .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { - .sport = inet->sport, - .dport = inet->dport, + .sport = inet->inet_sport, + .dport = inet->inet_dport, }, }, }; @@ -1141,13 +1183,12 @@ int inet_sk_rebuild_header(struct sock *sk) return err; } - EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { struct iphdr *iph; - struct net_protocol *ops; + const struct net_protocol *ops; int proto; int ihl; int err = -EINVAL; @@ -1183,10 +1224,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; - struct net_protocol *ops; + const struct net_protocol *ops; int proto; int ihl; int id; + unsigned int offset = 0; if (!(features & NETIF_F_V4_CSUM)) features &= ~NETIF_F_SG; @@ -1229,7 +1271,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) skb = segs; do { iph = ip_hdr(skb); - iph->id = htons(id++); + if (proto == IPPROTO_UDP) { + iph->id = htons(id); + iph->frag_off = htons(offset >> 3); + if (skb->next != NULL) + iph->frag_off |= htons(IP_MF); + offset += (skb->len - skb->mac_len - iph->ihl * 4); + } else + iph->id = htons(id++); iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); @@ -1242,7 +1291,7 @@ out: static struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct net_protocol *ops; + const struct net_protocol *ops; struct sk_buff **pp = NULL; struct sk_buff *p; struct iphdr *iph; @@ -1274,8 +1323,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - id = ntohl(*(u32 *)&iph->id); - flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + id = ntohl(*(__be32 *)&iph->id); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); id >>= 16; for (p = *head; p; p = p->next) { @@ -1288,8 +1337,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if ((iph->protocol ^ iph2->protocol) | (iph->tos ^ iph2->tos) | - (iph->saddr ^ iph2->saddr) | - (iph->daddr ^ iph2->daddr)) { + ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | + ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -1319,7 +1368,7 @@ out: static int inet_gro_complete(struct sk_buff *skb) { - struct net_protocol *ops; + const struct net_protocol *ops; struct iphdr *iph = ip_hdr(skb); int proto = iph->protocol & (MAX_INET_PROTOS - 1); int err = -ENOSYS; @@ -1361,10 +1410,9 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } return rc; } - EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib[], int offt) { unsigned long res = 0; int i; @@ -1377,13 +1425,13 @@ unsigned long snmp_fold_field(void *mib[], int offt) } EXPORT_SYMBOL_GPL(snmp_fold_field); -int snmp_mib_init(void *ptr[2], size_t mibsize) +int snmp_mib_init(void __percpu *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long)); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long)); if (!ptr[1]) goto err1; return 0; @@ -1395,7 +1443,7 @@ err0: } EXPORT_SYMBOL_GPL(snmp_mib_init); -void snmp_mib_free(void *ptr[2]) +void snmp_mib_free(void __percpu *ptr[2]) { BUG_ON(ptr == NULL); free_percpu(ptr[0]); @@ -1405,13 +1453,13 @@ void snmp_mib_free(void *ptr[2]) EXPORT_SYMBOL_GPL(snmp_mib_free); #ifdef CONFIG_IP_MULTICAST -static struct net_protocol igmp_protocol = { +static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, .netns_ok = 1, }; #endif -static struct net_protocol tcp_protocol = { +static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .gso_send_check = tcp_v4_gso_send_check, @@ -1422,14 +1470,16 @@ static struct net_protocol tcp_protocol = { .netns_ok = 1, }; -static struct net_protocol udp_protocol = { +static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, + .gso_send_check = udp4_ufo_send_check, + .gso_segment = udp4_ufo_fragment, .no_policy = 1, .netns_ok = 1, }; -static struct net_protocol icmp_protocol = { +static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .no_policy = 1, .netns_ok = 1, @@ -1437,25 +1487,25 @@ static struct net_protocol icmp_protocol = { static __net_init int ipv4_mib_init_net(struct net *net) { - if (snmp_mib_init((void **)net->mib.tcp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; - if (snmp_mib_init((void **)net->mib.ip_statistics, + if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)net->mib.net_statistics, + if (snmp_mib_init((void __percpu **)net->mib.net_statistics, sizeof(struct linux_mib)) < 0) goto err_net_mib; - if (snmp_mib_init((void **)net->mib.udp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, sizeof(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)net->mib.udplite_statistics, + if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, sizeof(struct udp_mib)) < 0) goto err_udplite_mib; - if (snmp_mib_init((void **)net->mib.icmp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void **)net->mib.icmpmsg_statistics, + if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; @@ -1463,30 +1513,30 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void **)net->mib.icmp_statistics); + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)net->mib.udplite_statistics); + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void **)net->mib.udp_statistics); + snmp_mib_free((void __percpu **)net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void **)net->mib.net_statistics); + snmp_mib_free((void __percpu **)net->mib.net_statistics); err_net_mib: - snmp_mib_free((void **)net->mib.ip_statistics); + snmp_mib_free((void __percpu **)net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void **)net->mib.tcp_statistics); + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { - snmp_mib_free((void **)net->mib.icmpmsg_statistics); - snmp_mib_free((void **)net->mib.icmp_statistics); - snmp_mib_free((void **)net->mib.udplite_statistics); - snmp_mib_free((void **)net->mib.udp_statistics); - snmp_mib_free((void **)net->mib.net_statistics); - snmp_mib_free((void **)net->mib.ip_statistics); - snmp_mib_free((void **)net->mib.tcp_statistics); + snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + snmp_mib_free((void __percpu **)net->mib.udp_statistics); + snmp_mib_free((void __percpu **)net->mib.net_statistics); + snmp_mib_free((void __percpu **)net->mib.ip_statistics); + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1523,9 +1573,13 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!sysctl_local_reserved_ports) + goto out; + rc = proto_register(&tcp_prot, 1); if (rc) - goto out; + goto out_free_reserved_ports; rc = proto_register(&udp_prot, 1); if (rc) @@ -1624,6 +1678,8 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); +out_free_reserved_ports: + kfree(sysctl_local_reserved_ports); goto out; } @@ -1666,19 +1722,3 @@ static int __init ipv4_proc_init(void) MODULE_ALIAS_NETPROTO(PF_INET); -EXPORT_SYMBOL(inet_accept); -EXPORT_SYMBOL(inet_bind); -EXPORT_SYMBOL(inet_dgram_connect); -EXPORT_SYMBOL(inet_dgram_ops); -EXPORT_SYMBOL(inet_getname); -EXPORT_SYMBOL(inet_ioctl); -EXPORT_SYMBOL(inet_listen); -EXPORT_SYMBOL(inet_register_protosw); -EXPORT_SYMBOL(inet_release); -EXPORT_SYMBOL(inet_sendmsg); -EXPORT_SYMBOL(inet_shutdown); -EXPORT_SYMBOL(inet_sock_destruct); -EXPORT_SYMBOL(inet_stream_connect); -EXPORT_SYMBOL(inet_stream_ops); -EXPORT_SYMBOL(inet_unregister_protosw); -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e878e494296e..880a5ec6dce0 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -1,14 +1,73 @@ +#include <crypto/hash.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> -#include <linux/spinlock.h> +#include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked @@ -54,20 +113,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) return 0; } +static void ah_output_done(struct crypto_async_request *base, int err) +{ + u8 *icv; + struct iphdr *iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct iphdr *top_iph = ip_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + + iph = AH_SKB_CB(skb)->tmp; + icv = ah_tmp_icv(ahp->ahash, iph, ihl); + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + + err = ah->nexthdr; + + kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; + int nfrags; + int ihl; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; skb_push(skb, -skb_network_offset(skb)); + ah = ip_auth_hdr(skb); + ihl = ip_hdrlen(skb); + + err = -ENOMEM; + iph = ah_alloc_tmp(ahash, nfrags, ihl); + if (!iph) + goto out; + + icv = ah_tmp_icv(ahash, iph, ihl); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memset(ah->auth_data, 0, ahp->icv_trunc_len); + top_iph = ip_hdr(skb); - iph = &tmp_iph.iph; iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; @@ -78,10 +189,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) - goto error; + goto out_free; } - ah = ip_auth_hdr(skb); ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; @@ -91,20 +201,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ahp = x->data; ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); - spin_lock_bh(&x->lock); - err = ah_mac_digest(ahp, skb, ah->auth_data); - memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); - spin_unlock_bh(&x->lock); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); - if (err) - goto error; + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; @@ -114,28 +235,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - err = 0; - -error: +out_free: + kfree(iph); +out: return err; } +static void ah_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + struct iphdr *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, ihl); + __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); + + err = ah->nexthdr; +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; - int err = -EINVAL; - struct iphdr *iph; + int nfrags; + u8 *auth_data; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; - char work_buf[60]; + int err = -ENOMEM; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; + ahash = ahp->ahash; + nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; @@ -156,9 +316,24 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); + ihl = ip_hdrlen(skb); + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); - ihl = skb->data - skb_network_header(skb); - memcpy(work_buf, iph, ihl); + memcpy(work_iph, iph, ihl); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; @@ -166,35 +341,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; - if (ip_clear_mutable_options(iph, &dummy)) - goto out; + err = ip_clear_mutable_options(iph, &dummy); + if (err) + goto out_free; } - spin_lock(&x->lock); - { - u8 auth_data[MAX_AH_AUTH_LEN]; + skb_push(skb, ihl); - memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, ihl); - err = ah_mac_digest(ahp, skb, ah->auth_data); - if (err) - goto unlock; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) - err = -EBADMSG; + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; } -unlock: - spin_unlock(&x->lock); + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; if (err) - goto out; + goto out_free; skb->network_header += ah_hlen; - memcpy(skb_network_header(skb), work_buf, ihl); - skb->transport_header = skb->network_header; + memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); - return nexthdr; + err = nexthdr; +out_free: + kfree (work_iph); out: return err; } @@ -210,7 +394,7 @@ static void ah4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return; printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", @@ -222,7 +406,7 @@ static int ah_init_state(struct xfrm_state *x) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; - struct crypto_hash *tfm; + struct crypto_ahash *ahash; if (!x->aalg) goto error; @@ -231,44 +415,40 @@ static int ah_init_state(struct xfrm_state *x) goto error; ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); - if (ahp == NULL) + if (!ahp) return -ENOMEM; - tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) goto error; - ahp->tfm = tfm; - if (crypto_hash_setkey(tfm, x->aalg->alg_key, - (x->aalg->alg_key_len + 7) / 8)) + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto error; /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here - * after a successful crypto_alloc_hash(). + * after a successful crypto_alloc_ahash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != - crypto_hash_digestsize(tfm)) { + crypto_ahash_digestsize(ahash)) { printk(KERN_INFO "AH: %s digestsize %u != %hu\n", - x->aalg->alg_name, crypto_hash_digestsize(tfm), + x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; - ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); - if (!ahp->work_icv) - goto error; - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) @@ -279,8 +459,7 @@ static int ah_init_state(struct xfrm_state *x) error: if (ahp) { - kfree(ahp->work_icv); - crypto_free_hash(ahp->tfm); + crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; @@ -293,8 +472,7 @@ static void ah_destroy(struct xfrm_state *x) if (!ahp) return; - kfree(ahp->work_icv); - crypto_free_hash(ahp->tfm); + crypto_free_ahash(ahp->ahash); kfree(ahp); } @@ -311,7 +489,7 @@ static const struct xfrm_type ah_type = .output = ah_output }; -static struct net_protocol ah4_protocol = { +static const struct net_protocol ah4_protocol = { .handler = xfrm4_rcv, .err_handler = ah4_err, .no_policy = 1, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 090e9991ac2a..f094b75810db 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -70,6 +70,7 @@ * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #include <linux/module.h> @@ -97,6 +98,7 @@ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/jhash.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -130,7 +132,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); -static struct neigh_ops arp_generic_ops = { +static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -140,7 +142,7 @@ static struct neigh_ops arp_generic_ops = { .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops arp_hh_ops = { +static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -150,7 +152,7 @@ static struct neigh_ops arp_hh_ops = { .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops arp_direct_ops = { +static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = dev_queue_xmit, .connected_output = dev_queue_xmit, @@ -158,7 +160,7 @@ static struct neigh_ops arp_direct_ops = { .queue_xmit = dev_queue_xmit, }; -struct neigh_ops arp_broken_ops = { +const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -524,12 +526,15 @@ int arp_bind_neighbour(struct dst_entry *dst) /* * Check if we can use proxy ARP for this path */ - -static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; + if (rt->u.dst.dev == dev) + return 0; + if (!IN_DEV_PROXY_ARP(in_dev)) return 0; @@ -548,6 +553,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) } /* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->u.dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + +/* * Interface to link layer: send routine and receive handler. */ @@ -619,13 +661,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, #endif #endif -#ifdef CONFIG_FDDI +#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif -#ifdef CONFIG_TR +#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) case ARPHRD_IEEE802_TR: arp->ar_hrd = htons(ARPHRD_IEEE802); arp->ar_pro = htons(ETH_P_IP); @@ -812,7 +854,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input(skb, tip, sip, 0, dev) == 0) { + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; @@ -833,8 +875,11 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) + { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -863,7 +908,8 @@ static int arp_process(struct sk_buff *skb) devices (strip is candidate) */ if (n == NULL && - arp->ar_op == htons(ARPOP_REPLY) && + (arp->ar_op == htons(ARPOP_REPLY) || + (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && inet_addr_type(net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -1005,7 +1051,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, return -EINVAL; } switch (dev->type) { -#ifdef CONFIG_FDDI +#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP @@ -1239,8 +1285,7 @@ void __init arp_init(void) dev_add_pack(&arp_packet_type); arp_proc_init(); #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); + neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 039cc1ffe977..3a92a76ae41d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -44,6 +44,7 @@ #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> @@ -289,8 +290,6 @@ void cipso_v4_cache_invalidate(void) cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } - - return; } /** @@ -2017,7 +2016,7 @@ req_setattr_failure: * values on failure. * */ -int cipso_v4_delopt(struct ip_options **opt_ptr) +static int cipso_v4_delopt(struct ip_options **opt_ptr) { int hdr_delta = 0; struct ip_options *opt = *opt_ptr; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 5e6c5a0f3fde..fb2465811b48 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); oif = sk->sk_bound_dev_if; - saddr = inet->saddr; + saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif) oif = inet->mc_index; @@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->sport, usin->sin_port, sk, 1); + inet->inet_sport, usin->sin_port, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); @@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ip_rt_put(rt); return -EACCES; } - if (!inet->saddr) - inet->saddr = rt->rt_src; /* Update source address */ - if (!inet->rcv_saddr) - inet->rcv_saddr = rt->rt_src; - inet->daddr = rt->rt_dst; - inet->dport = usin->sin_port; + if (!inet->inet_saddr) + inet->inet_saddr = rt->rt_src; /* Update source address */ + if (!inet->inet_rcv_saddr) + inet->inet_rcv_saddr = rt->rt_src; + inet->inet_daddr = rt->rt_dst; + inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; - inet->id = jiffies; + inet->inet_id = jiffies; sk_dst_set(sk, &rt->u.dst); return(0); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3863c3a4223f..382bc768ed56 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -50,6 +50,7 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -64,20 +65,20 @@ static struct ipv4_devconf ipv4_devconf = { .data = { - [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { - [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, - [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, }, }; @@ -140,11 +141,11 @@ void in_dev_finish_destroy(struct in_device *idev) #endif dev_put(dev); if (!idev->dead) - printk("Freeing alive in_device %p\n", idev); - else { + pr_err("Freeing alive in_device %p\n", idev); + else kfree(idev); - } } +EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { @@ -159,7 +160,8 @@ static struct in_device *inetdev_init(struct net_device *dev) sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; - if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) + in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); + if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); @@ -405,13 +407,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = in_dev_get(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return in_dev; } +EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ @@ -557,7 +561,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg * Determine a default network mask, based on the IP address. */ -static __inline__ int inet_abc_len(__be32 addr) +static inline int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ @@ -646,13 +650,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev) goto done; if (colon) *colon = ':'; - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ @@ -720,7 +726,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; - if ((ifa = inet_alloc_ifa()) == NULL) + ifa = inet_alloc_ifa(); + if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); @@ -822,10 +829,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) struct ifreq ifr; int done = 0; - if (!in_dev || (ifa = in_dev->ifa_list) == NULL) + if (!in_dev) goto out; - for (; ifa; ifa = ifa->ifa_next) { + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (!buf) { done += sizeof(ifr); continue; @@ -875,36 +882,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!addr) addr = ifa->ifa_local; } endfor_ifa(in_dev); -no_in_dev: - rcu_read_unlock(); if (addr) - goto out; + goto out_unlock; +no_in_dev: /* Not loopback addresses on loopback should be preferred in this case. It is importnat that lo is the first interface in dev_base list. */ - read_lock(&dev_base_lock); - rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) continue; for_primary_ifa(in_dev) { if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { addr = ifa->ifa_local; - goto out_unlock_both; + goto out_unlock; } } endfor_ifa(in_dev); } -out_unlock_both: - read_unlock(&dev_base_lock); +out_unlock: rcu_read_unlock(); -out: return addr; } +EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) @@ -940,7 +944,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, } } endfor_ifa(in_dev); - return same? addr : 0; + return same ? addr : 0; } /* @@ -961,17 +965,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev, return confirm_addr_indev(in_dev, dst, local, scope); net = dev_net(in_dev->dev); - read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev))) { + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); - read_unlock(&dev_base_lock); return addr; } @@ -984,14 +987,16 @@ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } +EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } +EXPORT_SYMBOL(unregister_inetaddr_notifier); -/* Rename ifa_labels for a device name change. Make some effort to preserve existing - * alias numbering and to create unique labels if possible. +/* Rename ifa_labels for a device name change. Make some effort to preserve + * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { @@ -1010,11 +1015,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) sprintf(old, ":%d", named); dot = old; } - if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); - } else { + else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); - } skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } @@ -1061,8 +1065,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa; - if ((ifa = inet_alloc_ifa()) != NULL) { + struct in_ifaddr *ifa = inet_alloc_ifa(); + + if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1077,16 +1082,26 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, ip_mc_up(in_dev); /* fall through */ case NETDEV_CHANGEADDR: - if (IN_DEV_ARP_NOTIFY(in_dev)) - arp_send(ARPOP_REQUEST, ETH_P_ARP, - in_dev->ifa_list->ifa_address, - dev, - in_dev->ifa_list->ifa_address, - NULL, dev->dev_addr, NULL); + /* Send gratuitous ARP to notify of link change */ + if (IN_DEV_ARP_NOTIFY(in_dev)) { + struct in_ifaddr *ifa = in_dev->ifa_list; + + if (ifa) + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_address, dev, + ifa->ifa_address, NULL, + dev->dev_addr, NULL); + } break; case NETDEV_DOWN: ip_mc_down(in_dev); break; + case NETDEV_PRE_TYPE_CHANGE: + ip_mc_unmap(in_dev); + break; + case NETDEV_POST_TYPE_CHANGE: + ip_mc_remap(in_dev); + break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; @@ -1160,38 +1175,54 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx, ip_idx; + int h, s_h; + int idx, s_idx; + int ip_idx, s_ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; - int s_ip_idx, s_idx = cb->args[0]; + struct hlist_head *head; + struct hlist_node *node; - s_ip_idx = ip_idx = cb->args[1]; - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - s_ip_idx = 0; - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) - goto cont; - - for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; - ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) - goto done; - } + RTM_NEWADDR, NLM_F_MULTI) <= 0) { + rcu_read_unlock(); + goto done; + } + } cont: - idx++; + idx++; + } + rcu_read_unlock(); } done: - cb->args[0] = idx; - cb->args[1] = ip_idx; + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; return skb->len; } @@ -1229,18 +1260,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { struct in_device *in_dev; - rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; - rcu_read_unlock(); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); } +/* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; @@ -1249,7 +1280,6 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) @@ -1260,14 +1290,13 @@ static void inet_forward_change(struct net *net) IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } - read_unlock(&dev_base_lock); } static int devinet_conf_proc(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { struct ipv4_devconf *cnf = ctl->extra1; @@ -1283,72 +1312,25 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_conf_sysctl(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ipv4_devconf *cnf; - struct net *net; - int *valp = table->data; - int new; - int i; - - if (!newval || !newlen) - return 0; - - if (newlen != sizeof(int)) - return -EINVAL; - - if (get_user(new, (int __user *)newval)) - return -EFAULT; - - if (new == *valp) - return 0; - - if (oldval && oldlenp) { - size_t len; - - if (get_user(len, oldlenp)) - return -EFAULT; - - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - *valp = new; - - cnf = table->extra1; - net = table->extra2; - i = (int *)table->data - cnf->data; - - set_bit(i, cnf->state); - - if (cnf == net->ipv4.devconf_dflt) - devinet_copy_dflt_conf(net, i); - - return 1; -} - static int devinet_sysctl_forward(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + loff_t pos = *ppos; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { struct net *net = ctl->extra2; if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - if (!rtnl_trylock()) + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *valp = val; + *ppos = pos; return restart_syscall(); + } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else if (*valp) { @@ -1366,12 +1348,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, } int ipv4_doint_and_flush(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) @@ -1380,57 +1362,37 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, return ret; } -int ipv4_doint_and_flush_strategy(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen); - struct net *net = table->extra2; - - if (ret == 1) - rt_cache_flush(net, 0); - - return ret; -} - - -#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \ +#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ - .ctl_name = NET_IPV4_CONF_ ## attr, \ .procname = name, \ .data = ipv4_devconf.data + \ - NET_IPV4_CONF_ ## attr - 1, \ + IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ - .strategy = sysctl, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \ - devinet_conf_sysctl) + DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \ - devinet_conf_sysctl) + DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) -#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl) +#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ - DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \ - ipv4_doint_and_flush_strategy) + DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; char *dev_name; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", - devinet_sysctl_forward, - devinet_conf_sysctl), + devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), @@ -1440,6 +1402,8 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), @@ -1450,6 +1414,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -1461,7 +1426,7 @@ static struct devinet_sysctl_table { }; static int __devinet_sysctl_register(struct net *net, char *dev_name, - int ctl_name, struct ipv4_devconf *p) + struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; @@ -1469,9 +1434,9 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, #define DEVINET_CTL_PATH_DEV 3 struct ctl_path devinet_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "conf", }, { /* to be set */ }, { }, }; @@ -1496,7 +1461,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, goto free; devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; - devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name; t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, t->devinet_vars); @@ -1529,10 +1493,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) static void devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); + neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, - idev->dev->ifindex, &idev->cnf); + &idev->cnf); } static void devinet_sysctl_unregister(struct in_device *idev) @@ -1543,14 +1506,12 @@ static void devinet_sysctl_unregister(struct in_device *idev) static struct ctl_table ctl_forward_entry[] = { { - .ctl_name = NET_IPV4_FORWARD, .procname = "ip_forward", .data = &ipv4_devconf.data[ - NET_IPV4_CONF_FORWARDING - 1], + IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, - .strategy = devinet_conf_sysctl, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, @@ -1558,8 +1519,8 @@ static struct ctl_table ctl_forward_entry[] = { }; static __net_initdata struct ctl_path net_ipv4_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; #endif @@ -1577,7 +1538,7 @@ static __net_init int devinet_init_net(struct net *net) all = &ipv4_devconf; dflt = &ipv4_devconf_dflt; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); if (all == NULL) goto err_alloc_all; @@ -1591,20 +1552,18 @@ static __net_init int devinet_init_net(struct net *net) if (tbl == NULL) goto err_alloc_ctl; - tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif } #ifdef CONFIG_SYSCTL - err = __devinet_sysctl_register(net, "all", - NET_PROTO_CONF_ALL, all); + err = __devinet_sysctl_register(net, "all", all); if (err < 0) goto err_reg_all; - err = __devinet_sysctl_register(net, "default", - NET_PROTO_CONF_DEFAULT, dflt); + err = __devinet_sysctl_register(net, "default", dflt); if (err < 0) goto err_reg_dflt; @@ -1670,8 +1629,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); } -EXPORT_SYMBOL(in_dev_finish_destroy); -EXPORT_SYMBOL(inet_select_addr); -EXPORT_SYMBOL(inetdev_by_index); -EXPORT_SYMBOL(register_inetaddr_notifier); -EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 18bb383ea393..14ca1f1c3fb0 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -422,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", @@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x) } err = crypto_aead_setauthsize( - aead, aalg_desc->uinfo.auth.icv_truncbits / 8); + aead, x->aalg->alg_trunc_len / 8); if (err) goto free_key; } @@ -615,7 +615,7 @@ static const struct xfrm_type esp_type = .output = esp_output }; -static struct net_protocol esp4_protocol = { +static const struct net_protocol esp4_protocol = { .handler = xfrm4_rcv, .err_handler = esp4_err, .no_policy = 1, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e2f950592566..4f0ed458c883 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -34,6 +34,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/protocol.h> @@ -125,7 +126,7 @@ void fib_select_default(struct net *net, #endif tb = fib_get_table(net, table); if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - tb->tb_select_default(tb, flp, res); + fib_table_select_default(tb, flp, res); } static void fib_flush(struct net *net) @@ -139,7 +140,7 @@ static void fib_flush(struct net *net) for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry(tb, node, head, tb_hlist) - flushed += tb->tb_flush(tb); + flushed += fib_table_flush(tb); } if (flushed) @@ -162,7 +163,7 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr) #endif local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) + if (!local_table || fib_table_lookup(local_table, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; @@ -200,7 +201,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; - if (!local_table->tb_lookup(local_table, &fl, &res)) { + if (!fib_table_lookup(local_table, &fl, &res)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; fib_res_put(&res); @@ -229,25 +230,31 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, */ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, u32 *itag) + struct net_device *dev, __be32 *spec_dst, + u32 *itag, u32 mark) { struct in_device *in_dev; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = src, .saddr = dst, .tos = tos } }, + .mark = mark, .iif = oif }; + struct fib_result res; - int no_addr, rpf; + int no_addr, rpf, accept_local; int ret; struct net *net; - no_addr = rpf = 0; + no_addr = rpf = accept_local = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); + if (mark && !IN_DEV_SRC_VMARK(in_dev)) + fl.mark = 0; } rcu_read_unlock(); @@ -257,8 +264,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, net = dev_net(dev); if (fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) - goto e_inval_res; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL || !accept_local) + goto e_inval_res; + } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -473,13 +482,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = tb->tb_delete(tb, &cfg); + err = fib_table_delete(tb, &cfg); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = tb->tb_insert(tb, &cfg); + err = fib_table_insert(tb, &cfg); else err = -ENOBUFS; } @@ -594,7 +603,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar goto errout; } - err = tb->tb_delete(tb, &cfg); + err = fib_table_delete(tb, &cfg); errout: return err; } @@ -616,7 +625,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar goto errout; } - err = tb->tb_insert(tb, &cfg); + err = fib_table_insert(tb, &cfg); errout: return err; } @@ -647,7 +656,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (tb->tb_dump(tb, skb, cb) < 0) + if (fib_table_dump(tb, skb, cb) < 0) goto out; dumped = 1; next: @@ -701,9 +710,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - tb->tb_insert(tb, &cfg); + fib_table_insert(tb, &cfg); else - tb->tb_delete(tb, &cfg); + fib_table_delete(tb, &cfg); } void fib_add_ifaddr(struct in_ifaddr *ifa) @@ -832,7 +841,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) local_bh_disable(); frn->tb_id = tb->tb_id; - frn->err = tb->tb_lookup(tb, &fl, &res); + frn->err = fib_table_lookup(tb, &fl, &res); if (!frn->err) { frn->prefixlen = res.prefixlen; @@ -875,7 +884,7 @@ static void nl_fib_input(struct sk_buff *skb) netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); } -static int nl_fib_lookup_init(struct net *net) +static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, @@ -892,11 +901,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, int force, int delay) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev), delay); arp_ifdown(dev); } @@ -919,7 +928,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, 1, 0); } else { rt_cache_flush(dev_net(dev), -1); } @@ -934,7 +943,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct in_device *in_dev = __in_dev_get_rtnl(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, 2, -1); return NOTIFY_DONE; } @@ -952,12 +961,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, 0, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: rt_cache_flush(dev_net(dev), 0); break; + case NETDEV_UNREGISTER_BATCH: + rt_cache_flush_batch(); + break; } return NOTIFY_DONE; } @@ -993,7 +1005,7 @@ fail: return err; } -static void __net_exit ip_fib_net_exit(struct net *net) +static void ip_fib_net_exit(struct net *net) { unsigned int i; @@ -1009,7 +1021,7 @@ static void __net_exit ip_fib_net_exit(struct net *net) head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); - tb->tb_flush(tb); + fib_table_flush(tb); kfree(tb); } } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index ecd39454235c..4ed7e0dea1bc 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -32,6 +32,7 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> @@ -242,8 +243,8 @@ fn_new_zone(struct fn_hash *table, int z) return fz; } -static int -fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +int fib_table_lookup(struct fib_table *tb, + const struct flowi *flp, struct fib_result *res) { int err; struct fn_zone *fz; @@ -274,8 +275,8 @@ out: return err; } -static void -fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +void fib_table_select_default(struct fib_table *tb, + const struct flowi *flp, struct fib_result *res) { int order, last_idx; struct hlist_node *node; @@ -366,7 +367,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } -static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fib_node *new_f = NULL; @@ -544,8 +545,7 @@ out: return err; } - -static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *)tb->tb_data; struct fib_node *f; @@ -662,7 +662,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } -static int fn_hash_flush(struct fib_table *tb) +int fib_table_flush(struct fib_table *tb) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; @@ -743,7 +743,8 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, return skb->len; } -static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { int m, s_m; struct fn_zone *fz; @@ -787,12 +788,7 @@ struct fib_table *fib_hash_table(u32 id) tb->tb_id = id; tb->tb_default = -1; - tb->tb_lookup = fn_hash_lookup; - tb->tb_insert = fn_hash_insert; - tb->tb_delete = fn_hash_delete; - tb->tb_flush = fn_hash_flush; - tb->tb_select_default = fn_hash_select_default; - tb->tb_dump = fn_hash_dump; + memset(tb->tb_data, 0, sizeof(struct fn_hash)); return tb; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 92d9d97ec5e3..76daeb5ff564 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -94,7 +94,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) goto errout; - err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); + err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); if (err > 0) err = -EAGAIN; errout: @@ -213,7 +213,6 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, { struct fib4_rule *rule4 = (struct fib4_rule *) rule; - frh->family = AF_INET; frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; frh->tos = rule4->tos; @@ -234,23 +233,6 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib4_rule_default_pref(struct fib_rules_ops *ops) -{ - struct list_head *pos; - struct fib_rule *rule; - - if (!list_empty(&ops->rules_list)) { - pos = ops->rules_list.next; - if (pos->next != &ops->rules_list) { - rule = list_entry(pos->next, struct fib_rule, list); - if (rule->pref) - return rule->pref - 1; - } - } - - return 0; -} - static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ @@ -263,7 +245,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops) rt_cache_flush(ops->fro_net, -1); } -static struct fib_rules_ops fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), @@ -272,7 +254,7 @@ static struct fib_rules_ops fib4_rules_ops_template = { .configure = fib4_rule_configure, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib4_rule_default_pref, + .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, @@ -284,7 +266,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); @@ -301,13 +283,9 @@ int __net_init fib4_rules_init(struct net *net) int err; struct fib_rules_ops *ops; - ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); - if (ops == NULL) - return -ENOMEM; - INIT_LIST_HEAD(&ops->rules_list); - ops->fro_net = net; - - fib_rules_register(ops); + ops = fib_rules_register(&fib4_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) @@ -318,12 +296,10 @@ int __net_init fib4_rules_init(struct net *net) fail: /* also cleans all rules already added */ fib_rules_unregister(ops); - kfree(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); - kfree(net->ipv4.rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9b096d6ff3f2..20f09c5b31e8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -32,6 +32,7 @@ #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/arp.h> #include <net/ip.h> @@ -62,8 +63,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) -#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ -for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) +#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ +for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -72,7 +73,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ for (nhsel=0; nhsel < 1; nhsel++) -#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ +#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel=0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -145,9 +146,9 @@ void free_fib_info(struct fib_info *fi) return; } change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); - nh->nh_dev = NULL; + if (nexthop_nh->nh_dev) + dev_put(nexthop_nh->nh_dev); + nexthop_nh->nh_dev = NULL; } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); @@ -162,9 +163,9 @@ void fib_release_info(struct fib_info *fi) if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); change_nexthops(fi) { - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hlist_del(&nh->nh_hash); + hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) fi->fib_dead = 1; fib_info_put(fi); @@ -228,7 +229,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) head = &fib_info_hash[hash]; hlist_for_each_entry(fi, node, head, fib_hash) { - if (fi->fib_net != nfi->fib_net) + if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) continue; @@ -395,19 +396,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; - nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; - nh->nh_oif = rtnh->rtnh_ifindex; - nh->nh_weight = rtnh->rtnh_hops + 1; + nexthop_nh->nh_flags = + (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + nexthop_nh->nh_oif = rtnh->rtnh_ifindex; + nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nh->nh_gw = nla ? nla_get_be32(nla) : 0; + nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; #ifdef CONFIG_NET_CLS_ROUTE nla = nla_find(attrs, attrlen, RTA_FLOW); - nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif } @@ -527,10 +529,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_gw) { struct fib_result res; -#ifdef CONFIG_IP_ROUTE_PERVASIVE - if (nh->nh_flags&RTNH_F_PERVASIVE) - return 0; -#endif if (nh->nh_flags&RTNH_F_ONLINK) { struct net_device *dev; @@ -738,7 +736,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { - nh->nh_parent = fi; + nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mx) { @@ -808,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(cfg, fi, nh)) != 0) + if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) goto failure; } endfor_nexthops(fi) } @@ -843,11 +841,11 @@ link_it: struct hlist_head *head; unsigned int hash; - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); head = &fib_info_devhash[hash]; - hlist_add_head(&nh->nh_hash, head); + hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) spin_unlock_bh(&fib_info_lock); return fi; @@ -1047,7 +1045,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return 0; hlist_for_each_entry(fi, node, head, fib_lhash) { - if (fi->fib_net != net) + if (!net_eq(fi->fib_net, net)) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; @@ -1080,21 +1078,21 @@ int fib_sync_down_dev(struct net_device *dev, int force) prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) + if (nexthop_nh->nh_flags&RTNH_F_DEAD) dead++; - else if (nh->nh_dev == dev && - nh->nh_scope != scope) { - nh->nh_flags |= RTNH_F_DEAD; + else if (nexthop_nh->nh_dev == dev && + nexthop_nh->nh_scope != scope) { + nexthop_nh->nh_flags |= RTNH_F_DEAD; #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; + fi->fib_power -= nexthop_nh->nh_power; + nexthop_nh->nh_power = 0; spin_unlock_bh(&fib_multipath_lock); #endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nh->nh_dev == dev) { + if (force > 1 && nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } @@ -1144,18 +1142,20 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { alive++; continue; } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + if (nexthop_nh->nh_dev == NULL || + !(nexthop_nh->nh_dev->flags&IFF_UP)) continue; - if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) + if (nexthop_nh->nh_dev != dev || + !__in_dev_get_rtnl(dev)) continue; alive++; spin_lock_bh(&fib_multipath_lock); - nh->nh_power = 0; - nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_power = 0; + nexthop_nh->nh_flags &= ~RTNH_F_DEAD; spin_unlock_bh(&fib_multipath_lock); } endfor_nexthops(fi) @@ -1182,9 +1182,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } } endfor_nexthops(fi); fi->fib_power = power; @@ -1204,9 +1204,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) w = jiffies % fi->fib_power; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && + nexthop_nh->nh_power) { + if ((w -= nexthop_nh->nh_power) <= 0) { + nexthop_nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; spin_unlock_bh(&fib_multipath_lock); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 63c2fa7b68c4..79d057a939ba 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -48,7 +48,7 @@ * Patrick McHardy <kaber@trash.net> */ -#define VERSION "0.408" +#define VERSION "0.409" #include <asm/uaccess.h> #include <asm/system.h> @@ -71,6 +71,7 @@ #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -164,6 +165,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ static struct tnode *tnode_free_head; +static size_t tnode_free_size; + +/* + * synchronize_rcu after call_rcu for that many pages; it should be especially + * useful before resizing the root node with PREEMPT_NONE configs; the value was + * obtained experimentally, aiming to avoid visible slowdown. + */ +static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; @@ -200,7 +209,9 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { struct node *ret = tnode_get_child(tn, i); - return rcu_dereference(ret); + return rcu_dereference_check(ret, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); } static inline int tnode_child_length(const struct tnode *tn) @@ -317,8 +328,7 @@ static inline void check_tnode(const struct tnode *tn) static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; -static const int inflate_threshold_root = 25; - +static const int inflate_threshold_root = 30; static void __alias_free_mem(struct rcu_head *head) { @@ -393,6 +403,8 @@ static void tnode_free_safe(struct tnode *tn) BUG_ON(IS_LEAF(tn)); tn->tnode_free = tnode_free_head; tnode_free_head = tn; + tnode_free_size += sizeof(struct tnode) + + (sizeof(struct node *) << tn->bits); } static void tnode_free_flush(void) @@ -404,6 +416,11 @@ static void tnode_free_flush(void) tn->tnode_free = NULL; tnode_free(tn); } + + if (tnode_free_size >= PAGE_SIZE * sync_pages) { + tnode_free_size = 0; + synchronize_rcu(); + } } static struct leaf *leaf_new(void) @@ -499,14 +516,14 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, rcu_assign_pointer(tn->child[i], n); } +#define MAX_WORK 10 static struct node *resize(struct trie *t, struct tnode *tn) { int i; - int err = 0; struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; - int max_resize; + int max_work; if (!tn) return NULL; @@ -521,18 +538,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) } /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) - for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; - - n = tn->child[i]; - if (!n) - continue; - - /* compress one level */ - node_set_parent(n, NULL); - tnode_free_safe(tn); - return n; - } + goto one_child; /* * Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. @@ -601,14 +607,17 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!tn->parent) + if (!node_parent((struct node*) tn)) { inflate_threshold_use = inflate_threshold_root; - else + halve_threshold_use = halve_threshold_root; + } + else { inflate_threshold_use = inflate_threshold; + halve_threshold_use = halve_threshold; + } - err = 0; - max_resize = 10; - while ((tn->full_children > 0 && max_resize-- && + max_work = MAX_WORK; + while ((tn->full_children > 0 && max_work-- && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold_use * tnode_child_length(tn))) { @@ -625,35 +634,19 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } - if (max_resize < 0) { - if (!tn->parent) - pr_warning("Fix inflate_threshold_root." - " Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); - else - pr_warning("Fix inflate_threshold." - " Now=%d size=%d bits\n", - inflate_threshold, tn->bits); - } - check_tnode(tn); + /* Return if at least one inflate is run */ + if( max_work != MAX_WORK) + return (struct node *) tn; + /* * Halve as long as the number of empty children in this * node is above threshold. */ - - /* Keep root node larger */ - - if (!tn->parent) - halve_threshold_use = halve_threshold_root; - else - halve_threshold_use = halve_threshold; - - err = 0; - max_resize = 10; - while (tn->bits > 1 && max_resize-- && + max_work = MAX_WORK; + while (tn->bits > 1 && max_work-- && 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold_use * tnode_child_length(tn)) { @@ -668,19 +661,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } - if (max_resize < 0) { - if (!tn->parent) - pr_warning("Fix halve_threshold_root." - " Now=%d size=%d bits\n", - halve_threshold_root, tn->bits); - else - pr_warning("Fix halve_threshold." - " Now=%d size=%d bits\n", - halve_threshold, tn->bits); - } /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) + if (tn->empty_children == tnode_child_length(tn) - 1) { +one_child: for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; @@ -694,7 +678,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) tnode_free_safe(tn); return n; } - + } return (struct node *) tn; } @@ -980,7 +964,9 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = rcu_dereference(t->trie); + n = rcu_dereference_check(t->trie, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -1036,8 +1022,6 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) rcu_assign_pointer(t->trie, (struct node *)tn); tnode_free_flush(); - - return; } /* only used from updater-side */ @@ -1193,7 +1177,7 @@ done: /* * Caller must hold RTNL. */ -static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *new_fa; @@ -1392,8 +1376,8 @@ static int check_leaf(struct trie *t, struct leaf *l, return 1; } -static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, - struct fib_result *res) +int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int ret; @@ -1435,7 +1419,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), pos, bits); - n = tnode_get_child(pn, cindex); + n = tnode_get_child_rcu(pn, cindex); if (n == NULL) { #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1570,7 +1554,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct node *) pn); if (!parent) goto failed; @@ -1614,7 +1598,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) /* * Caller must hold RTNL. */ -static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; u32 key, mask; @@ -1783,7 +1767,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { struct node *c = (struct node *) l; - struct tnode *p = node_parent(c); + struct tnode *p = node_parent_rcu(c); if (!p) return NULL; /* trie with just one leaf */ @@ -1805,7 +1789,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index) /* * Caller must hold RTNL. */ -static int fn_trie_flush(struct fib_table *tb) +int fib_table_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; struct leaf *l, *ll = NULL; @@ -1826,9 +1810,9 @@ static int fn_trie_flush(struct fib_table *tb) return found; } -static void fn_trie_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) +void fib_table_select_default(struct fib_table *tb, + const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int order, last_idx; @@ -1971,8 +1955,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, return skb->len; } -static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) +int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { struct leaf *l; struct trie *t = (struct trie *) tb->tb_data; @@ -2039,12 +2023,6 @@ struct fib_table *fib_hash_table(u32 id) tb->tb_id = id; tb->tb_default = -1; - tb->tb_lookup = fn_trie_lookup; - tb->tb_insert = fn_trie_insert; - tb->tb_delete = fn_trie_delete; - tb->tb_flush = fn_trie_flush; - tb->tb_select_default = fn_trie_select_default; - tb->tb_dump = fn_trie_dump; t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); @@ -2391,7 +2369,7 @@ static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) } } -static const char *rtn_type_names[__RTN_MAX] = { +static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 97c410e84388..d65e9215bcd7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -74,6 +74,7 @@ #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> +#include <linux/slab.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ -114,7 +115,7 @@ struct icmp_bxm { /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ -struct icmp_err icmp_err_convert[] = { +const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, @@ -330,9 +331,10 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, - ipc, rt, MSG_DONTWAIT) < 0) + ipc, rt, MSG_DONTWAIT) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); - else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; @@ -501,15 +503,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; + rcu_read_lock(); if (rt->fl.iif && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->fl.iif); - if (dev) { + if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); - dev_put(dev); - } else + else saddr = 0; + rcu_read_unlock(); } tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | @@ -584,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) err = __ip_route_output_key(net, &rt2, &fl); else { struct flowi fl2 = {}; - struct dst_entry *odst; + unsigned long orefdst; fl2.fl4_dst = fl.fl4_src; if (ip_route_output_key(net, &rt2, &fl2)) goto relookup_failed; /* Ugh! */ - odst = skb_dst(skb_in); + orefdst = skb_in->_skb_refdst; /* save old refdst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); rt2 = skb_rtable(skb_in); - skb_dst_set(skb_in, odst); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ } if (err) @@ -655,7 +658,7 @@ static void icmp_unreach(struct sk_buff *skb) struct iphdr *iph; struct icmphdr *icmph; int hash, protocol; - struct net_protocol *ipprot; + const struct net_protocol *ipprot; u32 info = 0; struct net *net; @@ -1165,6 +1168,10 @@ static int __net_init icmp_sk_init(struct net *net) sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff))); + /* + * Speedup sock_wfree() + */ + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 01b4284ed694..5fff865a4fa7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -71,6 +71,7 @@ */ #include <linux/module.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> @@ -946,7 +947,6 @@ int igmp_rcv(struct sk_buff *skb) break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (skb_rtable(skb)->fl.iif == 0) break; @@ -960,6 +960,7 @@ int igmp_rcv(struct sk_buff *skb) in_dev_put(in_dev); return pim_rcv_v1(skb); #endif + case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: @@ -997,7 +998,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) - dev_mc_add(dev, buf, dev->addr_len, 0); + dev_mc_add(dev, buf); } /* @@ -1010,7 +1011,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) - dev_mc_delete(dev, buf, dev->addr_len, 0); + dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST @@ -1298,6 +1299,28 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) } } +/* Device changing type */ + +void ip_mc_unmap(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + for (i = in_dev->mc_list; i; i = i->next) + igmp_group_dropped(i); +} + +void ip_mc_remap(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + for (i = in_dev->mc_list; i; i = i->next) + igmp_group_added(i); +} + /* Device going down */ void ip_mc_down(struct in_device *in_dev) @@ -1777,7 +1800,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - inet->mc_list = iml; + rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1785,24 +1808,46 @@ done: return err; } +static void ip_sf_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_sf_socklist *psf; + + psf = container_of(rp, struct ip_sf_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(psf); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { + struct ip_sf_socklist *psf = iml->sflist; int err; - if (iml->sflist == NULL) { + if (psf == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, - iml->sfmode, iml->sflist->sl_count, - iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + call_rcu(&psf->rcu, ip_sf_socklist_reclaim); return err; } + +static void ip_mc_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_mc_socklist *iml; + + iml = container_of(rp, struct ip_mc_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(iml); +} + + /* * Ask a socket to leave a group. */ @@ -1832,12 +1877,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) (void) ip_mc_leave_src(sk, iml, in_dev); - *imlp = iml->next; + rcu_assign_pointer(*imlp, iml->next); if (in_dev) ip_mc_dec_group(in_dev, group); rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); return 0; } if (!in_dev) @@ -1877,8 +1924,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { - if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr - && pmc->multi.imr_ifindex == imr.imr_ifindex) + if ((pmc->multi.imr_multiaddr.s_addr == + imr.imr_multiaddr.s_addr) && + (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ @@ -1951,9 +1999,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) { for (i=0; i<psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } - pmc->sflist = psl = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { @@ -2049,11 +2100,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: @@ -2186,30 +2239,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; + int ret; + ret = 1; if (!ipv4_is_multicast(loc_addr)) - return 1; + goto out; - for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + rcu_read_lock(); + for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && pmc->multi.imr_ifindex == dif) break; } + ret = inet->mc_all; if (!pmc) - return inet->mc_all; + goto unlock; psl = pmc->sflist; + ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) - return pmc->sfmode == MCAST_EXCLUDE; + goto unlock; for (i=0; i<psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } + ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - return 0; + goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - return 0; - return 1; + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; } /* @@ -2228,7 +2291,7 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; - inet->mc_list = iml->next; + rcu_assign_pointer(inet->mc_list, iml->next); in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); @@ -2236,7 +2299,9 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); } rtnl_unlock(); } @@ -2289,9 +2354,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; - in_dev = in_dev_get(state->dev); + + in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; read_lock(&in_dev->mc_list_lock); @@ -2301,7 +2367,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) break; } read_unlock(&in_dev->mc_list_lock); - in_dev_put(in_dev); } return im; } @@ -2311,16 +2376,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = im->next; while (!im) { - if (likely(state->in_dev != NULL)) { + if (likely(state->in_dev != NULL)) read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } - state->in_dev = in_dev_get(state->dev); + state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; read_lock(&state->in_dev->mc_list_lock); @@ -2339,9 +2403,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2357,16 +2421,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); state->in_dev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) @@ -2440,9 +2503,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) state->idev = NULL; state->im = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *idev; - idev = in_dev_get(state->dev); + idev = __in_dev_get_rcu(state->dev); if (unlikely(idev == NULL)) continue; read_lock(&idev->mc_list_lock); @@ -2458,7 +2521,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) spin_unlock_bh(&im->lock); } read_unlock(&idev->mc_list_lock); - in_dev_put(idev); } return psf; } @@ -2472,16 +2534,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } - state->idev = in_dev_get(state->dev); + state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; read_lock(&state->idev->mc_list_lock); @@ -2506,8 +2567,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2523,6 +2585,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im != NULL)) { @@ -2531,11 +2594,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) } if (likely(state->idev != NULL)) { read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); state->idev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) @@ -2583,7 +2645,7 @@ static const struct file_operations igmp_mcf_seq_fops = { .release = seq_release_net, }; -static int igmp_net_init(struct net *net) +static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; @@ -2601,7 +2663,7 @@ out_igmp: return -ENOMEM; } -static void igmp_net_exit(struct net *net) +static void __net_exit igmp_net_exit(struct net *net) { proc_net_remove(net, "mcfilter"); proc_net_remove(net, "igmp"); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 22cd19ee44e5..70eb3507c406 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = { .range = { 32768, 61000 }, }; +unsigned long *sysctl_local_reserved_ports; +EXPORT_SYMBOL(sysctl_local_reserved_ports); + void inet_get_local_port_range(int *low, int *high) { unsigned seq; @@ -108,11 +111,13 @@ again: smallest_size = -1; do { + if (inet_is_reserved_local_port(rover)) + goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == rover) { + if (net_eq(ib_net(tb), net) && tb->port == rover) { if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN && @@ -130,6 +135,7 @@ again: break; next: spin_unlock(&head->lock); + next_nolock: if (++rover > high) rover = low; } while (--remaining > 0); @@ -158,7 +164,7 @@ have_snum: hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == snum) + if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } tb = NULL; @@ -234,7 +240,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. */ for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) @@ -253,7 +259,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) if (!timeo) break; } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; } @@ -358,6 +364,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = ((opt && opt->srr) ? opt->faddr : @@ -367,7 +374,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, + { .sport = inet_sk(sk)->inet_sport, .dport = ireq->rmt_port } } }; struct net *net = sock_net(sk); @@ -446,6 +453,28 @@ extern int sysctl_tcp_synack_retries; EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +/* Decide when to expire the request and when to resend SYN-ACK */ +static inline void syn_ack_recalc(struct request_sock *req, const int thresh, + const int max_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) +{ + if (!rskq_defer_accept) { + *expire = req->retrans >= thresh; + *resend = 1; + return; + } + *expire = req->retrans >= thresh && + (!inet_rsk(req)->acked || req->retrans >= max_retries); + /* + * Do not resend while waiting for data after ACK, + * start to resend on end of deferring period to give + * last chance for data or ACK to create established socket. + */ + *resend = !inet_rsk(req)->acked || + req->retrans >= rskq_defer_accept - 1; +} + void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long interval, const unsigned long timeout, @@ -501,9 +530,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req)) { + int expire = 0, resend = 0; + + syn_ack_recalc(req, thresh, max_retries, + queue->rskq_defer_accept, + &expire, &resend); + if (req->rsk_ops->syn_ack_timeout) + req->rsk_ops->syn_ack_timeout(parent, req); + if (!expire && + (!resend || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || + inet_rsk(req)->acked)) { unsigned long timeo; if (req->retrans++ == 0) @@ -546,9 +583,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; - inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); - inet_sk(newsk)->sport = inet_rsk(req)->loc_port; + inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; @@ -579,8 +616,8 @@ void inet_csk_destroy_sock(struct sock *sk) /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash); + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -615,8 +652,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * after validation is complete. */ sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); + if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); sk->sk_prot->hash(sk); @@ -692,8 +729,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->inet_daddr; + sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); @@ -714,7 +751,7 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a706a47f4dbb..e5fa2ddce320 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/fcntl.h> #include <linux/random.h> +#include <linux/slab.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/time.h> @@ -116,10 +117,10 @@ static int inet_csk_diag_fill(struct sock *sk, r->id.idiag_cookie[0] = (u32)(unsigned long)sk; r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = inet->dport; - r->id.idiag_src[0] = inet->rcv_saddr; - r->id.idiag_dst[0] = inet->daddr; + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = inet->inet_dport; + r->id.idiag_src[0] = inet->inet_rcv_saddr; + r->id.idiag_dst[0] = inet->inet_daddr; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { @@ -368,7 +369,7 @@ static int inet_diag_bc_run(const void *bc, int len, yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: - yes = entry->dport <= op[1].no; + yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; @@ -504,11 +505,11 @@ static int inet_csk_diag_dump(struct sock *sk, } else #endif { - entry.saddr = &inet->rcv_saddr; - entry.daddr = &inet->daddr; + entry.saddr = &inet->inet_rcv_saddr; + entry.daddr = &inet->inet_daddr; } - entry.sport = inet->num; - entry.dport = ntohs(inet->dport); + entry.sport = inet->inet_num; + entry.dport = ntohs(inet->inet_dport); entry.userlocks = sk->sk_userlocks; if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) @@ -584,7 +585,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, if (tmo < 0) tmo = 0; - r->id.idiag_sport = inet->sport; + r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = ireq->rmt_port; r->id.idiag_src[0] = ireq->loc_addr; r->id.idiag_dst[0] = ireq->rmt_addr; @@ -639,7 +640,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { bc = (struct rtattr *)(r + 1); - entry.sport = inet->num; + entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -732,7 +733,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - if (r->id.idiag_sport != inet->sport && + if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_listen; @@ -774,7 +775,7 @@ skip_listen_ht: if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) goto unlock; - for (i = s_i; i < hashinfo->ehash_size; i++) { + for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); struct sock *sk; @@ -797,10 +798,10 @@ skip_listen_ht: goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) goto next_normal; - if (r->id.idiag_sport != inet->sport && + if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_normal; - if (r->id.idiag_dport != inet->dport && + if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next_normal; if (inet_csk_diag_dump(sk, skb, cb) < 0) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index eaf3e2c8646a..a2ca6aed763b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -19,6 +19,7 @@ #include <linux/random.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <linux/slab.h> #include <net/inet_frag.h> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 625cc5f64c94..d3e160a88219 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->num = snum; + inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; @@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk) __sk_del_bind_node(sk); tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; - inet_sk(sk)->num = 0; + inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; @@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net, int score = -1; struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->num == hnum && + if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { - __be32 rcv_saddr = inet->rcv_saddr; + __be32 rcv_saddr = inet->inet_rcv_saddr; score = sk->sk_family == PF_INET ? 1 : 0; if (rcv_saddr) { if (rcv_saddr != daddr) @@ -209,7 +209,7 @@ struct sock * __inet_lookup_established(struct net *net, * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & (hashinfo->ehash_size - 1); + unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; rcu_read_lock(); @@ -273,18 +273,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); - __be32 daddr = inet->rcv_saddr; - __be32 saddr = inet->daddr; + __be32 daddr = inet->inet_rcv_saddr; + __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) - const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); - unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); + unsigned int hash = inet_ehashfn(net, daddr, lport, + saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw; + int twrefcnt = 0; spin_lock(lock); @@ -312,25 +314,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, unique: /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); + inet->inet_num = lport; + inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } - return 0; not_unique: @@ -341,16 +346,18 @@ not_unique: static inline u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, + inet->inet_daddr, + inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk) +int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; spinlock_t *lock; struct inet_ehash_bucket *head; + int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); @@ -361,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk) spin_lock(lock); __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); @@ -372,7 +384,7 @@ static void __inet_hash(struct sock *sk) struct inet_listen_hashbucket *ilb; if (sk->sk_state != TCP_LISTEN) { - __inet_hash_nolisten(sk); + __inet_hash_nolisten(sk, NULL); return; } @@ -421,14 +433,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **), - void (*hash)(struct sock *sk)) + int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->num; + const unsigned short snum = inet_sk(sk)->inet_num; struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); + int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -443,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; + if (inet_is_reserved_local_port(port)) + continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock(&head->lock); @@ -452,7 +467,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, * unique enough. */ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (ib_net(tb) == net && tb->port == port) { + if (net_eq(ib_net(tb), net) && + tb->port == port) { if (tb->fastreuse >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); @@ -485,14 +501,19 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - hash(sk); + inet_sk(sk)->inet_sport = htons(port); + twrefcnt += hash(sk, tw); } + if (tw) + twrefcnt += inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); if (tw) { inet_twsk_deschedule(tw, death_row); - inet_twsk_put(tw); + while (twrefcnt) { + twrefcnt--; + inet_twsk_put(tw); + } } ret = 0; @@ -503,7 +524,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - hash(sk); + hash(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 6a667dae315e..47038cb6c138 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, if (iph->ihl != IPH_LEN_WO_OPTIONS) return -1; - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack - || tcph->rst || tcph->syn || tcph->fin) + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || + tcph->rst || tcph->syn || tcph->fin) return -1; if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) return -1; - if (tcph->doff != TCPH_LEN_WO_OPTIONS - && tcph->doff != TCPH_LEN_W_TIMESTAMP) + if (tcph->doff != TCPH_LEN_WO_OPTIONS && + tcph->doff != TCPH_LEN_W_TIMESTAMP) return -1; /* check tcp options (only timestamp allowed) */ @@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, struct tcphdr *tcph) { - if ((lro_desc->iph->saddr != iph->saddr) - || (lro_desc->iph->daddr != iph->daddr) - || (lro_desc->tcph->source != tcph->source) - || (lro_desc->tcph->dest != tcph->dest)) + if ((lro_desc->iph->saddr != iph->saddr) || + (lro_desc->iph->daddr != iph->daddr) || + (lro_desc->tcph->source != tcph->source) || + (lro_desc->tcph->dest != tcph->dest)) return -1; return 0; } @@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, u64 flags; int vlan_hdr_len = 0; - if (!lro_mgr->get_skb_header - || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) + if (!lro_mgr->get_skb_header || + lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) goto out; if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) @@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, if (!lro_desc) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; if (!lro_desc->active) { /* start new lro session */ @@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, int hdr_len = LRO_MAX_PG_HLEN; int vlan_hdr_len = 0; - if (!lro_mgr->get_frag_header - || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { + if (!lro_mgr->get_frag_header || + lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { mac_hdr = page_address(frags->page) + frags->page_offset; goto out1; } @@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, if (!skb) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; iph = (void *)(skb->data + vlan_hdr_len); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 61283f928825..c5af909cf701 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -10,44 +10,92 @@ #include <linux/kernel.h> #include <linux/kmemcheck.h> +#include <linux/slab.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> + +/** + * inet_twsk_unhash - unhash a timewait socket from established hash + * @tw: timewait socket + * + * unhash a timewait socket from established hash, if hashed. + * ehash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_unhash(struct inet_timewait_sock *tw) +{ + if (hlist_nulls_unhashed(&tw->tw_node)) + return 0; + + hlist_nulls_del_rcu(&tw->tw_node); + sk_nulls_node_init(&tw->tw_node); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + +/** + * inet_twsk_bind_unhash - unhash a timewait socket from bind hash + * @tw: timewait socket + * @hashinfo: hashinfo pointer + * + * unhash a timewait socket from bind hash, if hashed. + * bind hash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo) +{ + struct inet_bind_bucket *tb = tw->tw_tb; + + if (!tb) + return 0; + + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_hashbucket *bhead; - struct inet_bind_bucket *tb; + int refcnt; /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); spin_lock(lock); - if (hlist_nulls_unhashed(&tw->tw_node)) { - spin_unlock(lock); - return; - } - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); + refcnt = inet_twsk_unhash(tw); spin_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + refcnt += inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); + #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif - inet_twsk_put(tw); + while (refcnt) { + inet_twsk_put(tw); + refcnt--; + } } static noinline void inet_twsk_free(struct inet_timewait_sock *tw) @@ -86,7 +134,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; @@ -101,16 +149,24 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, * Should be done before removing sk from established chain * because readers are lockless and search established first. */ - atomic_inc(&tw->tw_refcnt); inet_twsk_add_node_rcu(tw, &ehead->twchain); /* Step 3: Remove SK from established hash. */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + /* + * Notes : + * - We initially set tw_refcnt to 0 in inet_twsk_alloc() + * - We add one reference for the bhash link + * - We add one reference for the ehash link + * - We want this refcnt update done before allowing other + * threads to find this tw in ehash chain. + */ + atomic_add(1 + 1 + 1, &tw->tw_refcnt); + spin_unlock(lock); } - EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) @@ -124,14 +180,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat kmemcheck_annotate_bitfield(tw, flags); /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_daddr = inet->inet_daddr; + tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; + tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; + tw->tw_sport = inet->inet_sport; + tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_hash = sk->sk_hash; @@ -139,14 +195,18 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; twsk_net_set(tw, hold_net(sock_net(sk))); - atomic_set(&tw->tw_refcnt, 1); + /* + * Because we use RCU lookups, we should not set tw_refcnt + * to a non null value before everything is setup for this + * timewait socket. + */ + atomic_set(&tw->tw_refcnt, 0); inet_twsk_dead_node_init(tw); __module_get(tw->tw_prot->owner); } return tw; } - EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* Returns non-zero if quota exceeded. */ @@ -218,14 +278,13 @@ void inet_twdr_hangman(unsigned long data) /* We purged the entire slot, anything left? */ if (twdr->tw_count) need_timer = 1; + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); } - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twdr_hangman); void inet_twdr_twkill_work(struct work_struct *work) @@ -256,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work) spin_unlock_bh(&twdr->death_lock); } } - EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); /* These are always called from BH context. See callers in @@ -276,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw, spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); } - EXPORT_SYMBOL(inet_twsk_deschedule); void inet_twsk_schedule(struct inet_timewait_sock *tw, @@ -357,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, mod_timer(&twdr->tw_timer, jiffies + twdr->period); spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twsk_schedule); void inet_twdr_twcal_tick(unsigned long data) @@ -418,40 +474,48 @@ out: #endif spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); -void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, +void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { struct inet_timewait_sock *tw; struct sock *sk; struct hlist_nulls_node *node; - int h; + unsigned int slot; - local_bh_disable(); - for (h = 0; h < (hashinfo->ehash_size); h++) { - struct inet_ehash_bucket *head = - inet_ehash_bucket(hashinfo, h); - spinlock_t *lock = inet_ehash_lockp(hashinfo, h); + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + rcu_read_lock(); restart: - spin_lock(lock); - sk_nulls_for_each(sk, node, &head->twchain) { - + sk_nulls_for_each_rcu(sk, node, &head->twchain) { tw = inet_twsk(sk); - if (!net_eq(twsk_net(tw), net) || - tw->tw_family != family) + if ((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count)) continue; - atomic_inc(&tw->tw_refcnt); - spin_unlock(lock); + if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); inet_twsk_deschedule(tw, twdr); inet_twsk_put(tw); - - goto restart; + goto restart_rcu; } - spin_unlock(lock); + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); } - local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b1fbe18feb5a..6bcfe52a9c87 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -67,9 +67,6 @@ * ip_id_count: idlock */ -/* Exported for inet_getid inline function. */ -DEFINE_SPINLOCK(inet_peer_idlock); - static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height @@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) n->v4daddr = daddr; atomic_set(&n->refcnt, 1); atomic_set(&n->rid, 0); - n->ip_id_count = secure_ip_id(daddr); + atomic_set(&n->ip_id_count, secure_ip_id(daddr)); n->tcp_ts_stamp = 0; write_lock_bh(&peer_pool_lock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index a2991bc8e32e..56cdf68a074c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -25,6 +25,7 @@ #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/tcp.h> @@ -111,8 +112,8 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, - ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, + rt->u.dst.dev, ip_forward_finish); sr_failed: /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 575f9bd51ccd..75347ea70ea0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -32,6 +32,9 @@ #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/slab.h> +#include <net/route.h> +#include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -205,11 +208,35 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - dev_put(head->dev); + rcu_read_lock(); + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out_rcu_unlock; + + /* + * Only search router table for the head fragment, + * when defraging timeout at PRE_ROUTING HOOK. + */ + if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { + const struct iphdr *iph = ip_hdr(head); + int err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (unlikely(err)) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; + } + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); } out: spin_unlock(&qp->q.lock); @@ -563,7 +590,7 @@ out_oversize: printk(KERN_INFO "Oversized IP packet from %pI4.\n", &qp->saddr); out_fail: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; } @@ -603,7 +630,6 @@ static int zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { - .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), @@ -611,7 +637,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), @@ -619,26 +644,22 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_TIME, .procname = "ipfrag_time", .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { } }; static struct ctl_table ip4_frags_ctl_table[] = { { - .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, .procname = "ipfrag_secret_interval", .data = &ip4_frags.secret_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { .procname = "ipfrag_max_dist", @@ -651,13 +672,13 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ns_ctl_register(struct net *net) +static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -675,13 +696,13 @@ static int ip4_frags_ns_ctl_register(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } -static void ip4_frags_ns_ctl_unregister(struct net *net) +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -709,7 +730,7 @@ static inline void ip4_frags_ctl_register(void) } #endif -static int ipv4_frags_init_net(struct net *net) +static int __net_init ipv4_frags_init_net(struct net *net) { /* * Fragment cache limits. We will commit 256K at one time. Should we @@ -731,7 +752,7 @@ static int ipv4_frags_init_net(struct net *net) return ip4_frags_ns_ctl_register(net); } -static void ipv4_frags_exit_net(struct net *net) +static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 82c11dd10a62..32618e11076d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -66,10 +67,7 @@ solution, but it supposes maintaing new variable in ALL skb, even if no tunneling is used. - Current solution: t->recursion lock breaks dead loops. It looks - like dev->tbusy flag, but I preferred new variable, because - the semantics is different. One day, when hard_start_xmit - will be multithreaded we will have to use skb->encapsulation. + Current solution: HARD_TX_LOCK lock breaks dead loops. @@ -128,7 +126,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); #define HASH_SIZE 16 -static int ipgre_net_id; +static int ipgre_net_id __read_mostly; struct ipgre_net { struct ip_tunnel *tunnels[4][HASH_SIZE]; @@ -159,8 +157,13 @@ struct ipgre_net { #define tunnels_r tunnels[2] #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] +/* + * Locking : hash tables are protected by RCU and a spinlock + */ +static DEFINE_SPINLOCK(ipgre_lock); -static DEFINE_RWLOCK(ipgre_lock); +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /* Given src, dst and key, find appropriate for input tunnel. */ @@ -178,7 +181,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, ARPHRD_ETHER : ARPHRD_IPGRE; int score, cand_score = 4; - for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || key != t->parms.i_key || @@ -203,7 +206,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { if (remote != t->parms.iph.daddr || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) @@ -227,7 +230,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_l[h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { if ((local != t->parms.iph.saddr && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) || @@ -253,7 +256,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_wc[h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { if (t->parms.i_key != key || !(t->dev->flags & IFF_UP)) continue; @@ -279,8 +282,9 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, if (cand != NULL) return cand; - if (ign->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(ign->fb_tunnel_dev); + dev = ign->fb_tunnel_dev; + if (dev->flags & IFF_UP) + return netdev_priv(dev); return NULL; } @@ -314,10 +318,10 @@ static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { struct ip_tunnel **tp = ipgre_bucket(ign, t); + spin_lock_bh(&ipgre_lock); t->next = *tp; - write_lock_bh(&ipgre_lock); - *tp = t; - write_unlock_bh(&ipgre_lock); + rcu_assign_pointer(*tp, t); + spin_unlock_bh(&ipgre_lock); } static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) @@ -326,9 +330,9 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { if (t == *tp) { - write_lock_bh(&ipgre_lock); + spin_lock_bh(&ipgre_lock); *tp = t->next; - write_unlock_bh(&ipgre_lock); + spin_unlock_bh(&ipgre_lock); break; } } @@ -479,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) break; } - read_lock(&ipgre_lock); + rcu_read_lock(); t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, flags & GRE_KEY ? *(((__be32 *)p) + (grehlen / 4) - 1) : 0, @@ -497,8 +501,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) t->err_count = 1; t->err_time = jiffies; out: - read_unlock(&ipgre_lock); - return; + rcu_read_unlock(); } static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) @@ -534,7 +537,6 @@ static int ipgre_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; int offset = 4; __be16 gre_proto; - unsigned int len; if (!pskb_may_pull(skb, 16)) goto drop_nolock; @@ -576,7 +578,7 @@ static int ipgre_rcv(struct sk_buff *skb) gre_proto = *(__be16 *)(h + 2); - read_lock(&ipgre_lock); + rcu_read_lock(); if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { @@ -625,8 +627,6 @@ static int ipgre_rcv(struct sk_buff *skb) tunnel->i_seqno = seqno + 1; } - len = skb->len; - /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { @@ -640,32 +640,29 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - stats->rx_packets++; - stats->rx_bytes += len; - skb->dev = tunnel->dev; - skb_dst_drop(skb); - nf_reset(skb); + skb_tunnel_rx(skb, tunnel->dev); skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); netif_rx(skb); - read_unlock(&ipgre_lock); + rcu_read_unlock(); return(0); } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: - read_unlock(&ipgre_lock); + rcu_read_unlock(); drop_nolock: kfree_skb(skb); return(0); } -static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->dev->stats; + struct net_device_stats *stats = &dev->stats; + struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -678,11 +675,6 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) __be32 dst; int mtu; - if (tunnel->recursion++) { - stats->collisions++; - goto tx_error; - } - if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -794,7 +786,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } @@ -811,17 +803,18 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); - stats->tx_dropped++; + txq->tx_dropped++; dev_kfree_skb(skb); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -888,8 +881,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); IPTUNNEL_XMIT(); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); @@ -897,8 +889,7 @@ tx_error_icmp: tx_error: stats->tx_errors++; dev_kfree_skb(skb); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static int ipgre_tunnel_bind_dev(struct net_device *dev) @@ -1148,12 +1139,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (saddr) memcpy(&iph->saddr, saddr, 4); - - if (daddr) { + if (daddr) memcpy(&iph->daddr, daddr, 4); - return t->hlen; - } - if (iph->daddr && !ipv4_is_multicast(iph->daddr)) + if (iph->daddr) return t->hlen; return -t->hlen; @@ -1288,39 +1276,33 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) } -static struct net_protocol ipgre_protocol = { +static const struct net_protocol ipgre_protocol = { .handler = ipgre_rcv, .err_handler = ipgre_err, .netns_ok = 1, }; -static void ipgre_destroy_tunnels(struct ipgre_net *ign) +static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) { int prio; for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = ign->tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); + struct ip_tunnel *t = ign->tunnels[prio][h]; + + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = t->next; + } } } } -static int ipgre_init_net(struct net *net) +static int __net_init ipgre_init_net(struct net *net) { + struct ipgre_net *ign = net_generic(net, ipgre_net_id); int err; - struct ipgre_net *ign; - - err = -ENOMEM; - ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL); - if (ign == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipgre_net_id, ign); - if (err < 0) - goto err_assign; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", ipgre_tunnel_setup); @@ -1341,27 +1323,26 @@ static int ipgre_init_net(struct net *net) err_reg_dev: free_netdev(ign->fb_tunnel_dev); err_alloc_dev: - /* nothing */ -err_assign: - kfree(ign); -err_alloc: return err; } -static void ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_net(struct net *net) { struct ipgre_net *ign; + LIST_HEAD(list); ign = net_generic(net, ipgre_net_id); rtnl_lock(); - ipgre_destroy_tunnels(ign); + ipgre_destroy_tunnels(ign, &list); + unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ign); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit = ipgre_exit_net, + .id = &ipgre_net_id, + .size = sizeof(struct ipgre_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1475,14 +1456,14 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); - dev->netdev_ops = &ipgre_netdev_ops; + dev->netdev_ops = &ipgre_tap_netdev_ops; dev->destructor = free_netdev; dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; } -static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], +static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel *nt; @@ -1536,25 +1517,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (t->dev != dev) return -EEXIST; } else { - unsigned nflags = 0; - t = nt; - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; + if (dev->type != ARPHRD_ETHER) { + unsigned nflags = 0; - if ((dev->flags ^ nflags) & - (IFF_POINTOPOINT | IFF_BROADCAST)) - return -EINVAL; + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } ipgre_tunnel_unlink(ign, t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + } ipgre_tunnel_link(ign, t); netdev_state_change(dev); } @@ -1672,15 +1657,16 @@ static int __init ipgre_init(void) printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); - if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + err = register_pernet_device(&ipgre_net_ops); + if (err < 0) + return err; + + err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); - return -EAGAIN; + goto add_proto_failed; } - err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); - if (err < 0) - goto gen_device_failed; - err = rtnl_link_register(&ipgre_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1695,9 +1681,9 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); -gen_device_failed: inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); +add_proto_failed: + unregister_pernet_device(&ipgre_net_ops); goto out; } @@ -1705,9 +1691,9 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); + unregister_pernet_device(&ipgre_net_ops); } module_init(ipgre_init); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index db46b4b5b2b9..d930dc5e4d85 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -119,6 +119,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> @@ -161,10 +162,10 @@ int ip_call_ra_chain(struct sk_buff *skb) /* If socket is bound to an interface, only report * the packet if it came from that interface. */ - if (sk && inet_sk(sk)->num == protocol && + if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - sock_net(sk) == dev_net(dev)) { + net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { read_unlock(&ip_ra_lock); @@ -202,7 +203,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb) { int protocol = ip_hdr(skb)->protocol; int hash, raw; - struct net_protocol *ipprot; + const struct net_protocol *ipprot; resubmit: raw = raw_local_deliver(skb, protocol); @@ -265,7 +266,7 @@ int ip_local_deliver(struct sk_buff *skb) return 0; } - return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -330,8 +331,8 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (skb_dst(skb) == NULL) { - int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, - skb->dev); + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), @@ -443,7 +444,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); inhdr_error: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 94bf105ef3c9..ba9836c488ed 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -11,6 +11,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <asm/uaccess.h> #include <linux/skbuff.h> @@ -237,7 +238,6 @@ void ip_options_fragment(struct sk_buff * skb) opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; - return; } /* @@ -600,6 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; + unsigned long orefdst; int err; if (!opt->srr) @@ -623,16 +624,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = skb_rtable(skb); + orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { - ip_rt_put(rt2); - skb_dst_set(skb, &rt->u.dst); + skb_dst_drop(skb); + skb->_skb_refdst = orefdst; return -EINVAL; } - ip_rt_put(rt); + refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7ffcd96fe591..041d41df1224 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -51,6 +51,7 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/highmem.h> +#include <linux/slab.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -95,8 +96,8 @@ int __ip_local_out(struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, - dst_output); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); } int ip_local_out(struct sk_buff *skb) @@ -119,7 +120,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(newskb)); - netif_rx(newskb); + netif_rx_ni(newskb); return 0; } @@ -254,7 +255,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST) { - if ((!sk || inet_sk(sk)->mc_loop) + if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped @@ -264,13 +265,15 @@ int ip_mc_output(struct sk_buff *skb) This check is duplicated in ip_mr_input at the moment. */ - && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif - ) { + ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } @@ -285,12 +288,12 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, - newskb->dev, ip_dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, - ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, + skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -303,22 +306,24 @@ int ip_output(struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_queue_xmit(struct sk_buff *skb, int ipfragok) +int ip_queue_xmit(struct sk_buff *skb) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; + int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ + rcu_read_lock(); rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -329,21 +334,22 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) __be32 daddr; /* Use correct destination address if we have options. */ - daddr = inet->daddr; + daddr = inet->inet_daddr; if(opt && opt->srr) daddr = opt->faddr; { struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = inet->saddr, + .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; + { .sport = inet->inet_sport, + .dport = inet->inet_dport } } }; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times @@ -355,7 +361,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) } sk_setup_caps(sk, &rt->u.dst); } - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set_noref(skb, &rt->u.dst); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -366,7 +372,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) + if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -378,7 +384,7 @@ packet_routed: if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->daddr, rt, 0); + ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->u.dst, sk, @@ -387,9 +393,12 @@ packet_routed: skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - return ip_local_out(skb); + res = ip_local_out(skb); + rcu_read_unlock(); + return res; no_route: + rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; @@ -465,6 +474,10 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) + mtu -= nf_bridge_mtu_reduction(skb); +#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: @@ -500,8 +513,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; - truesizes += frag->truesize; } + truesizes += frag->truesize; } /* Everything is OK. Generate! */ @@ -845,7 +858,8 @@ int ip_append_data(struct sock *sk, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + mtu-exthdrlen); return -EMSGSIZE; } @@ -859,8 +873,10 @@ int ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; + skb = skb_peek_tail(&sk->sk_write_queue); + inet->cork.length += length; - if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, @@ -878,7 +894,7 @@ int ip_append_data(struct sock *sk, * adding appropriate IP header. */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + if (!skb) goto alloc_new_skb; while (length > 0) { @@ -1099,7 +1115,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); return -EMSGSIZE; } @@ -1107,7 +1123,8 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; - if ((sk->sk_protocol == IPPROTO_UDP) && + if ((size + skb->len > mtu) && + (sk->sk_protocol == IPPROTO_UDP) && (rt->u.dst.dev->features & NETIF_F_UFO)) { skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; @@ -1304,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk) err = ip_local_out(skb); if (err) { if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc7993e9061f..ce231780a2b1 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -23,6 +23,7 @@ #include <linux/icmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -245,7 +246,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra, **rap; - if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; @@ -286,12 +287,8 @@ int ip_ra_control(struct sock *sk, unsigned char on, void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { - struct inet_sock *inet = inet_sk(sk); struct sock_exterr_skb *serr; - if (!inet->recverr) - return; - skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; @@ -440,7 +437,7 @@ out: */ static int do_ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) + int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); int val = 0, err; @@ -451,7 +448,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<<IP_TTL) | (1<<IP_HDRINCL) | (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || + (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | + (1<<IP_MINTTL))) || optname == IP_MULTICAST_TTL || optname == IP_MULTICAST_ALL || optname == IP_MULTICAST_LOOP || @@ -480,7 +478,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_OPTIONS: { struct ip_options *opt = NULL; - if (optlen > 40 || optlen < 0) + if (optlen > 40) goto e_inval; err = ip_options_get_from_user(sock_net(sk), &opt, optval, optlen); @@ -492,7 +490,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && - inet->daddr != LOOPBACK4_IPV6)) { + inet->inet_daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) icsk->icsk_ext_hdr_len -= inet->opt->optlen; @@ -575,7 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->hdrincl = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < 0 || val > 3) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) goto e_inval; inet->pmtudisc = val; break; @@ -611,6 +609,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, * Check the arguments are allowable */ + if (optlen < sizeof(struct in_addr)) + goto e_inval; + err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_user(&mreq, optval, sizeof(mreq))) @@ -631,17 +632,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); - if (dev) { + if (dev) mreq.imr_ifindex = dev->ifindex; - dev_put(dev); - } } else - dev = __dev_get_by_index(sock_net(sk), mreq.imr_ifindex); + dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; if (!dev) break; + dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && @@ -934,6 +934,14 @@ mc_msf_out: inet->transparent = !!val; break; + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + default: err = -ENOPROTOOPT; break; @@ -946,8 +954,24 @@ e_inval: return -EINVAL; } +/** + * ip_queue_rcv_skb - Queue an skb into sock receive queue + * @sk: socket + * @skb: buffer + * + * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option + * is not set, we drop skb dst entry now, while dst cache line is hot. + */ +int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) + skb_dst_drop(skb); + return sock_queue_rcv_skb(sk, skb); +} +EXPORT_SYMBOL(ip_queue_rcv_skb); + int ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) + int optname, char __user *optval, unsigned int optlen) { int err; @@ -972,7 +996,7 @@ EXPORT_SYMBOL(ip_setsockopt); #ifdef CONFIG_COMPAT int compat_ip_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { int err; @@ -1178,8 +1202,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; - info.ipi_addr.s_addr = inet->rcv_saddr; - info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_addr.s_addr = inet->inet_rcv_saddr; + info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; info.ipi_ifindex = inet->mc_index; put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } @@ -1196,6 +1220,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet->transparent; break; + case IP_MINTTL: + val = inet->min_ttl; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 3262ce06294c..629067571f02 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -25,6 +25,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) { + struct net *net = dev_net(skb->dev); __be32 spi; struct iphdr *iph = (struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); @@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) return; spi = htonl(ntohs(ipch->cpi)); - x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; @@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) /* We always hold one tunnel user reference to indicate a tunnel */ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { + struct net *net = xs_net(x); struct xfrm_state *t; - t = xfrm_state_alloc(&init_net); + t = xfrm_state_alloc(net); if (t == NULL) goto out; @@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; + memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) goto error; @@ -82,10 +85,12 @@ error: */ static int ipcomp_tunnel_attach(struct xfrm_state *x) { + struct net *net = xs_net(x); int err = 0; struct xfrm_state *t; + u32 mark = x->mark.v & x->mark.m; - t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4, x->props.saddr.a4, IPPROTO_IPIP, AF_INET); if (!t) { t = ipcomp_tunnel_create(x); @@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x) if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } err = 0; out: return err; - -error_tunnel: - ipcomp_destroy(x); - goto out; } static const struct xfrm_type ipcomp_type = { @@ -146,7 +147,7 @@ static const struct xfrm_type ipcomp_type = { .output = ipcomp_output }; -static struct net_protocol ipcomp4_protocol = { +static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, .err_handler = ipcomp4_err, .no_policy = 1, diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f8d04c256454..b9d84e800cf4 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -53,6 +53,7 @@ #include <linux/root_dev.h> #include <linux/delay.h> #include <linux/nfs_fs.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> @@ -187,6 +188,16 @@ struct ic_device { static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static bool __init ic_device_match(struct net_device *dev) +{ + if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) + return true; + return false; +} + static int __init ic_open_devs(void) { struct ic_device *d, **last; @@ -207,10 +218,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (dev->flags & IFF_LOOPBACK) continue; - if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : - (!(dev->flags & IFF_LOOPBACK) && - (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && - strncmp(dev->name, "dummy", 5))) { + if (ic_device_match(dev)) { int able = 0; if (dev->mtu >= 364) able |= IC_BOOTP; @@ -228,7 +236,7 @@ static int __init ic_open_devs(void) } if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { rtnl_unlock(); - return -1; + return -ENOMEM; } d->dev = dev; *last = d; @@ -253,7 +261,7 @@ static int __init ic_open_devs(void) printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); else printk(KERN_ERR "IP-Config: No network devices available.\n"); - return -1; + return -ENODEV; } return 0; } @@ -968,7 +976,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str /* Is it a reply for the device we are configuring? */ if (b->xid != ic_dev_xid) { if (net_ratelimit()) - printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n"); + printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n"); goto drop_unlock; } @@ -1172,10 +1180,9 @@ static int __init ic_dynamic(void) schedule_timeout_uninterruptible(1); #ifdef IPCONFIG_DHCP /* DHCP isn't done until we get a DHCPACK. */ - if ((ic_got_reply & IC_BOOTP) - && (ic_proto_enabled & IC_USE_DHCP) - && ic_dhcp_msgtype != DHCPACK) - { + if ((ic_got_reply & IC_BOOTP) && + (ic_proto_enabled & IC_USE_DHCP) && + ic_dhcp_msgtype != DHCPACK) { ic_got_reply = 0; printk(","); continue; @@ -1304,6 +1311,32 @@ __be32 __init root_nfs_parse_addr(char *name) return addr; } +#define DEVICE_WAIT_MAX 12 /* 12 seconds */ + +static int __init wait_for_devices(void) +{ + int i; + + msleep(CONF_PRE_OPEN); + for (i = 0; i < DEVICE_WAIT_MAX; i++) { + struct net_device *dev; + int found = 0; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (ic_device_match(dev)) { + found = 1; + break; + } + } + rtnl_unlock(); + if (found) + return 0; + ssleep(1); + } + return -ENODEV; +} + /* * IP Autoconfig dispatcher. */ @@ -1314,6 +1347,7 @@ static int __init ip_auto_config(void) #ifdef IPCONFIG_DYNAMIC int retries = CONF_OPEN_RETRIES; #endif + int err; #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); @@ -1326,12 +1360,15 @@ static int __init ip_auto_config(void) #ifdef IPCONFIG_DYNAMIC try_try_again: #endif - /* Give hardware a chance to settle */ - msleep(CONF_PRE_OPEN); + /* Wait for devices to appear */ + err = wait_for_devices(); + if (err) + return err; /* Setup all network devices */ - if (ic_open_devs() < 0) - return -1; + err = ic_open_devs(); + if (err) + return err; /* Give drivers a chance to settle */ ssleep(CONF_POST_OPEN); @@ -1344,9 +1381,9 @@ static int __init ip_auto_config(void) */ if (ic_myaddr == NONE || #ifdef CONFIG_ROOT_NFS - (root_server_addr == NONE - && ic_servaddr == NONE - && ROOT_DEV == Root_NFS) || + (root_server_addr == NONE && + ic_servaddr == NONE && + ROOT_DEV == Root_NFS) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1447,7 +1484,7 @@ late_initcall(ip_auto_config); /* * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel - * command line parameter. See Documentation/filesystems/nfsroot.txt. + * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt. */ static int __init ic_proto_name(char *name) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 93e2b787da20..7fd636711037 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -95,6 +95,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -119,7 +120,7 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static int ipip_net_id; +static int ipip_net_id __read_mostly; struct ipip_net { struct ip_tunnel *tunnels_r_l[HASH_SIZE]; struct ip_tunnel *tunnels_r[HASH_SIZE]; @@ -130,11 +131,16 @@ struct ipip_net { struct net_device *fb_tunnel_dev; }; -static void ipip_fb_tunnel_init(struct net_device *dev); static void ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); -static DEFINE_RWLOCK(ipip_lock); +/* + * Locking : hash tables are protected by RCU and a spinlock + */ +static DEFINE_SPINLOCK(ipip_lock); + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) @@ -144,20 +150,21 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; - } - for (t = ipn->tunnels_r[h0]; t; t = t->next) { + + for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; - } - for (t = ipn->tunnels_l[h1]; t; t = t->next) { + + for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) return t; - } - if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + + t = rcu_dereference(ipn->tunnels_wc[0]); + if (t && (t->dev->flags&IFF_UP)) return t; return NULL; } @@ -193,9 +200,9 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { if (t == *tp) { - write_lock_bh(&ipip_lock); + spin_lock_bh(&ipip_lock); *tp = t->next; - write_unlock_bh(&ipip_lock); + spin_unlock_bh(&ipip_lock); break; } } @@ -205,10 +212,10 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel **tp = ipip_bucket(ipn, t); + spin_lock_bh(&ipip_lock); t->next = *tp; - write_lock_bh(&ipip_lock); - *tp = t; - write_unlock_bh(&ipip_lock); + rcu_assign_pointer(*tp, t); + spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -267,9 +274,9 @@ static void ipip_tunnel_uninit(struct net_device *dev) struct ipip_net *ipn = net_generic(net, ipip_net_id); if (dev == ipn->fb_tunnel_dev) { - write_lock_bh(&ipip_lock); + spin_lock_bh(&ipip_lock); ipn->tunnels_wc[0] = NULL; - write_unlock_bh(&ipip_lock); + spin_unlock_bh(&ipip_lock); } else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); @@ -318,7 +325,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) err = -ENOENT; - read_lock(&ipip_lock); + rcu_read_lock(); t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); if (t == NULL || t->parms.iph.daddr == 0) goto out; @@ -333,7 +340,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) t->err_count = 1; t->err_time = jiffies; out: - read_unlock(&ipip_lock); + rcu_read_unlock(); return err; } @@ -351,11 +358,11 @@ static int ipip_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); - read_lock(&ipip_lock); + rcu_read_lock(); if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr)) != NULL) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - read_unlock(&ipip_lock); + rcu_read_unlock(); kfree_skb(skb); return 0; } @@ -367,17 +374,14 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - tunnel->dev->stats.rx_packets++; - tunnel->dev->stats.rx_bytes += skb->len; - skb->dev = tunnel->dev; - skb_dst_drop(skb); - nf_reset(skb); + skb_tunnel_rx(skb, tunnel->dev); + ipip_ecn_decapsulate(iph, skb); netif_rx(skb); - read_unlock(&ipip_lock); + rcu_read_unlock(); return 0; } - read_unlock(&ipip_lock); + rcu_read_unlock(); return -1; } @@ -387,10 +391,11 @@ static int ipip_rcv(struct sk_buff *skb) * and that skb is filled properly by that function. */ -static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->dev->stats; + struct net_device_stats *stats = &dev->stats; + struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; @@ -402,11 +407,6 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) __be32 dst = tiph->daddr; int mtu; - if (tunnel->recursion++) { - stats->collisions++; - goto tx_error; - } - if (skb->protocol != htons(ETH_P_IP)) goto tx_error; @@ -443,25 +443,27 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (tiph->frag_off) + df |= old_iph->frag_off & htons(IP_DF); + + if (df) { mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - else - mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (mtu < 68) { - stats->collisions++; - ip_rt_put(rt); - goto tx_error; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + if (mtu < 68) { + stats->collisions++; + ip_rt_put(rt); + goto tx_error; + } - df |= (old_iph->frag_off&htons(IP_DF)); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - goto tx_error; + if ((old_iph->frag_off & htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } } if (tunnel->err_count > 0) { @@ -483,10 +485,9 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - stats->tx_dropped++; + txq->tx_dropped++; dev_kfree_skb(skb); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -523,16 +524,14 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); IPTUNNEL_XMIT(); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; dev_kfree_skb(skb); - tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static void ipip_tunnel_bind_dev(struct net_device *dev) @@ -728,7 +727,7 @@ static void ipip_tunnel_init(struct net_device *dev) ipip_tunnel_bind_dev(dev); } -static void ipip_fb_tunnel_init(struct net_device *dev) +static void __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -754,33 +753,27 @@ static struct xfrm_tunnel ipip_handler = { static const char banner[] __initconst = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; -static void ipip_destroy_tunnels(struct ipip_net *ipn) +static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) { int prio; for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = ipn->tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); + struct ip_tunnel *t = ipn->tunnels[prio][h]; + + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = t->next; + } } } } -static int ipip_init_net(struct net *net) +static int __net_init ipip_init_net(struct net *net) { + struct ipip_net *ipn = net_generic(net, ipip_net_id); int err; - struct ipip_net *ipn; - - err = -ENOMEM; - ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); - if (ipn == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipip_net_id, ipn); - if (err < 0) - goto err_assign; ipn->tunnels[0] = ipn->tunnels_wc; ipn->tunnels[1] = ipn->tunnels_l; @@ -807,27 +800,26 @@ err_reg_dev: free_netdev(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ -err_assign: - kfree(ipn); -err_alloc: return err; } -static void ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_net(struct net *net) { - struct ipip_net *ipn; + struct ipip_net *ipn = net_generic(net, ipip_net_id); + LIST_HEAD(list); - ipn = net_generic(net, ipip_net_id); rtnl_lock(); - ipip_destroy_tunnels(ipn); - unregister_netdevice(ipn->fb_tunnel_dev); + ipip_destroy_tunnels(ipn, &list); + unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); + unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ipn); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, + .id = &ipip_net_id, + .size = sizeof(struct ipip_net), }; static int __init ipip_init(void) @@ -836,15 +828,14 @@ static int __init ipip_init(void) printk(banner); - if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { + err = register_pernet_device(&ipip_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + unregister_pernet_device(&ipip_net_ops); printk(KERN_INFO "ipip init: can't register tunnel\n"); - return -EAGAIN; } - - err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); - if (err) - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); - return err; } @@ -853,7 +844,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); - unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); + unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9a8da5ed92b7..7f6273506eea 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -22,7 +22,7 @@ * overflow. * Carlos Picoto : PIMv1 Support * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header - * Relax this requrement to work with older peers. + * Relax this requirement to work with older peers. * */ @@ -47,6 +47,7 @@ #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -62,11 +63,40 @@ #include <net/ipip.h> #include <net/checksum.h> #include <net/netlink.h> +#include <net/fib_rules.h> #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) #define CONFIG_IP_PIMSM 1 #endif +struct mr_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + int mroute_do_assert; + int mroute_do_pim; +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) + int mroute_reg_vif_num; +#endif +}; + +struct ipmr_rule { + struct fib_rule common; +}; + +struct ipmr_result { + struct mr_table *mrt; +}; + /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ @@ -77,9 +107,7 @@ static DEFINE_RWLOCK(mrt_lock); * Multicast router control variables */ -#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL) - -static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -94,16 +122,217 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; -static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct net *net, +static struct mr_table *ipmr_new_table(struct net *net, u32 id); +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local); +static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); -static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm); +static void ipmr_expire_process(unsigned long arg); -#ifdef CONFIG_IP_PIMSM_V2 -static struct net_protocol pim_protocol; +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES +#define ipmr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) + +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + ipmr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ipmr_fib_lookup(struct net *net, struct flowi *flp, + struct mr_table **mrt) +{ + struct ipmr_result res; + struct fib_lookup_arg arg = { .result = &res, }; + int err; + + err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ipmr_result *res = arg->result; + struct mr_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ipmr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + return 1; +} + +static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { + .family = RTNL_FAMILY_IPMR, + .rule_size = sizeof(struct ipmr_rule), + .addr_size = sizeof(u32), + .action = ipmr_rule_action, + .match = ipmr_rule_match, + .configure = ipmr_rule_configure, + .compare = ipmr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ipmr_rule_fill, + .nlgroup = RTNLGRP_IPV4_RULE, + .policy = ipmr_rule_policy, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr_table *mrt; + int err; + + ops = fib_rules_register(&ipmr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + INIT_LIST_HEAD(&net->ipv4.mr_tables); + + mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0); + if (err < 0) + goto err2; + + net->ipv4.mr_rules_ops = ops; + return 0; + +err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ipmr_rules_exit(struct net *net) +{ + struct mr_table *mrt, *next; + + list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { + list_del(&mrt->list); + kfree(mrt); + } + fib_rules_unregister(net->ipv4.mr_rules_ops); +} +#else +#define ipmr_for_each_table(mrt, net) \ + for (mrt = net->ipv4.mrt; mrt; mrt = NULL) + +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + return net->ipv4.mrt; +} + +static int ipmr_fib_lookup(struct net *net, struct flowi *flp, + struct mr_table **mrt) +{ + *mrt = net->ipv4.mrt; + return 0; +} + +static int __net_init ipmr_rules_init(struct net *net) +{ + net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); + return net->ipv4.mrt ? 0 : -ENOMEM; +} + +static void __net_exit ipmr_rules_exit(struct net *net) +{ + kfree(net->ipv4.mrt); +} #endif -static struct timer_list ipmr_expire_timer; +static struct mr_table *ipmr_new_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + unsigned int i; + + mrt = ipmr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + write_pnet(&mrt->net, net); + mrt->id = id; + + /* Forwarding cache */ + for (i = 0; i < MFC_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IP_PIMSM + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); +#endif + return mrt; +} /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ @@ -201,18 +430,30 @@ failure: #ifdef CONFIG_IP_PIMSM -static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); + struct mr_table *mrt; + struct flowi fl = { + .oif = dev->ifindex, + .iif = skb->skb_iif, + .mark = skb->mark, + }; + int err; + + err = ipmr_fib_lookup(net, &fl, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num, - IGMPMSG_WHOLEPKT); + ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static const struct net_device_ops reg_vif_netdev_ops = { @@ -229,12 +470,18 @@ static void reg_vif_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static struct net_device *ipmr_reg_vif(struct net *net) +static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; struct in_device *in_dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT_TABLE_DEFAULT) + sprintf(name, "pimreg"); + else + sprintf(name, "pimreg%u", mrt->id); - dev = alloc_netdev(0, "pimreg", reg_vif_setup); + dev = alloc_netdev(0, name, reg_vif_setup); if (dev == NULL) return NULL; @@ -279,16 +526,17 @@ failure: * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(struct net *net, int vifi, int notify) +static int vif_delete(struct mr_table *mrt, int vifi, int notify, + struct list_head *head) { struct vif_device *v; struct net_device *dev; struct in_device *in_dev; - if (vifi < 0 || vifi >= net->ipv4.maxvif) + if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; - v = &net->ipv4.vif_table[vifi]; + v = &mrt->vif_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; @@ -300,17 +548,17 @@ static int vif_delete(struct net *net, int vifi, int notify) } #ifdef CONFIG_IP_PIMSM - if (vifi == net->ipv4.mroute_reg_vif_num) - net->ipv4.mroute_reg_vif_num = -1; + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; #endif - if (vifi+1 == net->ipv4.maxvif) { + if (vifi+1 == mrt->maxvif) { int tmp; for (tmp=vifi-1; tmp>=0; tmp--) { - if (VIF_EXISTS(net, tmp)) + if (VIF_EXISTS(mrt, tmp)) break; } - net->ipv4.maxvif = tmp+1; + mrt->maxvif = tmp+1; } write_unlock_bh(&mrt_lock); @@ -323,7 +571,7 @@ static int vif_delete(struct net *net, int vifi, int notify) } if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) - unregister_netdevice(dev); + unregister_netdevice_queue(dev, head); dev_put(dev); return 0; @@ -331,7 +579,6 @@ static int vif_delete(struct net *net, int vifi, int notify) static inline void ipmr_cache_free(struct mfc_cache *c) { - release_net(mfc_net(c)); kmem_cache_free(mrt_cachep, c); } @@ -339,13 +586,13 @@ static inline void ipmr_cache_free(struct mfc_cache *c) and reporting error to netlink readers. */ -static void ipmr_destroy_unres(struct mfc_cache *c) +static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) { + struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; struct nlmsgerr *e; - struct net *net = mfc_net(c); - atomic_dec(&net->ipv4.cache_resolve_queue_len); + atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { @@ -366,42 +613,40 @@ static void ipmr_destroy_unres(struct mfc_cache *c) } -/* Single timer process for all the unresolved queue. */ +/* Timer process for the unresolved queue. */ -static void ipmr_expire_process(unsigned long dummy) +static void ipmr_expire_process(unsigned long arg) { + struct mr_table *mrt = (struct mr_table *)arg; unsigned long now; unsigned long expires; - struct mfc_cache *c, **cp; + struct mfc_cache *c, *next; if (!spin_trylock(&mfc_unres_lock)) { - mod_timer(&ipmr_expire_timer, jiffies+HZ/10); + mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); return; } - if (mfc_unres_queue == NULL) + if (list_empty(&mrt->mfc_unres_queue)) goto out; now = jiffies; expires = 10*HZ; - cp = &mfc_unres_queue; - while ((c=*cp) != NULL) { + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; - cp = &c->next; continue; } - *cp = c->next; - - ipmr_destroy_unres(c); + list_del(&c->list); + ipmr_destroy_unres(mrt, c); } - if (mfc_unres_queue != NULL) - mod_timer(&ipmr_expire_timer, jiffies + expires); + if (!list_empty(&mrt->mfc_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); out: spin_unlock(&mfc_unres_lock); @@ -409,17 +654,17 @@ out: /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) +static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, + unsigned char *ttls) { int vifi; - struct net *net = mfc_net(cache); cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); - for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) { - if (VIF_EXISTS(net, vifi) && + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) @@ -430,16 +675,17 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) } } -static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) +static int vif_add(struct net *net, struct mr_table *mrt, + struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; - struct vif_device *v = &net->ipv4.vif_table[vifi]; + struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? */ - if (VIF_EXISTS(net, vifi)) + if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { @@ -449,9 +695,9 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ - if (net->ipv4.mroute_reg_vif_num >= 0) + if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; - dev = ipmr_reg_vif(net); + dev = ipmr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); @@ -473,8 +719,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) return err; } break; + + case VIFF_USE_IFINDEX: case 0: - dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); + if (vifc->vifc_flags == VIFF_USE_IFINDEX) { + dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); + if (dev && dev->ip_ptr == NULL) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + } else + dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); + if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -487,8 +743,10 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + dev_put(dev); return -EADDRNOTAVAIL; + } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); @@ -515,49 +773,47 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) v->dev = dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) - net->ipv4.mroute_reg_vif_num = vifi; + mrt->mroute_reg_vif_num = vifi; #endif - if (vifi+1 > net->ipv4.maxvif) - net->ipv4.maxvif = vifi+1; + if (vifi+1 > mrt->maxvif) + mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); return 0; } -static struct mfc_cache *ipmr_cache_find(struct net *net, +static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) { int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) { - if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) - break; + list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { + if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) + return c; } - return c; + return NULL; } /* * Allocate a multicast cache entry */ -static struct mfc_cache *ipmr_cache_alloc(struct net *net) +static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c == NULL) return NULL; c->mfc_un.res.minvif = MAXVIFS; - mfc_net_set(c, net); return c; } -static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) +static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c == NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10*HZ; - mfc_net_set(c, net); return c; } @@ -565,7 +821,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) * A cache entry has gone into a resolved state from queued */ -static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) +static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, + struct mfc_cache *uc, struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; @@ -578,7 +835,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); - if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { nlh->nlmsg_len = (skb_tail_pointer(skb) - (u8 *)nlh); } else { @@ -590,9 +847,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); } else - ip_mr_forward(skb, c, 0); + ip_mr_forward(net, mrt, skb, c, 0); } } @@ -603,7 +860,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Called under mrt_lock. */ -static int ipmr_cache_report(struct net *net, +static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { struct sk_buff *skb; @@ -636,7 +893,7 @@ static int ipmr_cache_report(struct net *net, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; - msg->im_vif = net->ipv4.mroute_reg_vif_num; + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -668,7 +925,7 @@ static int ipmr_cache_report(struct net *net, skb->transport_header = skb->network_header; } - if (net->ipv4.mroute_sk == NULL) { + if (mrt->mroute_sk == NULL) { kfree_skb(skb); return -EINVAL; } @@ -676,7 +933,7 @@ static int ipmr_cache_report(struct net *net, /* * Deliver to mrouted */ - ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb); + ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); @@ -691,27 +948,29 @@ static int ipmr_cache_report(struct net *net, */ static int -ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) +ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) { + bool found = false; int err; struct mfc_cache *c; const struct iphdr *iph = ip_hdr(skb); spin_lock_bh(&mfc_unres_lock); - for (c=mfc_unres_queue; c; c=c->next) { - if (net_eq(mfc_net(c), net) && - c->mfc_mcastgrp == iph->daddr && - c->mfc_origin == iph->saddr) + list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + if (c->mfc_mcastgrp == iph->daddr && + c->mfc_origin == iph->saddr) { + found = true; break; + } } - if (c == NULL) { + if (!found) { /* * Create a new entry if allowable */ - if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 || - (c = ipmr_cache_alloc_unres(net)) == NULL) { + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ipmr_cache_alloc_unres()) == NULL) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -728,7 +987,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) /* * Reflect first query at mrouted. */ - err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE); + err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker @@ -740,11 +999,11 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) return err; } - atomic_inc(&net->ipv4.cache_resolve_queue_len); - c->next = mfc_unres_queue; - mfc_unres_queue = c; + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc_unres_queue); - mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); + if (atomic_read(&mrt->cache_resolve_queue_len) == 1) + mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); } /* @@ -766,19 +1025,18 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) +static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) { int line; - struct mfc_cache *c, **cp; + struct mfc_cache *c, *next; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp = &net->ipv4.mfc_cache_array[line]; - (c = *cp) != NULL; cp = &c->next) { + list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { write_lock_bh(&mrt_lock); - *cp = c->next; + list_del(&c->list); write_unlock_bh(&mrt_lock); ipmr_cache_free(c); @@ -788,24 +1046,30 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) return -ENOENT; } -static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) +static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, + struct mfcctl *mfc, int mrtsock) { + bool found = false; int line; - struct mfc_cache *uc, *c, **cp; + struct mfc_cache *uc, *c; + + if (mfc->mfcc_parent >= MAXVIFS) + return -ENFILE; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp = &net->ipv4.mfc_cache_array[line]; - (c = *cp) != NULL; cp = &c->next) { + list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + found = true; break; + } } - if (c != NULL) { + if (found) { write_lock_bh(&mrt_lock); c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); @@ -815,43 +1079,42 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; - c = ipmr_cache_alloc(net); + c = ipmr_cache_alloc(); if (c == NULL) return -ENOMEM; c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_lock_bh(&mrt_lock); - c->next = net->ipv4.mfc_cache_array[line]; - net->ipv4.mfc_cache_array[line] = c; + list_add(&c->list, &mrt->mfc_cache_array[line]); write_unlock_bh(&mrt_lock); /* * Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ + found = false; spin_lock_bh(&mfc_unres_lock); - for (cp = &mfc_unres_queue; (uc=*cp) != NULL; - cp = &uc->next) { - if (net_eq(mfc_net(uc), net) && - uc->mfc_origin == c->mfc_origin && + list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { - *cp = uc->next; - atomic_dec(&net->ipv4.cache_resolve_queue_len); + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; break; } } - if (mfc_unres_queue == NULL) - del_timer(&ipmr_expire_timer); + if (list_empty(&mrt->mfc_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); - if (uc) { - ipmr_cache_resolve(uc, c); + if (found) { + ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } return 0; @@ -861,51 +1124,41 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct net *net) +static void mroute_clean_tables(struct mr_table *mrt) { int i; + LIST_HEAD(list); + struct mfc_cache *c, *next; /* * Shut down all active vif entries */ - for (i = 0; i < net->ipv4.maxvif; i++) { - if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) - vif_delete(net, i, 0); + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif_table[i].flags&VIFF_STATIC)) + vif_delete(mrt, i, 0, &list); } + unregister_netdevice_many(&list); /* * Wipe the cache */ - for (i=0; i<MFC_LINES; i++) { - struct mfc_cache *c, **cp; - - cp = &net->ipv4.mfc_cache_array[i]; - while ((c = *cp) != NULL) { - if (c->mfc_flags&MFC_STATIC) { - cp = &c->next; + for (i = 0; i < MFC_LINES; i++) { + list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { + if (c->mfc_flags&MFC_STATIC) continue; - } write_lock_bh(&mrt_lock); - *cp = c->next; + list_del(&c->list); write_unlock_bh(&mrt_lock); ipmr_cache_free(c); } } - if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) { - struct mfc_cache *c, **cp; - + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - cp = &mfc_unres_queue; - while ((c = *cp) != NULL) { - if (!net_eq(mfc_net(c), net)) { - cp = &c->next; - continue; - } - *cp = c->next; - - ipmr_destroy_unres(c); + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { + list_del(&c->list); + ipmr_destroy_unres(mrt, c); } spin_unlock_bh(&mfc_unres_lock); } @@ -914,16 +1167,19 @@ static void mroute_clean_tables(struct net *net) static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); + struct mr_table *mrt; rtnl_lock(); - if (sk == net->ipv4.mroute_sk) { - IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; + ipmr_for_each_table(mrt, net) { + if (sk == mrt->mroute_sk) { + IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - write_lock_bh(&mrt_lock); - net->ipv4.mroute_sk = NULL; - write_unlock_bh(&mrt_lock); + write_lock_bh(&mrt_lock); + mrt->mroute_sk = NULL; + write_unlock_bh(&mrt_lock); - mroute_clean_tables(net); + mroute_clean_tables(mrt); + } } rtnl_unlock(); } @@ -935,28 +1191,33 @@ static void mrtsock_destruct(struct sock *sk) * MOSPF/PIM router set up we can clean this up. */ -int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen) +int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) { int ret; struct vifctl vif; struct mfcctl mfc; struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; if (optname != MRT_INIT) { - if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) + if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT_INIT: if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->num != IPPROTO_IGMP) + inet_sk(sk)->inet_num != IPPROTO_IGMP) return -EOPNOTSUPP; if (optlen != sizeof(int)) return -ENOPROTOOPT; rtnl_lock(); - if (net->ipv4.mroute_sk) { + if (mrt->mroute_sk) { rtnl_unlock(); return -EADDRINUSE; } @@ -964,7 +1225,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { write_lock_bh(&mrt_lock); - net->ipv4.mroute_sk = sk; + mrt->mroute_sk = sk; write_unlock_bh(&mrt_lock); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; @@ -972,7 +1233,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int rtnl_unlock(); return ret; case MRT_DONE: - if (sk != net->ipv4.mroute_sk) + if (sk != mrt->mroute_sk) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -985,9 +1246,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); + ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); } else { - ret = vif_delete(net, vif.vifc_vifi, 0); + ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } rtnl_unlock(); return ret; @@ -1004,9 +1265,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -EFAULT; rtnl_lock(); if (optname == MRT_DEL_MFC) - ret = ipmr_mfc_delete(net, &mfc); + ret = ipmr_mfc_delete(mrt, &mfc); else - ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk); + ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); rtnl_unlock(); return ret; /* @@ -1017,7 +1278,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int int v; if (get_user(v,(int __user *)optval)) return -EFAULT; - net->ipv4.mroute_do_assert = (v) ? 1 : 0; + mrt->mroute_do_assert = (v) ? 1 : 0; return 0; } #ifdef CONFIG_IP_PIMSM @@ -1031,14 +1292,35 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int rtnl_lock(); ret = 0; - if (v != net->ipv4.mroute_do_pim) { - net->ipv4.mroute_do_pim = v; - net->ipv4.mroute_do_assert = v; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; } rtnl_unlock(); return ret; } #endif +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + case MRT_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + if (sk == mrt->mroute_sk) + return -EBUSY; + + rtnl_lock(); + ret = 0; + if (!ipmr_new_table(net, v)) + ret = -ENOMEM; + raw_sk(sk)->ipmr_table = v; + rtnl_unlock(); + return ret; + } +#endif /* * Spurious command, or MRT_VERSION which you cannot * set. @@ -1057,6 +1339,11 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int int olr; int val; struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM @@ -1078,10 +1365,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int val = 0x0305; #ifdef CONFIG_IP_PIMSM else if (optname == MRT_PIM) - val = net->ipv4.mroute_do_pim; + val = mrt->mroute_do_pim; #endif else - val = net->ipv4.mroute_do_assert; + val = mrt->mroute_do_assert; if (copy_to_user(optval, &val, olr)) return -EFAULT; return 0; @@ -1098,16 +1385,21 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; - if (vr.vifi >= net->ipv4.maxvif) + if (vr.vifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &net->ipv4.vif_table[vr.vifi]; - if (VIF_EXISTS(net, vr.vifi)) { + vif = &mrt->vif_table[vr.vifi]; + if (VIF_EXISTS(mrt, vr.vifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1125,7 +1417,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; read_lock(&mrt_lock); - c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; @@ -1148,19 +1440,22 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v { struct net_device *dev = ptr; struct net *net = dev_net(dev); + struct mr_table *mrt; struct vif_device *v; int ct; - - if (!net_eq(dev_net(dev), net)) - return NOTIFY_DONE; + LIST_HEAD(list); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - v = &net->ipv4.vif_table[0]; - for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { - if (v->dev == dev) - vif_delete(net, ct, 1); + + ipmr_for_each_table(mrt, net) { + v = &mrt->vif_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + vif_delete(mrt, ct, 1, &list); + } } + unregister_netdevice_many(&list); return NOTIFY_DONE; } @@ -1217,11 +1512,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) * Processing handlers for ipmr_forward */ -static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) +static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *c, int vifi) { - struct net *net = mfc_net(c); const struct iphdr *iph = ip_hdr(skb); - struct vif_device *vif = &net->ipv4.vif_table[vifi]; + struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct rtable *rt; int encap = 0; @@ -1235,7 +1530,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->bytes_out += skb->len; vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT); + ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } #endif @@ -1309,21 +1604,20 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev, ipmr_forward_finish); return; out_free: kfree_skb(skb); - return; } -static int ipmr_find_vif(struct net_device *dev) +static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) { - struct net *net = dev_net(dev); int ct; - for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) { - if (net->ipv4.vif_table[ct].dev == dev) + + for (ct = mrt->maxvif-1; ct >= 0; ct--) { + if (mrt->vif_table[ct].dev == dev) break; } return ct; @@ -1331,11 +1625,12 @@ static int ipmr_find_vif(struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ -static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local) { int psend = -1; int vif, ct; - struct net *net = mfc_net(cache); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; @@ -1344,7 +1639,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (net->ipv4.vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != skb->dev) { int true_vifi; if (skb_rtable(skb)->fl.iif == 0) { @@ -1363,26 +1658,26 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local } cache->mfc_un.res.wrong_if++; - true_vifi = ipmr_find_vif(skb->dev); + true_vifi = ipmr_find_vif(mrt, skb->dev); - if (true_vifi >= 0 && net->ipv4.mroute_do_assert && + if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ - (net->ipv4.mroute_do_pim || + (mrt->mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { cache->mfc_un.res.last_assert = jiffies; - ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF); + ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; } - net->ipv4.vif_table[vif].pkt_in++; - net->ipv4.vif_table[vif].bytes_in += skb->len; + mrt->vif_table[vif].pkt_in++; + mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame @@ -1392,7 +1687,8 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(skb2, cache, psend); + ipmr_queue_xmit(net, mrt, skb2, cache, + psend); } psend = ct; } @@ -1401,9 +1697,9 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(skb2, cache, psend); + ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { - ipmr_queue_xmit(skb, cache, psend); + ipmr_queue_xmit(net, mrt, skb, cache, psend); return 0; } } @@ -1424,6 +1720,8 @@ int ip_mr_input(struct sk_buff *skb) struct mfc_cache *cache; struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + struct mr_table *mrt; + int err; /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. @@ -1431,6 +1729,12 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->flags&IPSKB_FORWARDED) goto dont_forward; + err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) @@ -1443,9 +1747,9 @@ int ip_mr_input(struct sk_buff *skb) that we can forward NO IGMP messages. */ read_lock(&mrt_lock); - if (net->ipv4.mroute_sk) { + if (mrt->mroute_sk) { nf_reset(skb); - raw_rcv(net->ipv4.mroute_sk, skb); + raw_rcv(mrt->mroute_sk, skb); read_unlock(&mrt_lock); return 0; } @@ -1454,7 +1758,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1472,19 +1776,19 @@ int ip_mr_input(struct sk_buff *skb) skb = skb2; } - vif = ipmr_find_vif(skb->dev); + vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(net, vif, skb); + int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); - return err; + return err2; } read_unlock(&mrt_lock); kfree_skb(skb); return -ENODEV; } - ip_mr_forward(skb, cache, local); + ip_mr_forward(net, mrt, skb, cache, local); read_unlock(&mrt_lock); @@ -1501,11 +1805,11 @@ dont_forward: } #ifdef CONFIG_IP_PIMSM -static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) +static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, + unsigned int pimlen) { struct net_device *reg_dev = NULL; struct iphdr *encap; - struct net *net = dev_net(skb->dev); encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* @@ -1520,8 +1824,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) return 1; read_lock(&mrt_lock); - if (net->ipv4.mroute_reg_vif_num >= 0) - reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; + if (mrt->mroute_reg_vif_num >= 0) + reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -1532,14 +1836,12 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) skb->mac_header = skb->network_header; skb_pull(skb, (u8*)encap - skb->data); skb_reset_network_header(skb); - skb->dev = reg_dev; skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; - skb_dst_drop(skb); - reg_dev->stats.rx_bytes += skb->len; - reg_dev->stats.rx_packets++; - nf_reset(skb); + + skb_tunnel_rx(skb, reg_dev); + netif_rx(skb); dev_put(reg_dev); @@ -1556,17 +1858,21 @@ int pim_rcv_v1(struct sk_buff * skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); + struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = igmp_hdr(skb); - if (!net->ipv4.mroute_do_pim || + if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + goto drop; + + if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; - if (__pim_rcv(skb, sizeof(*pim))) { + if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } @@ -1578,6 +1884,8 @@ drop: static int pim_rcv(struct sk_buff * skb) { struct pimreghdr *pim; + struct net *net = dev_net(skb->dev); + struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; @@ -1589,7 +1897,10 @@ static int pim_rcv(struct sk_buff * skb) csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; - if (__pim_rcv(skb, sizeof(*pim))) { + if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + goto drop; + + if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } @@ -1597,29 +1908,31 @@ drop: } #endif -static int -ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; - struct net *net = mfc_net(c); - struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; - if (dev) - RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent >= MAXVIFS) + return -ENOENT; + + if (VIF_EXISTS(mrt, c->mfc_parent)) + RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (c->mfc_un.res.ttls[ct] < 255) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) goto rtattr_failure; nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; + nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -1637,11 +1950,16 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, int nowait) { int err; + struct mr_table *mrt; struct mfc_cache *cache; struct rtable *rt = skb_rtable(skb); + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + read_lock(&mrt_lock); - cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); if (cache == NULL) { struct sk_buff *skb2; @@ -1655,7 +1973,7 @@ int ipmr_get_route(struct net *net, } dev = skb->dev; - if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { + if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); return -ENODEV; } @@ -1672,24 +1990,107 @@ int ipmr_get_route(struct net *net, iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; iph->version = 0; - err = ipmr_cache_unresolved(net, vif, skb2); + err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); return err; } if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; - err = ipmr_fill_mroute(skb, cache, rtm); + err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); return err; } +static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 pid, u32 seq, struct mfc_cache *c) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IPMR; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + NLA_PUT_U32(skb, RTA_TABLE, mrt->id); + rtm->rtm_type = RTN_MULTICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = 0; + + NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); + NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); + + if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mfc_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + read_lock(&mrt_lock); + ipmr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + mfc) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} + #ifdef CONFIG_PROC_FS /* * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; + struct mr_table *mrt; int ct; }; @@ -1697,11 +2098,13 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net, struct ipmr_vif_iter *iter, loff_t pos) { - for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) { - if (!VIF_EXISTS(net, iter->ct)) + struct mr_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!VIF_EXISTS(mrt, iter->ct)) continue; if (pos-- == 0) - return &net->ipv4.vif_table[iter->ct]; + return &mrt->vif_table[iter->ct]; } return NULL; } @@ -1709,7 +2112,15 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net, static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { + struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; read_lock(&mrt_lock); return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) @@ -1720,15 +2131,16 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); + struct mr_table *mrt = iter->mrt; ++*pos; if (v == SEQ_START_TOKEN) return ipmr_vif_seq_idx(net, iter, 0); - while (++iter->ct < net->ipv4.maxvif) { - if (!VIF_EXISTS(net, iter->ct)) + while (++iter->ct < mrt->maxvif) { + if (!VIF_EXISTS(mrt, iter->ct)) continue; - return &net->ipv4.vif_table[iter->ct]; + return &mrt->vif_table[iter->ct]; } return NULL; } @@ -1741,7 +2153,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { - struct net *net = seq_file_net(seq); + struct ipmr_vif_iter *iter = seq->private; + struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -1752,7 +2165,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", - vif - net->ipv4.vif_table, + vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); @@ -1783,7 +2196,8 @@ static const struct file_operations ipmr_vif_fops = { struct ipmr_mfc_iter { struct seq_net_private p; - struct mfc_cache **cache; + struct mr_table *mrt; + struct list_head *cache; int ct; }; @@ -1791,22 +2205,22 @@ struct ipmr_mfc_iter { static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { + struct mr_table *mrt = it->mrt; struct mfc_cache *mfc; - it->cache = net->ipv4.mfc_cache_array; read_lock(&mrt_lock); - for (it->ct = 0; it->ct < MFC_LINES; it->ct++) - for (mfc = net->ipv4.mfc_cache_array[it->ct]; - mfc; mfc = mfc->next) + for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { + it->cache = &mrt->mfc_cache_array[it->ct]; + list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) return mfc; + } read_unlock(&mrt_lock); - it->cache = &mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); - for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) - if (net_eq(mfc_net(mfc), net) && - pos-- == 0) + it->cache = &mrt->mfc_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) return mfc; spin_unlock_bh(&mfc_unres_lock); @@ -1819,7 +2233,13 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + it->mrt = mrt; it->cache = NULL; it->ct = 0; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) @@ -1831,37 +2251,36 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct mfc_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; ++*pos; if (v == SEQ_START_TOKEN) return ipmr_mfc_seq_idx(net, seq->private, 0); - if (mfc->next) - return mfc->next; + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc_cache, list); - if (it->cache == &mfc_unres_queue) + if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; - BUG_ON(it->cache != net->ipv4.mfc_cache_array); + BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]); while (++it->ct < MFC_LINES) { - mfc = net->ipv4.mfc_cache_array[it->ct]; - if (mfc) - return mfc; + it->cache = &mrt->mfc_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc_cache, list); } /* exhausted cache_array, show unresolved */ read_unlock(&mrt_lock); - it->cache = &mfc_unres_queue; + it->cache = &mrt->mfc_unres_queue; it->ct = 0; spin_lock_bh(&mfc_unres_lock); - mfc = mfc_unres_queue; - while (mfc && !net_eq(mfc_net(mfc), net)) - mfc = mfc->next; - if (mfc) - return mfc; + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc_cache, list); end_of_list: spin_unlock_bh(&mfc_unres_lock); @@ -1873,18 +2292,17 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) { struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; - if (it->cache == &mfc_unres_queue) + if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == net->ipv4.mfc_cache_array) + else if (it->cache == &mrt->mfc_cache_array[it->ct]) read_unlock(&mrt_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; - struct net *net = seq_file_net(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -1892,20 +2310,21 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) } else { const struct mfc_cache *mfc = v; const struct ipmr_mfc_iter *it = seq->private; + const struct mr_table *mrt = it->mrt; - seq_printf(seq, "%08lX %08lX %-3hd", - (unsigned long) mfc->mfc_mcastgrp, - (unsigned long) mfc->mfc_origin, + seq_printf(seq, "%08X %08X %-3hd", + (__force u32) mfc->mfc_mcastgrp, + (__force u32) mfc->mfc_origin, mfc->mfc_parent); - if (it->cache != &mfc_unres_queue) { + if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->mfc_un.res.pkt, mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; n < mfc->mfc_un.res.maxvif; n++ ) { - if (VIF_EXISTS(net, n) && + if (VIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -1945,7 +2364,7 @@ static const struct file_operations ipmr_mfc_fops = { #endif #ifdef CONFIG_IP_PIMSM_V2 -static struct net_protocol pim_protocol = { +static const struct net_protocol pim_protocol = { .handler = pim_rcv, .netns_ok = 1, }; @@ -1957,27 +2376,11 @@ static struct net_protocol pim_protocol = { */ static int __net_init ipmr_net_init(struct net *net) { - int err = 0; + int err; - net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device), - GFP_KERNEL); - if (!net->ipv4.vif_table) { - err = -ENOMEM; + err = ipmr_rules_init(net); + if (err < 0) goto fail; - } - - /* Forwarding cache */ - net->ipv4.mfc_cache_array = kcalloc(MFC_LINES, - sizeof(struct mfc_cache *), - GFP_KERNEL); - if (!net->ipv4.mfc_cache_array) { - err = -ENOMEM; - goto fail_mfc_cache; - } - -#ifdef CONFIG_IP_PIMSM - net->ipv4.mroute_reg_vif_num = -1; -#endif #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -1992,10 +2395,8 @@ static int __net_init ipmr_net_init(struct net *net) proc_cache_fail: proc_net_remove(net, "ip_mr_vif"); proc_vif_fail: - kfree(net->ipv4.mfc_cache_array); + ipmr_rules_exit(net); #endif -fail_mfc_cache: - kfree(net->ipv4.vif_table); fail: return err; } @@ -2006,8 +2407,7 @@ static void __net_exit ipmr_net_exit(struct net *net) proc_net_remove(net, "ip_mr_cache"); proc_net_remove(net, "ip_mr_vif"); #endif - kfree(net->ipv4.mfc_cache_array); - kfree(net->ipv4.vif_table); + ipmr_rules_exit(net); } static struct pernet_operations ipmr_net_ops = { @@ -2030,7 +2430,6 @@ int __init ip_mr_init(void) if (err) goto reg_pernet_fail; - setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); err = register_netdevice_notifier(&ip_mr_notifier); if (err) goto reg_notif_fail; @@ -2041,6 +2440,7 @@ int __init ip_mr_init(void) goto add_proto_fail; } #endif + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); return 0; #ifdef CONFIG_IP_PIMSM_V2 @@ -2048,7 +2448,6 @@ add_proto_fail: unregister_netdevice_notifier(&ip_mr_notifier); #endif reg_notif_fail: - del_timer(&ipmr_expire_timer); unregister_pernet_subsys(&ipmr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 1725dc0ef688..07de855e2175 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -4,6 +4,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/ip.h> #include <linux/skbuff.h> +#include <linux/gfp.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> @@ -16,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; - struct dst_entry *odst; + unsigned long orefdst; unsigned int hh_len; unsigned int type; @@ -50,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) if (ip_route_output_key(net, &rt, &fl) != 0) return -1; - odst = skb_dst(skb); + orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, RT_TOS(iph->tos), rt->u.dst.dev) != 0) { dst_release(&rt->u.dst); return -1; } dst_release(&rt->u.dst); - dst_release(odst); + refdst_drop(orefdst); } if (skb_dst(skb)->error) @@ -155,10 +156,10 @@ static int nf_ip_reroute(struct sk_buff *skb, if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); - if (!(iph->tos == rt_info->tos - && skb->mark == rt_info->mark - && iph->daddr == rt_info->daddr - && iph->saddr == rt_info->saddr)) + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) return ip_route_me_harder(skb, RTN_UNSPEC); } return 0; @@ -248,9 +249,9 @@ module_exit(ipv4_netfilter_fini); #ifdef CONFIG_SYSCTL struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "netfilter", }, { } }; EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 7505dff4ffdf..1ac01b128621 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -8,7 +8,7 @@ * Copyright (C) 2002 David S. Miller (davem@redhat.com) * */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -27,6 +27,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); @@ -48,16 +49,17 @@ MODULE_DESCRIPTION("arptables core"); #endif #ifdef CONFIG_NETFILTER_DEBUG -#define ARP_NF_ASSERT(x) \ -do { \ - if (!(x)) \ - printk("ARP_NF_ASSERT: %s:%s:%u\n", \ - __func__, __FILE__, __LINE__); \ -} while(0) +#define ARP_NF_ASSERT(x) WARN_ON(!(x)) #else #define ARP_NF_ASSERT(x) #endif +void *arpt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(arpt, ARPT); +} +EXPORT_SYMBOL_GPL(arpt_alloc_initial_table); + static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, const char *hdr_addr, int len) { @@ -217,16 +219,23 @@ static inline int arp_checkentry(const struct arpt_arp *arp) } static unsigned int -arpt_error(struct sk_buff *skb, const struct xt_target_param *par) +arpt_error(struct sk_buff *skb, const struct xt_action_param *par) { if (net_ratelimit()) - printk("arp_tables: error: '%s'\n", + pr_err("arp_tables: error: '%s'\n", (const char *)par->targinfo); return NF_DROP; } -static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +static inline const struct arpt_entry_target * +arpt_get_target_c(const struct arpt_entry *e) +{ + return arpt_get_target((struct arpt_entry *)e); +} + +static inline struct arpt_entry * +get_entry(const void *base, unsigned int offset) { return (struct arpt_entry *)(base + offset); } @@ -246,12 +255,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; - bool hotdrop = false; struct arpt_entry *e, *back; const char *indev, *outdev; void *table_base; const struct xt_table_info *private; - struct xt_target_param tgpar; + struct xt_action_param acpar; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -266,14 +274,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); - tgpar.in = in; - tgpar.out = out; - tgpar.hooknum = hook; - tgpar.family = NFPROTO_ARP; + acpar.in = in; + acpar.out = out; + acpar.hooknum = hook; + acpar.family = NFPROTO_ARP; + acpar.hotdrop = false; arp = arp_hdr(skb); do { - struct arpt_entry_target *t; + const struct arpt_entry_target *t; int hdr_len; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { @@ -285,7 +294,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, (2 * skb->dev->addr_len); ADD_COUNTER(e->counters, hdr_len, 1); - t = arpt_get_target(e); + t = arpt_get_target_c(e); /* Standard target? */ if (!t->u.kernel.target->target) { @@ -319,9 +328,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Targets which reenter must return * abs. verdicts */ - tgpar.target = t->u.kernel.target; - tgpar.targinfo = t->data; - verdict = t->u.kernel.target->target(skb, &tgpar); + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; + verdict = t->u.kernel.target->target(skb, &acpar); /* Target might have changed stuff. */ arp = arp_hdr(skb); @@ -331,31 +340,27 @@ unsigned int arpt_do_table(struct sk_buff *skb, else /* Verdict */ break; - } while (!hotdrop); + } while (!acpar.hotdrop); xt_info_rdunlock_bh(); - if (hotdrop) + if (acpar.hotdrop) return NF_DROP; else return verdict; } /* All zeroes == unconditional rule. */ -static inline int unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_arp *arp) { - unsigned int i; + static const struct arpt_arp uncond; - for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) - if (((__u32 *)arp)[i]) - return 0; - - return 1; + return memcmp(arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct xt_table_info *newinfo, +static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -376,11 +381,11 @@ static int mark_source_chains(struct xt_table_info *newinfo, for (;;) { const struct arpt_standard_target *t - = (void *)arpt_get_target(e); + = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { - printk("arptables: loop hook %u pos %u %08X.\n", + pr_notice("arptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } @@ -388,11 +393,11 @@ static int mark_source_chains(struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct arpt_entry) - && (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) - && t->verdict < 0 - && unconditional(&e->arp)) || visited) { + if ((e->target_offset == sizeof(struct arpt_entry) && + (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) && + t->verdict < 0 && unconditional(&e->arp)) || + visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -431,8 +436,8 @@ static int mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0 - && newpos >= 0) { + ARPT_STANDARD_TARGET) == 0 && + newpos >= 0) { if (newpos > newinfo->size - sizeof(struct arpt_entry)) { duprintf("mark_source_chains: " @@ -460,7 +465,7 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int check_entry(struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e, const char *name) { const struct arpt_entry_target *t; @@ -472,7 +477,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name) if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) return -EINVAL; - t = arpt_get_target(e); + t = arpt_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; @@ -502,8 +507,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) } static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct arpt_entry_target *t; struct xt_target *target; @@ -514,13 +518,11 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, return ret; t = arpt_get_target(e); - target = try_then_request_module(xt_find_target(NFPROTO_ARP, - t->u.user.name, - t->u.user.revision), - "arpt_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; @@ -528,8 +530,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, ret = check_target(e, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); @@ -537,18 +537,33 @@ out: return ret; } +static bool check_underflow(const struct arpt_entry *e) +{ + const struct arpt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->arp)) + return false; + t = arpt_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct arpt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + static inline int check_entry_size_and_hooks(struct arpt_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; - if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 - || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -562,31 +577,32 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; - if ((unsigned char *)e - base == underflows[h]) + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } newinfo->underflow[h] = underflows[h]; + } } - /* FIXME: underflows must be unconditional, standard verdicts - < 0 (not ARPT_RETURN). --RR */ - /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +static inline void cleanup_entry(struct arpt_entry *e) { struct xt_tgdtor_param par; struct arpt_entry_target *t; - if (i && (*i)-- == 0) - return 1; - t = arpt_get_target(e); par.target = t->u.kernel.target; par.targinfo = t->data; @@ -594,26 +610,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - return 0; } /* Checks and translates the user-supplied table segment (held in * newinfo). */ -static int translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +static int translate_table(struct xt_table_info *newinfo, void *entry0, + const struct arpt_replace *repl) { + struct arpt_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { @@ -625,52 +635,66 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + break; + ++i; + if (strcmp(arpt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) { + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - ARPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter); + } return ret; } @@ -683,30 +707,10 @@ static int translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int add_entry_to_counter(const struct arpt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int set_entry_to_counter(const struct arpt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct arpt_entry *iter; unsigned int cpu; unsigned int i; unsigned int curcpu; @@ -722,32 +726,32 @@ static void get_counters(const struct xt_table_info *t, curcpu = smp_processor_id(); i = 0; - ARPT_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[curcpu], t->size) { + SET_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; xt_info_wrlock(cpu); - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + ADD_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } xt_info_wrunlock(cpu); } local_bh_enable(); } -static struct xt_counters *alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -765,11 +769,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) } static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct arpt_entry *e; + const struct arpt_entry *e; struct xt_counters *counters; struct xt_table_info *private = table->private; int ret = 0; @@ -789,7 +793,7 @@ static int copy_entries_to_user(unsigned int total_size, /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ - struct arpt_entry_target *t; + const struct arpt_entry_target *t; e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -800,7 +804,7 @@ static int copy_entries_to_user(unsigned int total_size, goto free_counters; } - t = arpt_get_target(e); + t = arpt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset + offsetof(struct arpt_entry_target, u.user.name), @@ -817,7 +821,7 @@ static int copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -826,7 +830,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -835,18 +839,18 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static int compat_calc_entry(struct arpt_entry *e, +static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct arpt_entry_target *t; + const struct arpt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - base; - t = arpt_get_target(e); + t = arpt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); @@ -867,7 +871,9 @@ static int compat_calc_entry(struct arpt_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct arpt_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -876,13 +882,17 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { char name[ARPT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -907,10 +917,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; @@ -941,7 +951,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, - int *len) + const int *len) { int ret; struct arpt_get_entries get; @@ -992,6 +1002,7 @@ static int __do_replace(struct net *net, const char *name, struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; + struct arpt_entry *iter; ret = 0; counters = vmalloc_node(num_counters * sizeof(struct xt_counters), @@ -1035,8 +1046,8 @@ static int __do_replace(struct net *net, const char *name, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, @@ -1055,12 +1066,14 @@ static int __do_replace(struct net *net, const char *name, return ret; } -static int do_replace(struct net *net, void __user *user, unsigned int len) +static int do_replace(struct net *net, const void __user *user, + unsigned int len) { int ret; struct arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct arpt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1081,9 +1094,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1096,27 +1107,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -static int do_add_counters(struct net *net, void __user *user, unsigned int len, - int compat) +static int do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) { unsigned int i, curcpu; struct xt_counters_info tmp; @@ -1129,6 +1128,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; + struct arpt_entry *iter; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1186,11 +1186,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); - ARPT_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } xt_info_wrunlock(curcpu); unlock_up_free: local_bh_enable(); @@ -1203,28 +1202,22 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, } #ifdef CONFIG_COMPAT -static inline int -compat_release_entry(struct compat_arpt_entry *e, unsigned int *i) +static inline void compat_release_entry(struct compat_arpt_entry *e) { struct arpt_entry_target *t; - if (i && (*i)-- == 0) - return 1; - t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static inline int check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { struct arpt_entry_target *t; @@ -1233,8 +1226,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, int ret, off, h; duprintf("check_compat_entry_size_and_hooks %p\n", e); - if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 - || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1255,14 +1248,12 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, entry_offset = (void *)e - (void *)base; t = compat_arpt_get_target(e); - target = try_then_request_module(xt_find_target(NFPROTO_ARP, - t->u.user.name, - t->u.user.revision), - "arpt_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; @@ -1284,8 +1275,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; release_target: @@ -1329,19 +1318,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, return ret; } -static inline int compat_check_entry(struct arpt_entry *e, const char *name, - unsigned int *i) -{ - int ret; - - ret = check_target(e, name); - if (ret) - return ret; - - (*i)++; - return 0; -} - static int translate_compat_table(const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, @@ -1354,8 +1330,10 @@ static int translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_arpt_entry *iter0; + struct arpt_entry *iter1; unsigned int size; - int ret; + int ret = 0; info = *pinfo; entry0 = *pentry0; @@ -1372,13 +1350,17 @@ static int translate_compat_table(const char *name, j = 0; xt_compat_lock(NFPROTO_ARP); /* Walk through entries, checking offsets. */ - ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1417,9 +1399,12 @@ static int translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); if (ret) @@ -1430,13 +1415,32 @@ static int translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = check_target(iter1, name); + if (ret != 0) + break; + ++i; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1); + } xt_free_table_info(newinfo); return ret; } @@ -1454,7 +1458,11 @@ static int translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(NFPROTO_ARP); @@ -1481,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user, struct compat_arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct arpt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1518,7 +1527,8 @@ static int compat_do_replace(struct net *net, void __user *user, return 0; free_newinfo_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1552,7 +1562,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { struct arpt_entry_target *t; struct compat_arpt_entry __user *ce; @@ -1560,14 +1570,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t origsize; int ret; - ret = -EFAULT; origsize = *size; ce = (struct compat_arpt_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct arpt_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_arpt_entry); *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); @@ -1577,18 +1585,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, t = arpt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int compat_copy_entries_to_user(unsigned int total_size, @@ -1602,6 +1604,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, int ret = 0; void *loc_cpu_entry; unsigned int i = 0; + struct arpt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1611,9 +1614,12 @@ static int compat_copy_entries_to_user(unsigned int total_size, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; } @@ -1760,13 +1766,13 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, struct xt_table *table, +struct xt_table *arpt_register_table(struct net *net, + const struct xt_table *table, const struct arpt_replace *repl) { int ret; struct xt_table_info *newinfo; - struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; @@ -1780,12 +1786,7 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *table, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); - + ret = translate_table(newinfo, loc_cpu_entry, repl); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; @@ -1808,35 +1809,37 @@ void arpt_unregister_table(struct xt_table *table) struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct arpt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, - cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } /* The built-in targets: standard (NULL) and error. */ -static struct xt_target arpt_standard_target __read_mostly = { - .name = ARPT_STANDARD_TARGET, - .targetsize = sizeof(int), - .family = NFPROTO_ARP, +static struct xt_target arpt_builtin_tg[] __read_mostly = { + { + .name = ARPT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_ARP, #ifdef CONFIG_COMPAT - .compatsize = sizeof(compat_int_t), - .compat_from_user = compat_standard_from_user, - .compat_to_user = compat_standard_to_user, + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, #endif -}; - -static struct xt_target arpt_error_target __read_mostly = { - .name = ARPT_ERROR_TARGET, - .target = arpt_error, - .targetsize = ARPT_FUNCTION_MAXNAMELEN, - .family = NFPROTO_ARP, + }, + { + .name = ARPT_ERROR_TARGET, + .target = arpt_error, + .targetsize = ARPT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_ARP, + }, }; static struct nf_sockopt_ops arpt_sockopts = { @@ -1880,12 +1883,9 @@ static int __init arp_tables_init(void) goto err1; /* Noone else will be downing sem now, so we won't sleep */ - ret = xt_register_target(&arpt_standard_target); + ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); if (ret < 0) goto err2; - ret = xt_register_target(&arpt_error_target); - if (ret < 0) - goto err3; /* Register setsockopt */ ret = nf_register_sockopt(&arpt_sockopts); @@ -1896,9 +1896,7 @@ static int __init arp_tables_init(void) return 0; err4: - xt_unregister_target(&arpt_error_target); -err3: - xt_unregister_target(&arpt_standard_target); + xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); err2: unregister_pernet_subsys(&arp_tables_net_ops); err1: @@ -1908,8 +1906,7 @@ err1: static void __exit arp_tables_fini(void) { nf_unregister_sockopt(&arpt_sockopts); - xt_unregister_target(&arpt_error_target); - xt_unregister_target(&arpt_standard_target); + xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); unregister_pernet_subsys(&arp_tables_net_ops); } diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index b0d5b1d0a769..e1be7dd1171b 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -9,7 +9,7 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int -target(struct sk_buff *skb, const struct xt_target_param *par) +target(struct sk_buff *skb, const struct xt_action_param *par) { const struct arpt_mangle *mangle = par->targinfo; const struct arphdr *arp; @@ -54,7 +54,7 @@ target(struct sk_buff *skb, const struct xt_target_param *par) return mangle->target; } -static bool checkentry(const struct xt_tgchk_param *par) +static int checkentry(const struct xt_tgchk_param *par) { const struct arpt_mangle *mangle = par->targinfo; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6ecfdae7c589..79ca5e70d497 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -6,7 +6,9 @@ */ #include <linux/module.h> +#include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); @@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) -static struct -{ - struct arpt_replace repl; - struct arpt_standard entries[3]; - struct arpt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error), - .hook_entry = { - [NF_ARP_IN] = 0, - [NF_ARP_OUT] = sizeof(struct arpt_standard), - [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), - }, - .underflow = { - [NF_ARP_IN] = 0, - [NF_ARP_OUT] = sizeof(struct arpt_standard), - [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), - }, - }, - .entries = { - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_IN */ - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_OUT */ - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_FORWARD */ - }, - .term = ARPT_ERROR_INIT, -}; - -static struct xt_table packet_filter = { +static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, + .priority = NF_IP_PRI_FILTER, }; /* The work comes in here from netfilter.c */ -static unsigned int arpt_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int +arptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return arpt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.arptable_filter); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int arpt_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return arpt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.arptable_filter); + return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); } -static struct nf_hook_ops arpt_ops[] __read_mostly = { - { - .hook = arpt_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_IN, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = arpt_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = arpt_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_FORWARD, - .priority = NF_IP_PRI_FILTER, - }, -}; +static struct nf_hook_ops *arpfilter_ops __read_mostly; static int __net_init arptable_filter_net_init(struct net *net) { - /* Register table */ + struct arpt_replace *repl; + + repl = arpt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, &initial_table.repl); + arpt_register_table(net, &packet_filter, repl); + kfree(repl); if (IS_ERR(net->ipv4.arptable_filter)) return PTR_ERR(net->ipv4.arptable_filter); return 0; @@ -125,9 +71,11 @@ static int __init arptable_filter_init(void) if (ret < 0) return ret; - ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); - if (ret < 0) + arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) { + ret = PTR_ERR(arpfilter_ops); goto cleanup_table; + } return ret; cleanup_table: @@ -137,7 +85,7 @@ cleanup_table: static void __exit arptable_filter_fini(void) { - nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); + xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); } diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c156db215987..a4e5fc5df4bf 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -26,6 +26,7 @@ #include <linux/security.h> #include <linux/net.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/route.h> @@ -160,8 +161,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) break; case IPQ_COPY_PACKET: - if ((entry->skb->ip_summed == CHECKSUM_PARTIAL || - entry->skb->ip_summed == CHECKSUM_COMPLETE) && + if (entry->skb->ip_summed == CHECKSUM_PARTIAL && (*errp = skb_checksum_help(entry->skb))) { read_unlock_bh(&queue_lock); return NULL; @@ -461,7 +461,6 @@ __ipq_rcv_skb(struct sk_buff *skb) if (flags & NLM_F_ACK) netlink_ack(skb, nlh, 0); - return; } static void @@ -497,10 +496,9 @@ ipq_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_FIREWALL && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -516,14 +514,13 @@ static struct ctl_table_header *ipq_sysctl_header; static ctl_table ipq_table[] = { { - .ctl_name = NET_IPQ_QMAX, .procname = NET_IPQ_QMAX_NAME, .data = &queue_maxlen, .maxlen = sizeof(queue_maxlen), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; #endif @@ -622,7 +619,7 @@ cleanup_netlink_notifier: static void __exit ip_queue_fini(void) { nf_unregister_queue_handlers(&nfqh); - synchronize_net(); + ipq_flush(NULL, 0); #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fdefae6b5dfc..4b6c5ca610fc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -8,6 +8,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/capability.h> #include <linux/skbuff.h> @@ -27,6 +28,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/nf_log.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -37,24 +39,19 @@ MODULE_DESCRIPTION("IPv4 packet filter"); /*#define DEBUG_IP_FIREWALL_USER*/ #ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) printk(format , ## args) +#define dprintf(format, args...) pr_info(format , ## args) #else #define dprintf(format, args...) #endif #ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) +#define duprintf(format, args...) pr_info(format , ## args) #else #define duprintf(format, args...) #endif #ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) \ -do { \ - if (!(x)) \ - printk("IP_NF_ASSERT: %s:%s:%u\n", \ - __func__, __FILE__, __LINE__); \ -} while(0) +#define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif @@ -65,6 +62,12 @@ do { \ #define inline #endif +void *ipt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ipt, IPT); +} +EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); + /* We keep a set of rules for each CPU, so we can avoid write-locking them in the softirq when updating the counters and therefore @@ -88,9 +91,9 @@ ip_packet_match(const struct iphdr *ip, #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, - IPT_INV_SRCIP) - || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { + IPT_INV_SRCIP) || + FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { dprintf("Source or dest mismatch.\n"); dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", @@ -121,8 +124,8 @@ ip_packet_match(const struct iphdr *ip, } /* Check specific protocol */ - if (ipinfo->proto - && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { + if (ipinfo->proto && + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); @@ -157,52 +160,38 @@ ip_checkentry(const struct ipt_ip *ip) } static unsigned int -ipt_error(struct sk_buff *skb, const struct xt_target_param *par) +ipt_error(struct sk_buff *skb, const struct xt_action_param *par) { if (net_ratelimit()) - printk("ip_tables: error: `%s'\n", - (const char *)par->targinfo); + pr_info("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } -/* Performance critical - called for every packet */ -static inline bool -do_match(struct ipt_entry_match *m, const struct sk_buff *skb, - struct xt_match_param *par) -{ - par->match = m->u.kernel.match; - par->matchinfo = m->data; - - /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, par)) - return true; - else - return false; -} - /* Performance critical */ static inline struct ipt_entry * -get_entry(void *base, unsigned int offset) +get_entry(const void *base, unsigned int offset) { return (struct ipt_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline int -unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_ip *ip) { - unsigned int i; - - for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++) - if (((__u32 *)ip)[i]) - return 0; + static const struct ipt_ip uncond; - return 1; + return memcmp(ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } +/* for const-correctness */ +static inline const struct ipt_entry_target * +ipt_get_target_c(const struct ipt_entry *e) +{ + return ipt_get_target((struct ipt_entry *)e); +} + #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) static const char *const hooknames[] = { @@ -237,11 +226,11 @@ static struct nf_loginfo trace_loginfo = { /* Mildly perf critical (only if packet tracing is on) */ static inline int -get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, +get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - struct ipt_standard_target *t = (void *)ipt_get_target(s); + const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ @@ -250,11 +239,11 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) - && strcmp(t->target.u.kernel.target->name, - IPT_STANDARD_TARGET) == 0 - && t->verdict < 0 - && unconditional(&s->ip)) { + if (s->target_offset == sizeof(struct ipt_entry) && + strcmp(t->target.u.kernel.target->name, + IPT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ip)) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -267,17 +256,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, return 0; } -static void trace_packet(struct sk_buff *skb, +static void trace_packet(const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, - struct xt_table_info *private, - struct ipt_entry *e) + const struct xt_table_info *private, + const struct ipt_entry *e) { - void *table_base; + const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; + const struct ipt_entry *iter; unsigned int rulenum = 0; table_base = private->entries[smp_processor_id()]; @@ -286,10 +276,10 @@ static void trace_packet(struct sk_buff *skb, hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; - IPT_ENTRY_ITERATE(root, - private->size - private->hook_entry[hook], - get_chainname_rulenum, - e, hookname, &chainname, &comment, &rulenum); + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", @@ -311,24 +301,19 @@ ipt_do_table(struct sk_buff *skb, const struct net_device *out, struct xt_table *table) { -#define tb_comefrom ((struct ipt_entry *)table_base)->comefrom - static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; - u_int16_t datalen; - bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; - void *table_base; - struct ipt_entry *e, *back; - struct xt_table_info *private; - struct xt_match_param mtpar; - struct xt_target_param tgpar; + const void *table_base; + struct ipt_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; /* Initialization */ ip = ip_hdr(skb); - datalen = skb->len - ip->ihl * 4; indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -337,36 +322,48 @@ ipt_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ - mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; - mtpar.thoff = ip_hdrlen(skb); - mtpar.hotdrop = &hotdrop; - mtpar.in = tgpar.in = in; - mtpar.out = tgpar.out = out; - mtpar.family = tgpar.family = NFPROTO_IPV4; - mtpar.hooknum = tgpar.hooknum = hook; + acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + acpar.thoff = ip_hdrlen(skb); + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV4; + acpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; e = get_entry(table_base, private->hook_entry[hook]); - /* For return from builtin chain */ - back = get_entry(table_base, private->underflow[hook]); + pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", + table->name, hook, origptr, + get_entry(table_base, private->underflow[hook])); do { - struct ipt_entry_target *t; + const struct ipt_entry_target *t; + const struct xt_entry_match *ematch; IP_NF_ASSERT(e); - IP_NF_ASSERT(back); if (!ip_packet_match(ip, indev, outdev, - &e->ip, mtpar.fragoff) || - IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) { + &e->ip, acpar.fragoff)) { + no_match: e = ipt_next_entry(e); continue; } + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } + ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); t = ipt_get_target(e); @@ -390,68 +387,63 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - e = back; - back = get_entry(table_base, back->comefrom); + if (*stackptr == 0) { + e = get_entry(table_base, + private->underflow[hook]); + pr_debug("Underflow (this is normal) " + "to %p\n", e); + } else { + e = jumpstack[--*stackptr]; + pr_debug("Pulled %p out from pos %u\n", + e, *stackptr); + e = ipt_next_entry(e); + } continue; } - if (table_base + v != ipt_next_entry(e) - && !(e->ip.flags & IPT_F_GOTO)) { - /* Save old back ptr in next entry */ - struct ipt_entry *next = ipt_next_entry(e); - next->comefrom = (void *)back - table_base; - /* set back pointer to next entry */ - back = next; + if (table_base + v != ipt_next_entry(e) && + !(e->ip.flags & IPT_F_GOTO)) { + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[(*stackptr)++] = e; + pr_debug("Pushed %p into pos %u\n", + e, *stackptr - 1); } e = get_entry(table_base, v); continue; } - /* Targets which reenter must return - abs. verdicts */ - tgpar.target = t->u.kernel.target; - tgpar.targinfo = t->data; - + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; -#ifdef CONFIG_NETFILTER_DEBUG - tb_comefrom = 0xeeeeeeec; -#endif - verdict = t->u.kernel.target->target(skb, &tgpar); -#ifdef CONFIG_NETFILTER_DEBUG - if (tb_comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) { - printk("Target %s reentered!\n", - t->u.kernel.target->name); - verdict = NF_DROP; - } - tb_comefrom = 0x57acc001; -#endif + verdict = t->u.kernel.target->target(skb, &acpar); /* Target might have changed stuff. */ ip = ip_hdr(skb); - datalen = skb->len - ip->ihl * 4; - if (verdict == IPT_CONTINUE) e = ipt_next_entry(e); else /* Verdict */ break; - } while (!hotdrop); + } while (!acpar.hotdrop); xt_info_rdunlock_bh(); - + pr_debug("Exiting %s; resetting sp from %u to %u\n", + __func__, *stackptr, origptr); + *stackptr = origptr; #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else - if (hotdrop) + if (acpar.hotdrop) return NF_DROP; else return verdict; #endif - -#undef tb_comefrom } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -469,23 +461,23 @@ mark_source_chains(struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - struct ipt_standard_target *t - = (void *)ipt_get_target(e); + const struct ipt_standard_target *t + = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", + pr_err("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ipt_entry) - && (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) - && t->verdict < 0 - && unconditional(&e->ip)) || visited) { + if ((e->target_offset == sizeof(struct ipt_entry) && + (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0) && + t->verdict < 0 && unconditional(&e->ip)) || + visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -532,8 +524,8 @@ mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0 - && newpos >= 0) { + IPT_STANDARD_TARGET) == 0 && + newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ipt_entry)) { duprintf("mark_source_chains: " @@ -560,30 +552,26 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static int -cleanup_match(struct ipt_entry_match *m, unsigned int *i) +static void cleanup_match(struct ipt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; - if (i && (*i)-- == 0) - return 1; - + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); - return 0; } static int -check_entry(struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e, const char *name) { - struct ipt_entry_target *t; + const struct ipt_entry_target *t; if (!ip_checkentry(&e->ip)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + duprintf("ip check failed %p %s.\n", e, par->match->name); return -EINVAL; } @@ -591,7 +579,7 @@ check_entry(struct ipt_entry *e, const char *name) e->next_offset) return -EINVAL; - t = ipt_get_target(e); + t = ipt_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; @@ -599,8 +587,7 @@ check_entry(struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; int ret; @@ -611,31 +598,27 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, ret = xt_check_match(par, m->u.match_size - sizeof(*m), ip->proto, ip->invflags & IPT_INV_PROTO); if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - par.match->name); + duprintf("check failed for `%s'.\n", par->match->name); return ret; } - ++*i; return 0; } static int -find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; - match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), - "ipt_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("find_check_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; - ret = check_match(m, par, i); + ret = check_match(m, par); if (ret) goto err; @@ -645,10 +628,11 @@ err: return ret; } -static int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct ipt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -661,7 +645,7 @@ static int check_target(struct ipt_entry *e, const char *name) ret = xt_check_target(&par, t->u.target_size - sizeof(*t), e->ip.proto, e->ip.invflags & IPT_INV_PROTO); if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", + duprintf("check failed for `%s'.\n", t->u.kernel.target->name); return ret; } @@ -669,66 +653,86 @@ static int check_target(struct ipt_entry *e, const char *name) } static int -find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ipt_entry *e, struct net *net, const char *name, + unsigned int size) { struct ipt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; ret = check_entry(e, name); if (ret) return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; - ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); - if (ret != 0) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } t = ipt_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), - "ipt_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } +static bool check_underflow(const struct ipt_entry *e) +{ + const struct ipt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ip)) + return false; + t = ipt_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct ipt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; - if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 - || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -742,62 +746,60 @@ check_entry_size_and_hooks(struct ipt_entry *e, /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; - if ((unsigned char *)e - base == underflows[h]) + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } newinfo->underflow[h] = underflows[h]; + } } - /* FIXME: underflows must be unconditional, standard verdicts - < 0 (not IPT_RETURN). --RR */ - /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static int -cleanup_entry(struct ipt_entry *e, unsigned int *i) +static void +cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct ipt_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_match *ematch; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, cleanup_match, NULL); + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); t = ipt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - return 0; } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ipt_replace *repl) { + struct ipt_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -808,49 +810,61 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, &i); - if (ret != 0) - return ret; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ipt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - IPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } return ret; } @@ -863,33 +877,11 @@ translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int -add_entry_to_counter(const struct ipt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int -set_entry_to_counter(const struct ipt_entry *e, - struct ipt_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct ipt_entry *iter; unsigned int cpu; unsigned int i; unsigned int curcpu; @@ -905,32 +897,32 @@ get_counters(const struct xt_table_info *t, curcpu = smp_processor_id(); i = 0; - IPT_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[curcpu], t->size) { + SET_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; xt_info_wrlock(cpu); - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + ADD_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; /* macro does multi eval of i */ + } xt_info_wrunlock(cpu); } local_bh_enable(); } -static struct xt_counters * alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -948,11 +940,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct ipt_entry *e; + const struct ipt_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; @@ -1004,7 +996,7 @@ copy_entries_to_user(unsigned int total_size, } } - t = ipt_get_target(e); + t = ipt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset + offsetof(struct ipt_entry_target, u.user.name), @@ -1021,7 +1013,7 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -1030,7 +1022,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -1039,25 +1031,20 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static inline int -compat_calc_match(struct ipt_entry_match *m, int *size) -{ - *size += xt_compat_match_offset(m->u.kernel.match); - return 0; -} - -static int compat_calc_entry(struct ipt_entry *e, +static int compat_calc_entry(const struct ipt_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct ipt_entry_target *t; + const struct xt_entry_match *ematch; + const struct ipt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; - IPT_MATCH_ITERATE(e, compat_calc_match, &off); - t = ipt_get_target(e); + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ipt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); @@ -1078,7 +1065,9 @@ static int compat_calc_entry(struct ipt_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct ipt_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -1087,13 +1076,17 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { char name[IPT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1118,10 +1111,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; @@ -1153,7 +1146,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int -get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) +get_entries(struct net *net, struct ipt_get_entries __user *uptr, + const int *len) { int ret; struct ipt_get_entries get; @@ -1201,6 +1195,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; + struct ipt_entry *iter; ret = 0; counters = vmalloc(num_counters * sizeof(struct xt_counters)); @@ -1243,8 +1238,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1263,12 +1259,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, void __user *user, unsigned int len) +do_replace(struct net *net, const void __user *user, unsigned int len) { int ret; struct ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ipt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1289,13 +1286,11 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; - duprintf("ip_tables: Translated table\n"); + duprintf("Translated table\n"); ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); @@ -1304,27 +1299,16 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int -do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) +do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) { unsigned int i, curcpu; struct xt_counters_info tmp; @@ -1337,6 +1321,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; + struct ipt_entry *iter; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1394,11 +1379,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); - IPT_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } xt_info_wrunlock(curcpu); unlock_up_free: local_bh_enable(); @@ -1426,45 +1410,40 @@ struct compat_ipt_replace { static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { struct ipt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; - int ret; + const struct xt_entry_match *ematch; + int ret = 0; - ret = -EFAULT; origsize = *size; ce = (struct compat_ipt_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct ipt_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_ipt_entry); *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } target_offset = e->target_offset - (origsize - *size); - if (ret) - goto out; t = ipt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int @@ -1472,61 +1451,45 @@ compat_find_calc_match(struct ipt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, - int *size, unsigned int *i) + int *size) { struct xt_match *match; - match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), - "ipt_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("compat_check_calc_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; *size += xt_compat_match_offset(match); - - (*i)++; - return 0; -} - -static int -compat_release_match(struct ipt_entry_match *m, unsigned int *i) -{ - if (i && (*i)-- == 0) - return 1; - - module_put(m->u.kernel.match->me); return 0; } -static int -compat_release_entry(struct compat_ipt_entry *e, unsigned int *i) +static void compat_release_entry(struct compat_ipt_entry *e) { struct ipt_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_match *ematch; /* Cleanup all matches */ - COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static int check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { + struct xt_entry_match *ematch; struct ipt_entry_target *t; struct xt_target *target; unsigned int entry_offset; @@ -1534,8 +1497,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, int ret, off, h; duprintf("check_compat_entry_size_and_hooks %p\n", e); - if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 - || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1555,20 +1518,21 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, - &e->ip, e->comefrom, &off, &j); - if (ret != 0) - goto release_matches; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ip, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } t = compat_ipt_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), - "ipt_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; @@ -1590,14 +1554,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; out: module_put(t->u.kernel.target->me); release_matches: - IPT_MATCH_ITERATE(e, compat_release_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } return ret; } @@ -1611,6 +1577,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, struct ipt_entry *de; unsigned int origsize; int ret, h; + struct xt_entry_match *ematch; ret = 0; origsize = *size; @@ -1621,10 +1588,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, *dstptr += sizeof(struct ipt_entry); *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); - ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, - dstptr, size); - if (ret) - return ret; + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } de->target_offset = e->target_offset - (origsize - *size); t = compat_ipt_get_target(e); target = t->u.kernel.target; @@ -1641,36 +1609,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, } static int -compat_check_entry(struct ipt_entry *e, const char *name, - unsigned int *i) +compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) { + struct xt_entry_match *ematch; struct xt_mtchk_param mtpar; unsigned int j; - int ret; + int ret = 0; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; - ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); - if (ret) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; - - (*i)++; return 0; cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1682,6 +1657,8 @@ translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_ipt_entry *iter0; + struct ipt_entry *iter1; unsigned int size; int ret; @@ -1700,13 +1677,17 @@ translate_compat_table(const char *name, j = 0; xt_compat_lock(AF_INET); /* Walk through entries, checking offsets. */ - ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1745,9 +1726,12 @@ translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); if (ret) @@ -1758,13 +1742,32 @@ translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } xt_free_table_info(newinfo); return ret; } @@ -1782,7 +1785,11 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(AF_INET); @@ -1797,6 +1804,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) struct compat_ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ipt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1819,7 +1827,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1835,7 +1843,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1884,6 +1893,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, int ret = 0; const void *loc_cpu_entry; unsigned int i = 0; + struct ipt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1896,9 +1906,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; @@ -2051,13 +2064,13 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, +struct xt_table *ipt_register_table(struct net *net, + const struct xt_table *table, const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo; - struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; @@ -2071,11 +2084,7 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -2093,17 +2102,19 @@ out: return ERR_PTR(ret); } -void ipt_unregister_table(struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct ipt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); @@ -2121,7 +2132,7 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) +icmp_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; @@ -2137,7 +2148,7 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -2148,31 +2159,31 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) !!(icmpinfo->invflags&IPT_ICMP_INV)); } -static bool icmp_checkentry(const struct xt_mtchk_param *par) +static int icmp_checkentry(const struct xt_mtchk_param *par) { const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(icmpinfo->invflags & ~IPT_ICMP_INV); + return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; } -/* The built-in targets: standard (NULL) and error. */ -static struct xt_target ipt_standard_target __read_mostly = { - .name = IPT_STANDARD_TARGET, - .targetsize = sizeof(int), - .family = NFPROTO_IPV4, +static struct xt_target ipt_builtin_tg[] __read_mostly = { + { + .name = IPT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV4, #ifdef CONFIG_COMPAT - .compatsize = sizeof(compat_int_t), - .compat_from_user = compat_standard_from_user, - .compat_to_user = compat_standard_to_user, + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, #endif -}; - -static struct xt_target ipt_error_target __read_mostly = { - .name = IPT_ERROR_TARGET, - .target = ipt_error, - .targetsize = IPT_FUNCTION_MAXNAMELEN, - .family = NFPROTO_IPV4, + }, + { + .name = IPT_ERROR_TARGET, + .target = ipt_error, + .targetsize = IPT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV4, + }, }; static struct nf_sockopt_ops ipt_sockopts = { @@ -2192,13 +2203,15 @@ static struct nf_sockopt_ops ipt_sockopts = { .owner = THIS_MODULE, }; -static struct xt_match icmp_matchstruct __read_mostly = { - .name = "icmp", - .match = icmp_match, - .matchsize = sizeof(struct ipt_icmp), - .checkentry = icmp_checkentry, - .proto = IPPROTO_ICMP, - .family = NFPROTO_IPV4, +static struct xt_match ipt_builtin_mt[] __read_mostly = { + { + .name = "icmp", + .match = icmp_match, + .matchsize = sizeof(struct ipt_icmp), + .checkentry = icmp_checkentry, + .proto = IPPROTO_ICMP, + .family = NFPROTO_IPV4, + }, }; static int __net_init ip_tables_net_init(struct net *net) @@ -2225,13 +2238,10 @@ static int __init ip_tables_init(void) goto err1; /* Noone else will be downing sem now, so we won't sleep */ - ret = xt_register_target(&ipt_standard_target); + ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); if (ret < 0) goto err2; - ret = xt_register_target(&ipt_error_target); - if (ret < 0) - goto err3; - ret = xt_register_match(&icmp_matchstruct); + ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); if (ret < 0) goto err4; @@ -2240,15 +2250,13 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; - printk(KERN_INFO "ip_tables: (C) 2000-2006 Netfilter Core Team\n"); + pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: - xt_unregister_match(&icmp_matchstruct); + xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); err4: - xt_unregister_target(&ipt_error_target); -err3: - xt_unregister_target(&ipt_standard_target); + xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); err2: unregister_pernet_subsys(&ip_tables_net_ops); err1: @@ -2259,10 +2267,8 @@ static void __exit ip_tables_fini(void) { nf_unregister_sockopt(&ipt_sockopts); - xt_unregister_match(&icmp_matchstruct); - xt_unregister_target(&ipt_error_target); - xt_unregister_target(&ipt_standard_target); - + xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); + xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); unregister_pernet_subsys(&ip_tables_net_ops); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2e4f98b85524..f91c94b9a790 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -9,11 +9,13 @@ * published by the Free Software Foundation. * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/jhash.h> #include <linux/bitops.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -87,7 +89,7 @@ clusterip_config_entry_put(struct clusterip_config *c) list_del(&c->list); write_unlock_bh(&clusterip_lock); - dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); + dev_mc_del(c->dev, c->clustermac); dev_put(c->dev); /* In case anyone still accesses the file, the open/close @@ -238,8 +240,7 @@ clusterip_hashfn(const struct sk_buff *skb, break; default: if (net_ratelimit()) - printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", - iph->protocol); + pr_info("unknown protocol %u\n", iph->protocol); sport = dport = 0; } @@ -261,7 +262,7 @@ clusterip_hashfn(const struct sk_buff *skb, hashval = 0; /* This cannot happen, unless the check function wasn't called * at rule load time */ - printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); + pr_info("unknown mode %u\n", config->hash_mode); BUG(); break; } @@ -281,7 +282,7 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) ***********************************************************************/ static unsigned int -clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) +clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; @@ -294,7 +295,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) { - printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); + pr_info("no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be * marked as INVALID */ @@ -303,9 +304,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP - && (ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) + if (ip_hdr(skb)->protocol == IPPROTO_ICMP && + (ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) return XT_CONTINUE; /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, @@ -347,25 +348,24 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool clusterip_tg_check(const struct xt_tgchk_param *par) +static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; - struct clusterip_config *config; + int ret; if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { - printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", - cipinfo->hash_mode); - return false; + pr_info("unknown mode %u\n", cipinfo->hash_mode); + return -EINVAL; } - if (e->ip.dmsk.s_addr != htonl(0xffffffff) - || e->ip.dst.s_addr == 0) { - printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); - return false; + if (e->ip.dmsk.s_addr != htonl(0xffffffff) || + e->ip.dst.s_addr == 0) { + pr_info("Please specify destination IP\n"); + return -EINVAL; } /* FIXME: further sanity checks */ @@ -373,41 +373,41 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par) config = clusterip_config_find_get(e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { - printk(KERN_WARNING "CLUSTERIP: no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); - return false; + pr_info("no config found for %pI4, need 'new'\n", + &e->ip.dst.s_addr); + return -EINVAL; } else { struct net_device *dev; if (e->ip.iniface[0] == '\0') { - printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); - return false; + pr_info("Please specify an interface name\n"); + return -EINVAL; } dev = dev_get_by_name(&init_net, e->ip.iniface); if (!dev) { - printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); - return false; + pr_info("no such interface %s\n", + e->ip.iniface); + return -ENOENT; } config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev); if (!config) { - printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); + pr_info("cannot allocate config\n"); dev_put(dev); - return false; + return -ENOMEM; } - dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); + dev_mc_add(config->dev, config->clustermac); } } cipinfo->config = config; - if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->target->family); - return false; - } - - return true; + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } /* drop reference count of cluster config when rule is deleted */ @@ -421,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(par->target->family); + nf_ct_l3proto_module_put(par->family); } #ifdef CONFIG_COMPAT @@ -478,8 +478,8 @@ static void arp_print(struct arp_payload *payload) } hbuffer[--k]='\0'; - printk("src %pI4@%s, dst %pI4\n", - &payload->src_ip, hbuffer, &payload->dst_ip); + pr_debug("src %pI4@%s, dst %pI4\n", + &payload->src_ip, hbuffer, &payload->dst_ip); } #endif @@ -495,14 +495,14 @@ arp_mangle(unsigned int hook, struct clusterip_config *c; /* we don't care about non-ethernet and non-ipv4 ARP */ - if (arp->ar_hrd != htons(ARPHRD_ETHER) - || arp->ar_pro != htons(ETH_P_IP) - || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; /* we only want to mangle arp requests and replies */ - if (arp->ar_op != htons(ARPOP_REPLY) - && arp->ar_op != htons(ARPOP_REQUEST)) + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); @@ -518,7 +518,7 @@ arp_mangle(unsigned int hook, * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ if (c->dev != out) { - pr_debug("CLUSTERIP: not mangling arp reply on different " + pr_debug("not mangling arp reply on different " "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name); clusterip_config_put(c); @@ -529,7 +529,7 @@ arp_mangle(unsigned int hook, memcpy(payload->src_hw, c->clustermac, arp->ar_hln); #ifdef DEBUG - pr_debug(KERN_DEBUG "CLUSTERIP mangled arp reply: "); + pr_debug("mangled arp reply: "); arp_print(payload); #endif @@ -560,8 +560,7 @@ struct clusterip_seq_position { static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { - const struct proc_dir_entry *pde = s->private; - struct clusterip_config *c = pde->data; + struct clusterip_config *c = s->private; unsigned int weight; u_int32_t local_nodes; struct clusterip_seq_position *idx; @@ -601,7 +600,8 @@ static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) static void clusterip_seq_stop(struct seq_file *s, void *v) { - kfree(v); + if (!IS_ERR(v)) + kfree(v); } static int clusterip_seq_show(struct seq_file *s, void *v) @@ -632,10 +632,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct proc_dir_entry *pde = PDE(inode); - struct clusterip_config *c = pde->data; + struct clusterip_config *c = PDE(inode)->data; - sf->private = pde; + sf->private = c; clusterip_config_get(c); } @@ -645,8 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct proc_dir_entry *pde = PDE(inode); - struct clusterip_config *c = pde->data; + struct clusterip_config *c = PDE(inode)->data; int ret; ret = seq_release(inode, file); @@ -660,10 +658,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { + struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct clusterip_config *c = pde->data; unsigned long nodenum; if (copy_from_user(buffer, input, PROC_WRITELEN)) @@ -709,13 +706,13 @@ static int __init clusterip_tg_init(void) #ifdef CONFIG_PROC_FS clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); if (!clusterip_procdir) { - printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); + pr_err("Unable to proc dir entry\n"); ret = -ENOMEM; goto cleanup_hook; } #endif /* CONFIG_PROC_FS */ - printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", + pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; @@ -730,8 +727,7 @@ cleanup_target: static void __exit clusterip_tg_exit(void) { - printk(KERN_NOTICE "ClusterIP Version %s unloading\n", - CLUSTERIP_VERSION); + pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); #ifdef CONFIG_PROC_FS remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); #endif diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index f7e2fa0974dc..4bf3dc49ad1e 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) struct tcphdr _tcph, *tcph; __be16 oldval; - /* Not enought header? */ + /* Not enough header? */ tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (!tcph) return false; @@ -77,7 +77,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) } static unsigned int -ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) +ecn_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_ECN_info *einfo = par->targinfo; @@ -85,36 +85,33 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) if (!set_ect_ip(skb, einfo)) return NF_DROP; - if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) - && ip_hdr(skb)->protocol == IPPROTO_TCP) + if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) && + ip_hdr(skb)->protocol == IPPROTO_TCP) if (!set_ect_tcp(skb, einfo)) return NF_DROP; return XT_CONTINUE; } -static bool ecn_tg_check(const struct xt_tgchk_param *par) +static int ecn_tg_check(const struct xt_tgchk_param *par) { const struct ipt_ECN_info *einfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; if (einfo->operation & IPT_ECN_OP_MASK) { - printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", - einfo->operation); - return false; + pr_info("unsupported ECN operation %x\n", einfo->operation); + return -EINVAL; } if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { - printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", - einfo->ip_ect); - return false; + pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect); + return -EINVAL; } - if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) - && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { - printk(KERN_WARNING "ECN: cannot use TCP operations on a " - "non-tcp rule\n"); - return false; + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) && + (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { + pr_info("cannot use TCP operations on a non-tcp rule\n"); + return -EINVAL; } - return true; + return 0; } static struct xt_target ecn_tg_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index acc44c69eb68..5234f4f3499a 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -9,7 +9,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -74,8 +74,8 @@ static void dump_packet(const struct nf_loginfo *info, if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((logflags & IPT_LOG_IPOPT) - && ih->ihl * 4 > sizeof(struct iphdr)) { + if ((logflags & IPT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { const unsigned char *op; unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; unsigned int i, optsize; @@ -146,8 +146,8 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((logflags & IPT_LOG_TCPOPT) - && th->doff * 4 > sizeof(struct tcphdr)) { + if ((logflags & IPT_LOG_TCPOPT) && + th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; const unsigned char *op; unsigned int i, optsize; @@ -238,9 +238,9 @@ static void dump_packet(const struct nf_loginfo *info, printk("TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES - && required_len[ich->type] - && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { printk("INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; @@ -276,8 +276,8 @@ static void dump_packet(const struct nf_loginfo *info, } /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH - && ich->code == ICMP_FRAG_NEEDED) + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) printk("MTU=%u ", ntohs(ich->un.frag.mtu)); } break; @@ -367,7 +367,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 0, + .level = 5, .logflags = NF_LOG_MASK, }, }, @@ -407,8 +407,8 @@ ipt_log_packet(u_int8_t pf, if (in && !out) { /* MAC logging for input chain only. */ printk("MAC="); - if (skb->dev && skb->dev->hard_header_len - && skb->mac_header != skb->network_header) { + if (skb->dev && skb->dev->hard_header_len && + skb->mac_header != skb->network_header) { int i; const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) @@ -425,7 +425,7 @@ ipt_log_packet(u_int8_t pf, } static unsigned int -log_tg(struct sk_buff *skb, const struct xt_target_param *par) +log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_log_info *loginfo = par->targinfo; struct nf_loginfo li; @@ -439,20 +439,19 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool log_tg_check(const struct xt_tgchk_param *par) +static int log_tg_check(const struct xt_tgchk_param *par) { const struct ipt_log_info *loginfo = par->targinfo; if (loginfo->level >= 8) { - pr_debug("LOG: level %u >= 8\n", loginfo->level); - return false; + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; } if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { - pr_debug("LOG: prefix term %i\n", - loginfo->prefix[sizeof(loginfo->prefix)-1]); - return false; + pr_debug("prefix is not null-terminated\n"); + return -EINVAL; } - return true; + return 0; } static struct xt_target log_tg_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index dada0863946d..d2ed9dc74ebc 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -8,7 +8,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/inetdevice.h> #include <linux/ip.h> @@ -28,23 +28,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); /* FIXME: Multiple targets. --RR */ -static bool masquerade_tg_check(const struct xt_tgchk_param *par) +static int masquerade_tg_check(const struct xt_tgchk_param *par) { const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { - pr_debug("masquerade_check: bad MAP_IPS.\n"); - return false; + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; } if (mr->rangesize != 1) { - pr_debug("masquerade_check: bad rangesize %u\n", mr->rangesize); - return false; + pr_debug("bad rangesize %u\n", mr->rangesize); + return -EINVAL; } - return true; + return 0; } static unsigned int -masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) +masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; struct nf_conn_nat *nat; @@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. @@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) rt = skb_rtable(skb); newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { - printk("MASQUERADE: %s ate my IP address\n", par->out->name); + pr_info("%s ate my IP address\n", par->out->name); return NF_DROP; } diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 7c29582d4ec8..f43867d1697f 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -9,7 +9,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ip.h> #include <linux/module.h> #include <linux/netdevice.h> @@ -22,23 +22,23 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); -static bool netmap_tg_check(const struct xt_tgchk_param *par) +static int netmap_tg_check(const struct xt_tgchk_param *par) { const struct nf_nat_multi_range_compat *mr = par->targinfo; if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { - pr_debug("NETMAP:check: bad MAP_IPS.\n"); - return false; + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; } if (mr->rangesize != 1) { - pr_debug("NETMAP:check: bad rangesize %u.\n", mr->rangesize); - return false; + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; } - return true; + return 0; } static unsigned int -netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) +netmap_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 698e5e78685b..18a0656505a0 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/ip.h> #include <linux/timer.h> @@ -26,23 +26,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); /* FIXME: Take multiple ranges --RR */ -static bool redirect_tg_check(const struct xt_tgchk_param *par) +static int redirect_tg_check(const struct xt_tgchk_param *par) { const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { - pr_debug("redirect_check: bad MAP_IPS.\n"); - return false; + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; } if (mr->rangesize != 1) { - pr_debug("redirect_check: bad rangesize %u.\n", mr->rangesize); - return false; + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; } - return true; + return 0; } static unsigned int -redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) +redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index c93ae44bff2a..f5f4a888e4ec 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -9,9 +9,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -135,13 +136,10 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) } static unsigned int -reject_tg(struct sk_buff *skb, const struct xt_target_param *par) +reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; - /* WARNING: This code causes reentry within iptables. - This means that the iptables jump stack is now crap. We - must return an absolute verdict. --RR */ switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: send_unreach(skb, ICMP_NET_UNREACH); @@ -174,23 +172,23 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static bool reject_tg_check(const struct xt_tgchk_param *par) +static int reject_tg_check(const struct xt_tgchk_param *par) { const struct ipt_reject_info *rejinfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); - return false; + pr_info("ECHOREPLY no longer supported.\n"); + return -EINVAL; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ - if (e->ip.proto != IPPROTO_TCP - || (e->ip.invflags & XT_INV_PROTO)) { - printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); - return false; + if (e->ip.proto != IPPROTO_TCP || + (e->ip.invflags & XT_INV_PROTO)) { + pr_info("TCP_RESET invalid for non-tcp\n"); + return -EINVAL; } } - return true; + return 0; } static struct xt_target reject_tg_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index d32cc4bb328a..446e0f467a17 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -29,10 +29,11 @@ * Specify, after how many hundredths of a second the queue should be * flushed even if it is not full yet. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/socket.h> +#include <linux/slab.h> #include <linux/skbuff.h> #include <linux/kernel.h> #include <linux/timer.h> @@ -56,8 +57,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ -#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0) - static unsigned int nlbufsiz = NLMSG_GOODSIZE; module_param(nlbufsiz, uint, 0400); MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); @@ -90,12 +89,12 @@ static void ulog_send(unsigned int nlgroupnum) ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; if (timer_pending(&ub->timer)) { - pr_debug("ipt_ULOG: ulog_send: timer was pending, deleting\n"); + pr_debug("ulog_send: timer was pending, deleting\n"); del_timer(&ub->timer); } if (!ub->skb) { - pr_debug("ipt_ULOG: ulog_send: nothing to send\n"); + pr_debug("ulog_send: nothing to send\n"); return; } @@ -104,7 +103,7 @@ static void ulog_send(unsigned int nlgroupnum) ub->lastnlh->nlmsg_type = NLMSG_DONE; NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; - pr_debug("ipt_ULOG: throwing %d packets to netlink group %u\n", + pr_debug("throwing %d packets to netlink group %u\n", ub->qlen, nlgroupnum + 1); netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); @@ -117,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum) /* timer function to flush queue in flushtimeout time */ static void ulog_timer(unsigned long data) { - pr_debug("ipt_ULOG: timer function called, calling ulog_send\n"); + pr_debug("timer function called, calling ulog_send\n"); /* lock to protect against somebody modifying our structure * from ipt_ulog_target at the same time */ @@ -138,7 +137,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) n = max(size, nlbufsiz); skb = alloc_skb(n, GFP_ATOMIC); if (!skb) { - PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", n); + pr_debug("cannot alloc whole buffer %ub!\n", n); if (n > size) { /* try to allocate only as much as we need for @@ -146,8 +145,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) skb = alloc_skb(size, GFP_ATOMIC); if (!skb) - PRINTR("ipt_ULOG: can't even allocate %ub\n", - size); + pr_debug("cannot even allocate %ub\n", size); } } @@ -198,8 +196,7 @@ static void ipt_ulog_packet(unsigned int hooknum, goto alloc_failure; } - pr_debug("ipt_ULOG: qlen %d, qthreshold %Zu\n", ub->qlen, - loginfo->qthreshold); + pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, @@ -226,9 +223,9 @@ static void ipt_ulog_packet(unsigned int hooknum, else *(pm->prefix) = '\0'; - if (in && in->hard_header_len > 0 - && skb->mac_header != skb->network_header - && in->hard_header_len <= ULOG_MAC_LEN) { + if (in && in->hard_header_len > 0 && + skb->mac_header != skb->network_header && + in->hard_header_len <= ULOG_MAC_LEN) { memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); pm->mac_len = in->hard_header_len; } else @@ -272,16 +269,14 @@ static void ipt_ulog_packet(unsigned int hooknum, return; nlmsg_failure: - PRINTR("ipt_ULOG: error during NLMSG_PUT\n"); - + pr_debug("error during NLMSG_PUT\n"); alloc_failure: - PRINTR("ipt_ULOG: Error building netlink message\n"); - + pr_debug("Error building netlink message\n"); spin_unlock_bh(&ulog_lock); } static unsigned int -ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) +ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { ipt_ulog_packet(par->hooknum, skb, par->in, par->out, par->targinfo, NULL); @@ -313,21 +308,20 @@ static void ipt_logfn(u_int8_t pf, ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } -static bool ulog_tg_check(const struct xt_tgchk_param *par) +static int ulog_tg_check(const struct xt_tgchk_param *par) { const struct ipt_ulog_info *loginfo = par->targinfo; if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { - pr_debug("ipt_ULOG: prefix term %i\n", - loginfo->prefix[sizeof(loginfo->prefix) - 1]); - return false; + pr_debug("prefix not null-terminated\n"); + return -EINVAL; } if (loginfo->qthreshold > ULOG_MAX_QLEN) { - pr_debug("ipt_ULOG: queue threshold %Zu > MAX_QLEN\n", + pr_debug("queue threshold %Zu > MAX_QLEN\n", loginfo->qthreshold); - return false; + return -EINVAL; } - return true; + return 0; } #ifdef CONFIG_COMPAT @@ -338,7 +332,7 @@ struct compat_ipt_ulog_info { char prefix[ULOG_PREFIX_LEN]; }; -static void ulog_tg_compat_from_user(void *dst, void *src) +static void ulog_tg_compat_from_user(void *dst, const void *src) { const struct compat_ipt_ulog_info *cl = src; struct ipt_ulog_info l = { @@ -351,7 +345,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src) memcpy(dst, &l, sizeof(l)); } -static int ulog_tg_compat_to_user(void __user *dst, void *src) +static int ulog_tg_compat_to_user(void __user *dst, const void *src) { const struct ipt_ulog_info *l = src; struct compat_ipt_ulog_info cl = { @@ -389,10 +383,10 @@ static int __init ulog_tg_init(void) { int ret, i; - pr_debug("ipt_ULOG: init module\n"); + pr_debug("init module\n"); if (nlbufsiz > 128*1024) { - printk("Netlink buffer has to be <= 128kB\n"); + pr_warning("Netlink buffer has to be <= 128kB\n"); return -EINVAL; } @@ -422,7 +416,7 @@ static void __exit ulog_tg_exit(void) ulog_buff_t *ub; int i; - pr_debug("ipt_ULOG: cleanup_module\n"); + pr_debug("cleanup_module\n"); if (nflog) nf_log_unregister(&ipt_ulog_logger); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 3b216be3bc9f..db8bff0fb86d 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -8,7 +8,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -30,7 +30,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev, } static bool -addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); const struct ipt_addrtype_info *info = par->matchinfo; @@ -48,7 +48,7 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) } static bool -addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) +addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); const struct ipt_addrtype_info_v1 *info = par->matchinfo; @@ -70,34 +70,34 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { struct ipt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - printk(KERN_ERR "ipt_addrtype: both incoming and outgoing " - "interface limitation cannot be selected\n"); - return false; + pr_info("both incoming and outgoing " + "interface limitation cannot be selected\n"); + return -EINVAL; } if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - printk(KERN_ERR "ipt_addrtype: output interface limitation " - "not valid in PRE_ROUTING and INPUT\n"); - return false; + pr_info("output interface limitation " + "not valid in PREROUTING and INPUT\n"); + return -EINVAL; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { - printk(KERN_ERR "ipt_addrtype: input interface limitation " - "not valid in POST_ROUTING and OUTPUT\n"); - return false; + pr_info("input interface limitation " + "not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; } - return true; + return 0; } static struct xt_match addrtype_mt_reg[] __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 0104c0b399de..14a2aa8b8a14 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -5,7 +5,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -18,25 +18,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); -#ifdef DEBUG_CONNTRACK -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - /* Returns 1 if the spi is matched by the range, 0 otherwise */ static inline bool spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; - duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', - min,spi,max); + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); r=(spi >= min && spi <= max) ^ invert; - duprintf(" result %s\n",r? "PASS" : "FAILED"); + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) +static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; @@ -51,8 +45,8 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil AH tinygram.\n"); - *par->hotdrop = true; + pr_debug("Dropping evil AH tinygram.\n"); + par->hotdrop = true; return 0; } @@ -61,16 +55,16 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(ahinfo->invflags & IPT_AH_INV_SPI)); } -static bool ah_mt_check(const struct xt_mtchk_param *par) +static int ah_mt_check(const struct xt_mtchk_param *par) { const struct ipt_ah *ahinfo = par->matchinfo; /* Must specify no unknown invflags */ if (ahinfo->invflags & ~IPT_AH_INV_MASK) { - duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags); - return false; + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; } - return true; + return 0; } static struct xt_match ah_mt_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 6289b64144c6..af6e9c778345 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/ip.h> #include <net/ip.h> @@ -67,7 +67,7 @@ static inline bool match_tcp(const struct sk_buff *skb, return true; } -static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) +static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ipt_ecn_info *info = par->matchinfo; @@ -78,32 +78,31 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { if (ip_hdr(skb)->protocol != IPPROTO_TCP) return false; - if (!match_tcp(skb, info, par->hotdrop)) + if (!match_tcp(skb, info, &par->hotdrop)) return false; } return true; } -static bool ecn_mt_check(const struct xt_mtchk_param *par) +static int ecn_mt_check(const struct xt_mtchk_param *par) { const struct ipt_ecn_info *info = par->matchinfo; const struct ipt_ip *ip = par->entryinfo; if (info->operation & IPT_ECN_OP_MATCH_MASK) - return false; + return -EINVAL; if (info->invert & IPT_ECN_OP_MATCH_MASK) - return false; + return -EINVAL; - if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) - && ip->proto != IPPROTO_TCP) { - printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" - " non-tcp packets\n"); - return false; + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && + ip->proto != IPPROTO_TCP) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; } - return true; + return 0; } static struct xt_match ecn_mt_reg __read_mostly = { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index c30a969724f8..c37641e819f2 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); @@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_filter = { +static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_FILTER, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_filter); -} - static unsigned int -ipt_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_filter); -} + const struct net *net; -static unsigned int -ipt_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_filter); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = ipt_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = ipt_local_out_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_FILTER, - }, -}; +static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static int forward = NF_ACCEPT; @@ -128,9 +57,18 @@ module_param(forward, bool, 0000); static int __net_init iptable_filter_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ipt_standard *)repl->entries)[1].target.verdict = + -forward - 1; + net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, &initial_table.repl); + ipt_register_table(net, &packet_filter, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_filter)) return PTR_ERR(net->ipv4.iptable_filter); return 0; @@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_filter); + ipt_unregister_table(net, net->ipv4.iptable_filter); } static struct pernet_operations iptable_filter_net_ops = { @@ -151,21 +89,20 @@ static int __init iptable_filter_init(void) int ret; if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); + pr_err("iptables forward must be 0 or 1\n"); return -EINVAL; } - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; - ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); goto cleanup_table; + } return ret; @@ -176,7 +113,7 @@ static int __init iptable_filter_init(void) static void __exit iptable_filter_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 4087614d9519..294a2a32f293 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -12,6 +12,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> @@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[5]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "mangle", - .valid_hooks = MANGLE_VALID_HOOKS, - .num_entries = 6, - .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_mangler = { +static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_MANGLE, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ipt_pre_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_post_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_forward_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - static unsigned int -ipt_local_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) { unsigned int ret; const struct iphdr *iph; @@ -130,8 +46,8 @@ ipt_local_hook(unsigned int hook, u_int32_t mark; /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; /* Save things which could affect route */ @@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook, daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, hook, in, out, + ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { @@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook, return ret; } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_pre_routing_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_forward_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_local_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_post_routing_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_MANGLE, - }, -}; +/* The work comes in here from netfilter.c. */ +static unsigned int +iptable_mangle_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (hook == NF_INET_LOCAL_OUT) + return ipt_mangle_out(skb, out); + if (hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, hook, in, out, + dev_net(out)->ipv4.iptable_mangle); + /* PREROUTING/INPUT/FORWARD: */ + return ipt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.iptable_mangle); +} + +static struct nf_hook_ops *mangle_ops __read_mostly; static int __net_init iptable_mangle_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, &initial_table.repl); + ipt_register_table(net, &packet_mangler, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_mangle)) return PTR_ERR(net->ipv4.iptable_mangle); return 0; @@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_mangle); + ipt_unregister_table(net, net->ipv4.iptable_mangle); } static struct pernet_operations iptable_mangle_net_ops = { @@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); goto cleanup_table; + } return ret; @@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index e5356da1fb54..07fb710cd722 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -5,94 +5,49 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[2]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "raw", - .valid_hooks = RAW_VALID_HOOKS, - .num_entries = 3, - .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_raw = { +static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW, }; /* The work comes in here from netfilter.c. */ static unsigned int -ipt_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_raw_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_raw); -} + const struct net *net; -static unsigned int -ipt_local_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_raw); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); } -/* 'raw' is the very first table. */ -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_hook, - .pf = PF_INET, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_RAW, - .owner = THIS_MODULE, - }, - { - .hook = ipt_local_hook, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_RAW, - .owner = THIS_MODULE, - }, -}; +static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init iptable_raw_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, &initial_table.repl); + ipt_register_table(net, &packet_raw, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_raw)) return PTR_ERR(net->ipv4.iptable_raw); return 0; @@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net) static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_raw); + ipt_unregister_table(net, net->ipv4.iptable_raw); } static struct pernet_operations iptable_raw_net_ops = { @@ -117,9 +72,11 @@ static int __init iptable_raw_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); goto cleanup_table; + } return ret; @@ -130,7 +87,7 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 29ab630f240a..be45bdc4c602 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -17,6 +17,7 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); @@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "security", - .valid_hooks = SECURITY_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table security_table = { +static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_SECURITY, }; static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_security); -} - -static unsigned int -ipt_forward_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_security_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_security); -} + const struct net *net; -static unsigned int -ipt_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* Somebody is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_security); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_SECURITY, - }, - { - .hook = ipt_forward_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_SECURITY, - }, - { - .hook = ipt_local_out_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_SECURITY, - }, -}; +static struct nf_hook_ops *sectbl_ops __read_mostly; static int __net_init iptable_security_net_init(struct net *net) { - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, &initial_table.repl); + struct ipt_replace *repl; + repl = ipt_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_security)) return PTR_ERR(net->ipv4.iptable_security); @@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net) static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_security); + ipt_unregister_table(net, net->ipv4.iptable_security); } static struct pernet_operations iptable_security_net_ops = { @@ -154,9 +90,11 @@ static int __init iptable_security_init(void) if (ret < 0) return ret; - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); goto cleanup_table; + } return ret; @@ -167,7 +105,7 @@ cleanup_table: static void __exit iptable_security_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 7d2ead7228ac..5a03c02af999 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -22,10 +22,12 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> +#include <net/netfilter/nf_log.h> int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, struct nf_conn *ct, @@ -113,8 +115,11 @@ static unsigned int ipv4_confirm(unsigned int hooknum, ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), ct, ctinfo); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, + "nf_ct_%s: dropping packet", helper->name); return ret; + } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { typeof(nf_nat_seq_adjust_hook) seq_adjust; @@ -158,28 +163,28 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, @@ -191,7 +196,6 @@ static int log_invalid_proto_max = 255; static ctl_table ip_ct_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, .procname = "ip_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), @@ -199,7 +203,6 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, .procname = "ip_conntrack_count", .data = &init_net.ct.count, .maxlen = sizeof(int), @@ -207,15 +210,13 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, .procname = "ip_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, .procname = "ip_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), @@ -223,19 +224,15 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, .procname = "ip_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &log_invalid_proto_min, .extra2 = &log_invalid_proto_max, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ @@ -251,16 +248,16 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) struct nf_conntrack_tuple tuple; memset(&tuple, 0, sizeof(tuple)); - tuple.src.u3.ip = inet->rcv_saddr; - tuple.src.u.tcp.port = inet->sport; - tuple.dst.u3.ip = inet->daddr; - tuple.dst.u.tcp.port = inet->dport; + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; tuple.src.l3num = PF_INET; - tuple.dst.protonum = IPPROTO_TCP; + tuple.dst.protonum = sk->sk_protocol; - /* We only do TCP at the moment: is there a better way? */ - if (strcmp(sk->sk_prot->name, "TCP")) { - pr_debug("SO_ORIGINAL_DST: Not a TCP socket\n"); + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); return -ENOPROTOOPT; } @@ -270,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(sock_net(sk), &tuple); + h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -385,32 +382,32 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); if (ret < 0) { - printk("nf_conntrack_ipv4: can't register tcp.\n"); + pr_err("nf_conntrack_ipv4: can't register tcp.\n"); goto cleanup_sockopt; } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); if (ret < 0) { - printk("nf_conntrack_ipv4: can't register udp.\n"); + pr_err("nf_conntrack_ipv4: can't register udp.\n"); goto cleanup_tcp; } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); if (ret < 0) { - printk("nf_conntrack_ipv4: can't register icmp.\n"); + pr_err("nf_conntrack_ipv4: can't register icmp.\n"); goto cleanup_udp; } ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); if (ret < 0) { - printk("nf_conntrack_ipv4: can't register ipv4\n"); + pr_err("nf_conntrack_ipv4: can't register ipv4\n"); goto cleanup_icmp; } ret = nf_register_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); if (ret < 0) { - printk("nf_conntrack_ipv4: can't register hooks.\n"); + pr_err("nf_conntrack_ipv4: can't register hooks.\n"); goto cleanup_ipv4; } #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 8668a3defda6..244f7cb08d68 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); if (!is_a_nulls(n)) @@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(head->next); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= net->ct.htable_size) return NULL; } head = rcu_dereference(net->ct.hash[st->bucket].first); @@ -336,12 +336,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x \n", + "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->searched, st->found, @@ -358,7 +358,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->expect_new, st->expect_create, - st->expect_delete + st->expect_delete, + st->search_restart ); return 0; } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index d71ba7677344..7404bde95994 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; @@ -54,8 +55,8 @@ static const u_int8_t invmap[] = { static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - if (orig->dst.u.icmp.type >= sizeof(invmap) - || !invmap[orig->dst.u.icmp.type]) + if (orig->dst.u.icmp.type >= sizeof(invmap) || + !invmap[orig->dst.u.icmp.type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -101,8 +102,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, [ICMP_ADDRESS] = 1 }; - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) - || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || + !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); @@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct net *net, struct sk_buff *skb, +icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; NF_CT_ASSERT(skb->nfct == NULL); @@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, &innertuple); + h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb, /* Small and modified version of icmp_rcv */ static int -icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, +icmp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmphdr *icmph; @@ -201,14 +204,14 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH - && icmph->type != ICMP_SOURCE_QUENCH - && icmph->type != ICMP_TIME_EXCEEDED - && icmph->type != ICMP_PARAMETERPROB - && icmph->type != ICMP_REDIRECT) + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(net, skb, ctinfo, hooknum); + return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) @@ -238,17 +241,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { - if (!tb[CTA_PROTO_ICMP_TYPE] - || !tb[CTA_PROTO_ICMP_CODE] - || !tb[CTA_PROTO_ICMP_ID]) + if (!tb[CTA_PROTO_ICMP_TYPE] || + !tb[CTA_PROTO_ICMP_CODE] || + !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - if (tuple->dst.u.icmp.type >= sizeof(invmap) - || !invmap[tuple->dst.u.icmp.type]) + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) return -EINVAL; return 0; @@ -270,9 +273,7 @@ static struct ctl_table icmp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table icmp_compat_sysctl_table[] = { @@ -283,9 +284,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index fa2d6b6fc3e5..cb763ae9ed90 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -14,8 +14,13 @@ #include <net/route.h> #include <net/ip.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include <net/netfilter/nf_conntrack.h> +#endif +#include <net/netfilter/nf_conntrack_zones.h> /* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -34,6 +39,27 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } +static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP_DEFRAG_CONNTRACK_IN + zone; + else + return IP_DEFRAG_CONNTRACK_OUT + zone; +} + static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, @@ -44,16 +70,14 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; #endif #endif /* Gather fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_INET_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT)) + enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); + if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 3229e0a81ba6..4f8bddb760c9 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> +#include <linux/gfp.h> #include <net/checksum.h> #include <net/icmp.h> #include <net/ip.h> @@ -30,14 +31,12 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_zones.h> static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; -/* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size __read_mostly; - #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; @@ -72,15 +71,16 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. */ hash = jhash_3words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->src.u.all, + (__force u32)tuple->src.u.all ^ zone, tuple->dst.protonum, 0); - return ((u64)hash * nf_nat_htable_size) >> 32; + return ((u64)hash * net->ipv4.nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -142,12 +142,12 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(struct net *net, +find_appropriate_src(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; const struct hlist_node *n; @@ -155,7 +155,7 @@ find_appropriate_src(struct net *net, rcu_read_lock(); hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple)) { + if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -178,7 +178,7 @@ find_appropriate_src(struct net *net, the ip with the lowest src-ip/dst-ip/proto usage. */ static void -find_best_ips_proto(struct nf_conntrack_tuple *tuple, +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) @@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, range->flags & IP_NAT_RANGE_PERSISTENT ? - (__force u32)tuple->dst.u3.ip : 0, 0); + 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); j = ((u64)j * (maxip - minip + 1)) >> 32; *var_ipp = htonl(minip + j); } @@ -232,6 +232,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, { struct net *net = nf_ct_net(ct); const struct nf_nat_protocol *proto; + u16 zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,7 +243,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, orig_tuple, tuple, range)) { + if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -252,7 +253,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* 2) Select the least-used IP/proto combination in the given range. */ *tuple = *orig_tuple; - find_best_ips_proto(tuple, range, ct, maniptype); + find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ @@ -330,7 +331,8 @@ nf_nat_setup_info(struct nf_conn *ct, if (have_to_hash) { unsigned int srchash; - srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + srchash = hash_by_src(net, nf_ct_zone(ct), + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); @@ -620,7 +622,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { }; static int -nfnetlink_parse_nat(struct nlattr *nat, +nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range *range) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -656,7 +658,7 @@ nfnetlink_parse_nat(struct nlattr *nat, static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, - struct nlattr *attr) + const struct nlattr *attr) { struct nf_nat_range range; @@ -671,7 +673,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, - struct nlattr *attr) + const struct nlattr *attr) { return -EOPNOTSUPP; } @@ -679,8 +681,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + /* Leave them the same for the moment. */ + net->ipv4.nat_htable_size = net->ct.htable_size; + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, + &net->ipv4.nat_vmalloced, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -703,7 +707,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - nf_nat_htable_size); + net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { @@ -724,9 +728,6 @@ static int __init nf_nat_init(void) return ret; } - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - ret = register_pernet_subsys(&nf_nat_net_ops); if (ret < 0) goto cleanup_extend; @@ -750,6 +751,8 @@ static int __init nf_nat_init(void) BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); + BUG_ON(nf_ct_nat_offset != NULL); + rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); return 0; cleanup_extend: @@ -764,6 +767,7 @@ static void __exit nf_nat_cleanup(void) nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); + rcu_assign_pointer(nf_ct_nat_offset, NULL); synchronize_net(); } diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index a1d5d58a58bf..86e0e84ff0a0 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp"); /* FIXME: Time out? --RR */ -static int -mangle_rfc959_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type, + char *buffer, size_t buflen, + __be32 addr, u16 port) { - char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; - - sprintf(buffer, "%u,%u,%u,%u,%u,%u", - NIPQUAD(newip), port>>8, port&0xFF); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); - - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_eprt_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - char buffer[sizeof("|1|255.255.255.255|65535|")]; - - sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); - - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_epsv_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - char buffer[sizeof("|||65535|")]; - - sprintf(buffer, "|||%u|", port); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); + switch (type) { + case NF_CT_FTP_PORT: + case NF_CT_FTP_PASV: + return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&addr)[0], + ((unsigned char *)&addr)[1], + ((unsigned char *)&addr)[2], + ((unsigned char *)&addr)[3], + port >> 8, + port & 0xFF); + case NF_CT_FTP_EPRT: + return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port); + case NF_CT_FTP_EPSV: + return snprintf(buffer, buflen, "|||%u|", port); + } - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); + return 0; } -static int (*mangle[])(struct sk_buff *, __be32, u_int16_t, - unsigned int, unsigned int, struct nf_conn *, - enum ip_conntrack_info) -= { - [NF_CT_FTP_PORT] = mangle_rfc959_packet, - [NF_CT_FTP_PASV] = mangle_rfc959_packet, - [NF_CT_FTP_EPRT] = mangle_eprt_packet, - [NF_CT_FTP_EPSV] = mangle_epsv_packet -}; - /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ static unsigned int nf_nat_ftp(struct sk_buff *skb, @@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, u_int16_t port; int dir = CTINFO2DIR(ctinfo); struct nf_conn *ct = exp->master; + char buffer[sizeof("|1|255.255.255.255|65535|")]; + unsigned int buflen; pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); @@ -132,11 +87,21 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, if (port == 0) return NF_DROP; - if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) { - nf_ct_unexpect_related(exp); - return NF_DROP; - } + buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port); + if (!buflen) + goto out; + + pr_debug("calling nf_nat_mangle_tcp_packet\n"); + + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, + matchlen, buffer, buflen)) + goto out; + return NF_ACCEPT; + +out: + nf_ct_unexpect_related(exp); + return NF_DROP; } static void __exit nf_nat_ftp_fini(void) diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 7e8e6fc75413..5045196d853c 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -10,7 +10,6 @@ */ #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/tcp.h> #include <net/tcp.h> @@ -44,7 +43,7 @@ static int set_addr(struct sk_buff *skb, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { if (net_ratelimit()) - printk("nf_nat_h323: nf_nat_mangle_tcp_packet" + pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet" " error\n"); return -1; } @@ -60,7 +59,7 @@ static int set_addr(struct sk_buff *skb, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { if (net_ratelimit()) - printk("nf_nat_h323: nf_nat_mangle_udp_packet" + pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet" " error\n"); return -1; } @@ -216,7 +215,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, /* Run out of expectations */ if (i >= H323_RTP_CHANNEL_MAX) { if (net_ratelimit()) - printk("nf_nat_h323: out of expectations\n"); + pr_notice("nf_nat_h323: out of expectations\n"); return 0; } @@ -235,7 +234,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, if (nated_port == 0) { /* No port available */ if (net_ratelimit()) - printk("nf_nat_h323: out of RTP ports\n"); + pr_notice("nf_nat_h323: out of RTP ports\n"); return 0; } @@ -292,7 +291,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, if (nated_port == 0) { /* No port available */ if (net_ratelimit()) - printk("nf_nat_h323: out of TCP ports\n"); + pr_notice("nf_nat_h323: out of TCP ports\n"); return 0; } @@ -342,7 +341,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, if (nated_port == 0) { /* No port available */ if (net_ratelimit()) - printk("nf_nat_q931: out of TCP ports\n"); + pr_notice("nf_nat_q931: out of TCP ports\n"); return 0; } @@ -426,7 +425,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, if (nated_port == 0) { /* No port available */ if (net_ratelimit()) - printk("nf_nat_ras: out of TCP ports\n"); + pr_notice("nf_nat_ras: out of TCP ports\n"); return 0; } @@ -508,7 +507,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, if (nated_port == 0) { /* No port available */ if (net_ratelimit()) - printk("nf_nat_q931: out of TCP ports\n"); + pr_notice("nf_nat_q931: out of TCP ports\n"); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 09172a65d9b6..4a0c6b548eee 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. */ #include <linux/module.h> +#include <linux/gfp.h> #include <linux/kmod.h> #include <linux/types.h> #include <linux/timer.h> @@ -41,18 +42,14 @@ adjust_tcp_sequence(u32 seq, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - int dir; - struct nf_nat_seq *this_way, *other_way; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way = &nat->seq[dir]; - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq); + pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", + seq, sizediff); - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; - - pr_debug("nf_nat_resize_packet: Seq_offset before: "); + pr_debug("adjust_tcp_sequence: Seq_offset before: "); DUMP_OFFSET(this_way); spin_lock_bh(&nf_nat_seqofs_lock); @@ -63,16 +60,38 @@ adjust_tcp_sequence(u32 seq, * retransmit */ if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += sizediff; } spin_unlock_bh(&nf_nat_seqofs_lock); - pr_debug("nf_nat_resize_packet: Seq_offset after: "); + pr_debug("adjust_tcp_sequence: Seq_offset after: "); DUMP_OFFSET(this_way); } +/* Get the offset value, for conntrack */ +s16 nf_nat_get_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way; + s16 offset; + + if (!nat) + return 0; + + this_way = &nat->seq[dir]; + spin_lock_bh(&nf_nat_seqofs_lock); + offset = after(seq, this_way->correction_pos) + ? this_way->offset_after : this_way->offset_before; + spin_unlock_bh(&nf_nat_seqofs_lock); + + return offset; +} +EXPORT_SYMBOL_GPL(nf_nat_get_offset); + /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -123,6 +142,17 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) return 1; } +void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s16 off) +{ + if (!off) + return; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); + nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); +} +EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); + /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -131,14 +161,13 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) * skb enlargement, ... * * */ -int -nf_nat_mangle_tcp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) { struct rtable *rt = skb_rtable(skb); struct iphdr *iph; @@ -184,21 +213,13 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, inet_proto_csum_replace2(&tcph->check, skb, htons(oldlen), htons(datalen), 1); - if (rep_len != match_len) { - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(tcph->seq), - (int)rep_len - (int)match_len, - ct, ctinfo); - /* Tell TCP window tracking about seq change */ - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), - ct, CTINFO2DIR(ctinfo), - (int)rep_len - (int)match_len); - - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); - } + if (adjust && rep_len != match_len) + nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); + return 1; } -EXPORT_SYMBOL(nf_nat_mangle_tcp_packet); +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); /* Generic function for mangling variable-length address changes inside * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX @@ -415,12 +436,7 @@ nf_nat_seq_adjust(struct sk_buff *skb, tcph->seq = newseq; tcph->ack_seq = newack; - if (!nf_nat_sack_adjust(skb, tcph, ct, ctinfo)) - return 0; - - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, dir, seqoff); - - return 1; + return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); } /* Setup NAT on this expected conntrack so it follows master. */ diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 9eb171056c63..4c060038d29f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -25,6 +25,7 @@ #include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> @@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple_ip(&t); - other_exp = nf_ct_expect_find_get(net, &t); + other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t); if (other_exp) { nf_ct_unexpect_related(other_exp); nf_ct_expect_put(other_exp); diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 6348a793936e..98ed78281aee 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -7,6 +7,7 @@ */ /* Everything about the rules for NAT. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/ip.h> #include <linux/netfilter.h> @@ -15,6 +16,7 @@ #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> +#include <linux/slab.h> #include <net/checksum.h> #include <net/route.h> #include <linux/bitops.h> @@ -28,46 +30,16 @@ (1 << NF_INET_POST_ROUTING) | \ (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} nat_initial_table __net_initdata = { - .repl = { - .name = "nat", - .valid_hooks = NAT_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table nat_table = { +static const struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; /* Source NAT */ static unsigned int -ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) +ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -86,7 +58,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) } static unsigned int -ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) +ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -103,28 +75,28 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); } -static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) +static int ipt_snat_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { - printk("SNAT: multiple ranges no longer supported\n"); - return false; + pr_info("SNAT: multiple ranges no longer supported\n"); + return -EINVAL; } - return true; + return 0; } -static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) +static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { - printk("DNAT: multiple ranges no longer supported\n"); - return false; + pr_info("DNAT: multiple ranges no longer supported\n"); + return -EINVAL; } - return true; + return 0; } unsigned int @@ -186,8 +158,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = { static int __net_init nf_nat_rule_net_init(struct net *net) { - net->ipv4.nat_table = ipt_register_table(net, &nat_table, - &nat_initial_table.repl); + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&nat_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl); + kfree(repl); if (IS_ERR(net->ipv4.nat_table)) return PTR_ERR(net->ipv4.nat_table); return 0; @@ -195,7 +172,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net) static void __net_exit nf_nat_rule_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.nat_table); + ipt_unregister_table(net, net->ipv4.nat_table); } static struct pernet_operations nf_nat_rule_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 07d61a57613c..11b538deaaec 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -1,4 +1,4 @@ -/* SIP extension for UDP NAT alteration. +/* SIP extension for NAT alteration. * * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> * based on RR's ip_nat_ftp.c and other modules. @@ -15,6 +15,7 @@ #include <linux/ip.h> #include <net/ip.h> #include <linux/udp.h> +#include <linux/tcp.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> @@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper"); MODULE_ALIAS("ip_nat_sip"); -static unsigned int mangle_packet(struct sk_buff *skb, +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, const char *buffer, unsigned int buflen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - - if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen, - buffer, buflen)) - return 0; + struct tcphdr *th; + unsigned int baseoff; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) { + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + baseoff = ip_hdrlen(skb) + th->doff * 4; + matchoff += dataoff - baseoff; + + if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen, false)) + return 0; + } else { + baseoff = ip_hdrlen(skb) + sizeof(struct udphdr); + matchoff += dataoff - baseoff; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen)) + return 0; + } /* Reload data pointer and adjust datalen value */ - *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); + *dptr = skb->data + dataoff; *datalen += buflen - matchlen; return 1; } -static int map_addr(struct sk_buff *skb, +static int map_addr(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, union nf_inet_addr *addr, __be16 port) @@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb, buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } -static int map_sip_addr(struct sk_buff *skb, +static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, enum sip_header_types type) { @@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, &matchoff, &matchlen, &addr, &port) <= 0) return 1; - return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port); + return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port); } -static unsigned int ip_nat_sip(struct sk_buff *skb, +static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int dataoff, matchoff, matchlen; + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; union nf_inet_addr addr; __be16 port; int request, in_header; @@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, if (ct_sip_parse_request(ct, *dptr, *datalen, &matchoff, &matchlen, &addr, &port) > 0 && - !map_addr(skb, dptr, datalen, matchoff, matchlen, + !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; request = 1; } else request = 0; + if (nf_ct_protonum(ct) == IPPROTO_TCP) + hdr = SIP_HDR_VIA_TCP; + else + hdr = SIP_HDR_VIA_UDP; + /* Translate topmost Via header and parameters */ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, - SIP_HDR_VIA, NULL, &matchoff, &matchlen, + hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { unsigned int matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; @@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, goto next; } - if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; @@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", &ct->tuplehash[!dir].tuple.dst.u3.ip); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } @@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", &ct->tuplehash[!dir].tuple.src.u3.ip); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } @@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; buflen = sprintf(buffer, "%u", ntohs(p)); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } } next: /* Translate Contact headers */ - dataoff = 0; + coff = 0; in_header = 0; - while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen, + while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, SIP_HDR_CONTACT, &in_header, &matchoff, &matchlen, &addr, &port) > 0) { - if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; } - if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) || - !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO)) + if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) return NF_DROP; + return NF_ACCEPT; } +static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) + return; + + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); +} + /* Handles expected signalling connections and media streams */ static void ip_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) @@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct, } } -static unsigned int ip_nat_sip_expect(struct sk_buff *skb, +static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, struct nf_conntrack_expect *exp, unsigned int matchoff, @@ -279,8 +318,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, if (exp->tuple.dst.u3.ip != exp->saved_ip || exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { buflen = sprintf(buffer, "%pI4:%u", &newip, port); - if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) goto err; } return NF_ACCEPT; @@ -290,7 +329,7 @@ err: return NF_DROP; } -static int mangle_content_len(struct sk_buff *skb, +static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; @@ -312,12 +351,13 @@ static int mangle_content_len(struct sk_buff *skb, return 0; buflen = sprintf(buffer, "%u", c_len); - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } -static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, unsigned int *datalen, +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, enum sdp_header_types type, enum sdp_header_types term, char *buffer, int buflen) @@ -326,16 +366,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchlen, matchoff; - if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, + if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, &matchoff, &matchlen) <= 0) return -ENOENT; - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL; } -static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, - unsigned int *datalen, +static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, enum sdp_header_types type, enum sdp_header_types term, const union nf_inet_addr *addr) @@ -344,16 +384,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, unsigned int buflen; buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term, buffer, buflen)) return 0; - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } -static unsigned int ip_nat_sdp_port(struct sk_buff *skb, - const char **dptr, - unsigned int *datalen, +static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, u_int16_t port) @@ -362,16 +401,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int buflen; buflen = sprintf(buffer, "%u", port); - if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, + if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen)) return 0; - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } -static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, - unsigned int *datalen, +static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, const union nf_inet_addr *addr) { char buffer[sizeof("nnn.nnn.nnn.nnn")]; @@ -379,12 +418,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, /* Mangle session description owner and contact addresses */ buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dptr, dataoff, datalen, + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, buffer, buflen)) return 0; - switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, + switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, buffer, buflen)) { case 0: @@ -401,14 +440,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, return 0; } - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp_media(struct sk_buff *skb, - const char **dptr, - unsigned int *datalen, +static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, struct nf_conntrack_expect *rtp_exp, struct nf_conntrack_expect *rtcp_exp, unsigned int mediaoff, @@ -456,7 +494,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, /* Update media port. */ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && - !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port)) + !ip_nat_sdp_port(skb, dataoff, dptr, datalen, + mediaoff, medialen, port)) goto err2; return NF_ACCEPT; @@ -471,6 +510,7 @@ err1: static void __exit nf_nat_sip_fini(void) { rcu_assign_pointer(nf_nat_sip_hook, NULL); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL); rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); @@ -482,12 +522,14 @@ static void __exit nf_nat_sip_fini(void) static int __init nf_nat_sip_init(void) { BUG_ON(nf_nat_sip_hook != NULL); + BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); BUG_ON(nf_nat_sip_expect_hook != NULL); BUG_ON(nf_nat_sdp_addr_hook != NULL); BUG_ON(nf_nat_sdp_port_hook != NULL); BUG_ON(nf_nat_sdp_session_hook != NULL); BUG_ON(nf_nat_sdp_media_hook != NULL); rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index d9521f6f9ed0..1679e2c0963d 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -43,6 +43,7 @@ #include <linux/moduleparam.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/udp.h> @@ -400,7 +401,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); if (*octets == NULL) { if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } @@ -451,7 +452,7 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) { if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } @@ -728,7 +729,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, if (*obj == NULL) { kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } (*obj)->syntax.l[0] = l; @@ -745,7 +746,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, kfree(p); kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } memcpy((*obj)->syntax.c, p, len); @@ -760,7 +761,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, if (*obj == NULL) { kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } if (!asn1_null_decode(ctx, end)) { @@ -781,7 +782,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, kfree(lp); kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } memcpy((*obj)->syntax.ul, lp, len); @@ -802,7 +803,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, kfree(p); kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } memcpy((*obj)->syntax.uc, p, len); @@ -820,7 +821,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, if (*obj == NULL) { kfree(id); if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); + pr_notice("OOM in bsalg (%d)\n", __LINE__); return 0; } (*obj)->syntax.ul[0] = ul; @@ -1038,7 +1039,7 @@ static int snmp_parse_mangle(unsigned char *msg, unsigned int cls, con, tag, vers, pdutype; struct asn1_ctx ctx; struct asn1_octstr comm; - struct snmp_object **obj; + struct snmp_object *obj; if (debug > 1) hex_dump(msg, len); @@ -1148,43 +1149,34 @@ static int snmp_parse_mangle(unsigned char *msg, if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) return 0; - obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (obj == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); - return 0; - } - while (!asn1_eoc_decode(&ctx, eoc)) { unsigned int i; - if (!snmp_object_decode(&ctx, obj)) { - if (*obj) { - kfree((*obj)->id); - kfree(*obj); + if (!snmp_object_decode(&ctx, &obj)) { + if (obj) { + kfree(obj->id); + kfree(obj); } - kfree(obj); return 0; } if (debug > 1) { printk(KERN_DEBUG "bsalg: object: "); - for (i = 0; i < (*obj)->id_len; i++) { + for (i = 0; i < obj->id_len; i++) { if (i > 0) printk("."); - printk("%lu", (*obj)->id[i]); + printk("%lu", obj->id[i]); } - printk(": type=%u\n", (*obj)->type); + printk(": type=%u\n", obj->type); } - if ((*obj)->type == SNMP_IPADDR) + if (obj->type == SNMP_IPADDR) mangle_address(ctx.begin, ctx.pointer - 4 , map, check); - kfree((*obj)->id); - kfree(*obj); + kfree(obj->id); + kfree(obj); } - kfree(obj); if (!asn1_eoc_decode(&ctx, eoc)) return 0; diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 5567bd0d0750..beb25819c9c9 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -7,6 +7,7 @@ */ #include <linux/types.h> #include <linux/icmp.h> +#include <linux/gfp.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -137,9 +138,8 @@ nf_nat_fn(unsigned int hooknum, ret = nf_nat_rule_find(skb, hooknum, in, out, ct); - if (ret != NF_ACCEPT) { + if (ret != NF_ACCEPT) return ret; - } } else pr_debug("Already setup manip %s for ct %p\n", maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", @@ -197,11 +197,11 @@ nf_nat_out(unsigned int hooknum, (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - if (ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip - || ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all - ) + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all) + ) return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; } #endif @@ -251,7 +251,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_in, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, @@ -259,7 +259,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_out, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, @@ -267,7 +267,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_local_fn, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, @@ -275,7 +275,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_fn, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, @@ -293,12 +293,12 @@ static int __init nf_nat_standalone_init(void) #endif ret = nf_nat_rule_init(); if (ret < 0) { - printk("nf_nat_init: can't setup rules.\n"); + pr_err("nf_nat_init: can't setup rules.\n"); goto cleanup_decode_session; } ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); if (ret < 0) { - printk("nf_nat_init: can't register hooks.\n"); + pr_err("nf_nat_init: can't register hooks.\n"); goto cleanup_rule_init; } return ret; diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index b096e81500ae..7274a43c7a12 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -6,7 +6,6 @@ */ #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/udp.h> #include <net/netfilter/nf_nat_helper.h> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f25542c48b7d..3dc9914c1dce 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_SENTINEL }; -static struct { - char *name; +static const struct { + const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, @@ -249,6 +249,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), + SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), + SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), + SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_SENTINEL }; @@ -280,7 +283,7 @@ static void icmpmsg_put(struct seq_file *seq) count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i); + val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); if (val) { type[count] = i; vals[count++] = val; @@ -307,18 +310,18 @@ static void icmp_put(struct seq_file *seq) for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) net->mib.icmpmsg_statistics, + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) net->mib.icmpmsg_statistics, + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } @@ -341,7 +344,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.ip_statistics, + snmp_fold_field((void __percpu **)net->mib.ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -356,11 +359,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)net->mib.tcp_statistics, + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.tcp_statistics, + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -371,7 +374,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.udp_statistics, + snmp_fold_field((void __percpu **)net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -382,7 +385,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.udplite_statistics, + snmp_fold_field((void __percpu **)net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -419,7 +422,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.net_statistics, + snmp_fold_field((void __percpu **)net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -429,7 +432,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.ip_statistics, + snmp_fold_field((void __percpu **)net->mib.ip_statistics, snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index ea50da0649fd..542f22fc98b3 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -22,35 +22,20 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - -#include <asm/uaccess.h> -#include <asm/system.h> +#include <linux/cache.h> #include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/timer.h> -#include <net/ip.h> +#include <linux/spinlock.h> #include <net/protocol.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/ipip.h> -#include <linux/igmp.h> -struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; +const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; static DEFINE_SPINLOCK(inet_proto_lock); /* * Add a protocol handler to the hash tables */ -int inet_add_protocol(struct net_protocol *prot, unsigned char protocol) +int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { int hash, ret; @@ -72,7 +57,7 @@ int inet_add_protocol(struct net_protocol *prot, unsigned char protocol) * Remove a protocol from the hash tables. */ -int inet_del_protocol(struct net_protocol *prot, unsigned char protocol) +int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { int hash, ret; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2979f14bb188..2c7a1639388a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -60,7 +60,6 @@ #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> -#include <linux/gfp.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> @@ -87,7 +86,7 @@ void raw_hash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; - head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; + head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; write_lock_bh(&h->lock); sk_add_node(sk, head); @@ -115,9 +114,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, sk_for_each_from(sk, node) { struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->num == num && - !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } @@ -291,8 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ - if (sock_queue_rcv_skb(sk, skb) < 0) { - atomic_inc(&sk->sk_drops); + if (ip_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } @@ -327,7 +325,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, int err; if (length > rt->u.dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, rt->u.dst.dev->mtu); return -EMSGSIZE; } @@ -352,13 +350,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->ip_summed = CHECKSUM_NONE; skb->transport_header = skb->network_header; - err = memcpy_fromiovecend((void *)iph, from, 0, length); - if (err) - goto error_fault; + err = -EFAULT; + if (memcpy_fromiovecend((void *)iph, from, 0, length)) + goto error_free; - /* We don't modify invalid header */ iphlen = iph->ihl * 4; - if (iphlen >= sizeof(*iph) && iphlen <= length) { + + /* + * We don't want to modify the ip header, but we do need to + * be sure that it won't cause problems later along the network + * stack. Specifically we want to make sure that iph->ihl is a + * sane value. If ihl points beyond the length of the buffer passed + * in, reject the frame as invalid + */ + err = -EINVAL; + if (iphlen > length) + goto error_free; + + if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = rt->rt_src; iph->check = 0; @@ -372,20 +381,21 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + rt->u.dst.dev, dst_output); if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; out: return 0; -error_fault: - err = -EFAULT; +error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !inet->recverr) + err = 0; return err; } @@ -488,10 +498,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; - daddr = inet->daddr; + daddr = inet->inet_daddr; } - ipc.addr = inet->saddr; + ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.shtx.flags = 0; ipc.oif = sk->sk_bound_dev_if; @@ -576,8 +586,11 @@ back_from_confirm: &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); - else if (!(msg->msg_flags & MSG_MORE)) + else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk); + if (err == -ENOBUFS && !inet->recverr) + err = 0; + } release_sock(sk); } done: @@ -630,9 +643,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) - inet->saddr = 0; /* Use device */ + inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: return ret; @@ -677,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (err) goto done; - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); /* Copy the address. */ if (sin) { @@ -702,7 +715,7 @@ static int raw_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); - if (inet_sk(sk)->num == IPPROTO_ICMP) + if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } @@ -736,10 +749,10 @@ out: return ret; } static int do_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (optname == ICMP_FILTER) { - if (inet_sk(sk)->num != IPPROTO_ICMP) + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); @@ -748,7 +761,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); @@ -757,7 +770,7 @@ static int raw_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_COMPAT static int compat_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level != SOL_RAW) return compat_ip_setsockopt(sk, level, optname, optval, optlen); @@ -769,7 +782,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { - if (inet_sk(sk)->num != IPPROTO_ICMP) + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); @@ -928,10 +941,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); - __be32 dest = inet->daddr, - src = inet->rcv_saddr; + __be32 dest = inet->inet_daddr, + src = inet->inet_rcv_saddr; __u16 destp = 0, - srcp = inet->num; + srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 278f46f5011b..560acc677ce4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -90,6 +90,7 @@ #include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <linux/slab.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -128,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static int rt_chain_length_max __read_mostly = 20; static struct delayed_work expires_work; @@ -146,7 +146,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(struct dst_ops *ops); -static void rt_emergency_hash_rebuild(struct net *net); static struct dst_ops ipv4_dst_ops = { @@ -258,10 +257,9 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); (__raw_get_cpu_var(rt_cache_stat).field++) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) + int genid) { - return jhash_3words((__force u32)(__be32)(daddr), - (__force u32)(__be32)(saddr), + return jhash_3words((__force u32)daddr, (__force u32)saddr, idx, genid) & rt_hash_mask; } @@ -287,12 +285,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) if (!rt_hash_table[st->bucket].chain) continue; rcu_read_lock_bh(); - r = rcu_dereference(rt_hash_table[st->bucket].chain); + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { if (dev_net(r->u.dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; - r = rcu_dereference(r->u.dst.rt_next); + r = rcu_dereference_bh(r->u.dst.rt_next); } rcu_read_unlock_bh(); } @@ -314,7 +312,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } - return rcu_dereference(r); + return rcu_dereference_bh(r); } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -378,12 +376,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) struct rtable *r = v; int len; - seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, + r->u.dst.__use, 0, (__force u32)r->rt_src, (dst_metric(&r->u.dst, RTAX_ADVMSS) ? (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), dst_metric(&r->u.dst, RTAX_WINDOW), @@ -513,43 +512,42 @@ static const struct file_operations rt_cpu_seq_fops = { }; #ifdef CONFIG_NET_CLS_ROUTE -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; +static int rt_acct_proc_show(struct seq_file *m, void *v) +{ + struct ip_rt_acct *dst, *src; + unsigned int i, j; + + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; + + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } } - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src; + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} - src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); } + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int __net_init ip_rt_do_proc_init(struct net *net) @@ -567,8 +565,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net) goto err2; #ifdef CONFIG_NET_CLS_ROUTE - pde = create_proc_read_entry("rt_acct", 0, net->proc_net, - ip_rt_acct_read, NULL); + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif @@ -588,7 +585,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_NET_CLS_ROUTE remove_proc_entry("rt_acct", net->proc_net); +#endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { @@ -685,25 +684,24 @@ static inline bool rt_caching(const struct net *net) static inline bool compare_hash_inputs(const struct flowi *fl1, const struct flowi *fl2) { - return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->iif ^ fl2->iif)) == 0); } static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | + return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | (fl1->oif ^ fl2->oif) | (fl1->iif ^ fl2->iif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); + return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); } static inline int rt_is_expired(struct rtable *rth) @@ -780,11 +778,30 @@ static void rt_do_flush(int process_context) #define FRACT_BITS 3 #define ONE (1UL << FRACT_BITS) +/* + * Given a hash chain and an item in this hash chain, + * find if a previous entry has the same hash_inputs + * (but differs on tos, mark or oif) + * Returns 0 if an alias is found. + * Returns ONE if rth has no alias before itself. + */ +static int has_noalias(const struct rtable *head, const struct rtable *rth) +{ + const struct rtable *aux = head; + + while (aux != rth) { + if (compare_hash_inputs(&aux->fl, &rth->fl)) + return 0; + aux = aux->u.dst.rt_next; + } + return ONE; +} + static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, *aux, **rthp; + struct rtable *rth, **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -835,15 +852,7 @@ nofree: * attributes don't unfairly skew * the length computation */ - for (aux = rt_hash_table[i].chain;;) { - if (aux == rth) { - length += ONE; - break; - } - if (compare_hash_inputs(&aux->fl, &rth->fl)) - break; - aux = aux->u.dst.rt_next; - } + length += has_noalias(rt_hash_table[i].chain, rth); continue; } } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) @@ -902,34 +911,17 @@ void rt_cache_flush(struct net *net, int delay) rt_do_flush(!in_softirq()); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long __net) +/* Flush previous cache invalidated entries from the cache */ +void rt_cache_flush_batch(void) { - struct net *net = (struct net *)__net; - rt_cache_invalidate(net); - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - -static void rt_secret_rebuild_oneshot(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); - rt_cache_invalidate(net); - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + rt_do_flush(!in_softirq()); } static void rt_emergency_hash_rebuild(struct net *net) { - if (net_ratelimit()) { + if (net_ratelimit()) printk(KERN_WARNING "Route hash chain too long!\n"); - printk(KERN_WARNING "Adjust your secret_interval!\n"); - } - - rt_secret_rebuild_oneshot(net); + rt_cache_invalidate(net); } /* @@ -1067,8 +1059,23 @@ work_done: out: return 0; } +/* + * Returns number of entries in a hash chain that have different hash_inputs + */ +static int slow_chain_length(const struct rtable *head) +{ + int length = 0; + const struct rtable *rth = head; + + while (rth) { + length += has_noalias(head, rth); + rth = rth->u.dst.rt_next; + } + return length >> FRACT_BITS; +} + static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb) + struct rtable **rp, struct sk_buff *skb, int ifindex) { struct rtable *rth, **rthp; unsigned long now; @@ -1179,14 +1186,20 @@ restart: rt_free(cand); } } else { - if (chain_length > rt_chain_length_max) { + if (chain_length > rt_chain_length_max && + slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { struct net *net = dev_net(rt->u.dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", rt->u.dst.dev->name, num); } - rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + rt_emergency_hash_rebuild(net); + spin_unlock_bh(rt_hash_lock_addr(hash)); + + hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + ifindex, rt_genid(net)); + goto restart; } } @@ -1346,9 +1359,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; net = dev_net(dev); - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) - || ipv4_is_zeronet(new_gw)) + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!rt_caching(net)) @@ -1411,7 +1424,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, dev_hold(rt->u.dst.dev); if (rt->idev) in_dev_hold(rt->idev); - rt->u.dst.obsolete = 0; + rt->u.dst.obsolete = -1; rt->u.dst.lastuse = jiffies; rt->u.dst.path = &rt->u.dst; rt->u.dst.neighbour = NULL; @@ -1447,7 +1460,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL)) + if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) ip_rt_put(rt); goto do_next; } @@ -1476,11 +1489,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) struct dst_entry *ret = dst; if (rt) { - if (dst->obsolete) { + if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { + (rt->u.dst.expires && + time_after_eq(jiffies, rt->u.dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1514,13 +1528,17 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); - struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + struct in_device *in_dev; + int log_martians; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->u.dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); return; - - if (!IN_DEV_TX_REDIRECTS(in_dev)) - goto out; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. @@ -1533,7 +1551,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { rt->u.dst.rate_last = jiffies; - goto out; + return; } /* Check for load limit; set rate_last to the latest sent @@ -1547,7 +1565,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) rt->u.dst.rate_last = jiffies; ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && + if (log_martians && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", @@ -1555,8 +1573,6 @@ void ip_rt_send_redirect(struct sk_buff *skb) &rt->rt_dst, &rt->rt_gateway); #endif } -out: - in_dev_put(in_dev); } static int ip_error(struct sk_buff *skb) @@ -1626,9 +1642,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, __be32 daddr = iph->daddr; unsigned short est_mtu = 0; - if (ipv4_config.no_pmtu_disc) - return 0; - for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], @@ -1697,7 +1710,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - return NULL; + if (rt_is_expired((struct rtable *)dst)) + return NULL; + return dst; } static void ipv4_dst_destroy(struct dst_entry *dst) @@ -1852,14 +1867,15 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) + dev, &spec_dst, &itag, 0) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; + rth->u.dst.output = ip_rt_bug; + rth->u.dst.obsolete = -1; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; @@ -1898,7 +1914,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); e_nobufs: in_dev_put(in_dev); @@ -1965,7 +1981,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &spec_dst, &itag, skb->mark); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1985,8 +2001,13 @@ static int __mkroute_input(struct sk_buff *skb, if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. */ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } @@ -2020,6 +2041,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; + rth->u.dst.obsolete = -1; rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); @@ -2059,7 +2081,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } /* @@ -2139,7 +2161,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, int result; result = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &spec_dst, &itag, skb->mark); if (result < 0) goto martian_source; if (result) @@ -2168,7 +2190,7 @@ brd_input: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, skb->mark); if (err < 0) goto martian_source; if (err) @@ -2184,6 +2206,7 @@ local_input: goto e_nobufs; rth->u.dst.output= ip_rt_bug; + rth->u.dst.obsolete = -1; rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); @@ -2215,7 +2238,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb); + err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); goto done; no_route: @@ -2254,8 +2277,8 @@ martian_source: goto e_inval; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, bool noref) { struct rtable * rth; unsigned hash; @@ -2273,18 +2296,23 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | + if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | + ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + if (noref) { + dst_use_noref(&rth->u.dst, jiffies); + skb_dst_set_noref(skb, &rth->u.dst); + } else { + dst_use(&rth->u.dst, jiffies); + skb_dst_set(skb, &rth->u.dst); + } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2312,10 +2340,11 @@ skip_cache: ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); @@ -2326,6 +2355,7 @@ skip_cache: } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } +EXPORT_SYMBOL(ip_route_input_common); static int __mkroute_output(struct rtable **result, struct fib_result *res, @@ -2409,6 +2439,7 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output=ip_output; + rth->u.dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2460,7 +2491,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL); + err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); } return err; @@ -2512,9 +2543,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 - && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (oldflp->oif == 0 && + (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) @@ -2683,8 +2714,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; + rth = rcu_dereference_bh(rth->u.dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && @@ -2853,7 +2884,7 @@ static int rt_fill_info(struct net *net, error = rt->u.dst.error; expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; if (rt->peer) { - id = rt->peer->ip_id_count; + id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; tsage = get_seconds() - rt->peer->tcp_ts_stamp; @@ -3002,13 +3033,13 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!rt_hash_table[h].chain) continue; rcu_read_lock_bh(); - for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference(rt->u.dst.rt_next), idx++) { + for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; + rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) continue; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set_noref(skb, &rt->u.dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { @@ -3034,7 +3065,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) #ifdef CONFIG_SYSCTL static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { @@ -3044,7 +3075,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, memcpy(&ctl, __ctl, sizeof(ctl)); ctl.data = &flush_delay; - proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); + proc_dointvec(&ctl, write, buffer, lenp, ppos); net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); @@ -3054,85 +3085,8 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - int delay; - struct net *net; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(delay, (int __user *)newval)) - return -EFAULT; - net = (struct net *)table->extra1; - rt_cache_flush(net, delay); - return 0; -} - -static void rt_secret_reschedule(int old) -{ - struct net *net; - int new = ip_rt_secret_interval; - int diff = new - old; - - if (!diff) - return; - - rtnl_lock(); - for_each_net(net) { - int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); - - if (!new) - continue; - - if (deleted) { - long time = net->ipv4.rt_secret_timer.expires - jiffies; - - if (time <= 0 || (time += diff) <= 0) - time = 0; - - net->ipv4.rt_secret_timer.expires = time; - } else - net->ipv4.rt_secret_timer.expires = new; - - net->ipv4.rt_secret_timer.expires += jiffies; - add_timer(&net->ipv4.rt_secret_timer); - } - rtnl_unlock(); -} - -static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - struct file *filp, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); - - rt_secret_reschedule(old); - - return ret; -} - -static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - int old = ip_rt_secret_interval; - int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); - - rt_secret_reschedule(old); - - return ret; -} - static ctl_table ipv4_route_table[] = { { - .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), @@ -3140,7 +3094,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), @@ -3150,43 +3103,34 @@ static ctl_table ipv4_route_table[] = { { /* Deprecated. Use gc_min_interval_ms */ - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), @@ -3194,7 +3138,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), @@ -3202,7 +3145,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), @@ -3210,7 +3152,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_COST, .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), @@ -3218,7 +3159,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), @@ -3226,7 +3166,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), @@ -3234,16 +3173,13 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, .procname = "mtu_expires", .data = &ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, .procname = "min_pmtu", .data = &ip_rt_min_pmtu, .maxlen = sizeof(int), @@ -3251,58 +3187,46 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, .procname = "min_adv_mss", .data = &ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { - .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, - .procname = "secret_interval", - .data = &ip_rt_secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ipv4_sysctl_rt_secret_interval, - .strategy = ipv4_sysctl_rt_secret_interval_strategy, - }, - { .ctl_name = 0 } + { } }; static struct ctl_table empty[1]; static struct ctl_table ipv4_skeleton[] = { - { .procname = "route", .ctl_name = NET_IPV4_ROUTE, + { .procname = "route", .mode = 0555, .child = ipv4_route_table}, - { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, + { .procname = "neigh", .mode = 0555, .child = empty}, { } }; static __net_initdata struct ctl_path ipv4_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; static struct ctl_table ipv4_route_flush_table[] = { { - .ctl_name = NET_IPV4_ROUTE_FLUSH, .procname = "flush", .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, - .strategy = ipv4_sysctl_rtcache_flush_strategy, }, - { .ctl_name = 0 }, + { }, }; static __net_initdata struct ctl_path ipv4_route_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "route", }, { }, }; @@ -3311,7 +3235,7 @@ static __net_init int sysctl_route_net_init(struct net *net) struct ctl_table *tbl; tbl = ipv4_route_flush_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; @@ -3347,39 +3271,20 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif - -static __net_init int rt_secret_timer_init(struct net *net) +static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, - (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - - net->ipv4.rt_secret_timer.function = rt_secret_rebuild; - net->ipv4.rt_secret_timer.data = (unsigned long)net; - init_timer_deferrable(&net->ipv4.rt_secret_timer); - - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires = - jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + get_random_bytes(&net->ipv4.rt_genid, + sizeof(net->ipv4.rt_genid)); return 0; } -static __net_exit void rt_secret_timer_exit(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); -} - -static __net_initdata struct pernet_operations rt_secret_timer_ops = { - .init = rt_secret_timer_init, - .exit = rt_secret_timer_exit, +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, }; #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct __read_mostly; +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ static __initdata unsigned long rhash_entries; @@ -3412,7 +3317,7 @@ int __init ip_rt_init(void) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), rhash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 15 : 17, 0, &rt_hash_log, @@ -3435,20 +3340,18 @@ int __init ip_rt_init(void) schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (register_pernet_subsys(&rt_secret_timer_ops)) - printk(KERN_ERR "Unable to setup rt_secret_timer\n"); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(); + xfrm4_init(ip_rt_max_size); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif + register_pernet_subsys(&rt_genid_ops); return rc; } @@ -3464,5 +3367,4 @@ void __init ip_static_sysctl_init(void) #endif EXPORT_SYMBOL(__ip_select_ident); -EXPORT_SYMBOL(ip_route_input); EXPORT_SYMBOL(ip_route_output_key); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e1..9f6b22206c52 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; @@ -252,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { + struct tcp_options_received tcp_opt; + u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -262,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; - struct tcp_options_received tcp_opt; if (!sysctl_tcp_syncookies || !th->ack) goto out; @@ -277,7 +279,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); if (tcp_opt.saw_tstamp) cookie_check_timestamp(&tcp_opt); @@ -332,7 +334,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi fl = { .nl_u = { .ip4_u = + struct flowi fl = { .mark = sk->sk_mark, + .nl_u = { .ip4_u = { .daddr = ((opt && opt->srr) ? opt->faddr : ireq->rmt_addr), @@ -344,7 +347,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, { .sport = th->dest, .dport = th->source } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(sock_net(sk), &rt, &fl)) { reqsk_free(req); goto out; } @@ -355,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->u.dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4710d219f06a..d96c1da4b17c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/inetdevice.h> #include <linux/seqlock.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -36,7 +37,7 @@ static void set_local_port_range(int range[2]) } /* Validate changes from /proc interface. */ -static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, +static int ipv4_local_port_range(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -51,7 +52,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, }; inet_get_local_port_range(range, range + 1); - ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) @@ -63,35 +64,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, return ret; } -/* Validate changes from sysctl interface. */ -static int ipv4_sysctl_local_port_range(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int ret; - int range[2]; - ctl_table tmp = { - .data = &range, - .maxlen = sizeof(range), - .mode = table->mode, - .extra1 = &ip_local_port_range_min, - .extra2 = &ip_local_port_range_max, - }; - - inet_get_local_port_range(range, range + 1); - ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen); - if (ret == 0 && newval && newlen) { - if (range[1] < range[0]) - ret = -EINVAL; - else - set_local_port_range(range); - } - return ret; -} - - -static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, +static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char val[TCP_CA_NAME_MAX]; @@ -103,33 +76,14 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * tcp_get_default_congestion_control(val); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(val); return ret; } -static int sysctl_tcp_congestion_control(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { - .data = val, - .maxlen = TCP_CA_NAME_MAX, - }; - int ret; - - tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); - if (ret == 1 && newval && newlen) - ret = tcp_set_default_congestion_control(val); - return ret; -} - static int proc_tcp_available_congestion_control(ctl_table *ctl, - int write, struct file * filp, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -140,13 +94,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl, if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(ctl_table *ctl, - int write, struct file * filp, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -158,39 +112,15 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); return ret; } -static int strategy_allowed_congestion_control(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; - int ret; - - tbl.data = kmalloc(tbl.maxlen, GFP_USER); - if (!tbl.data) - return -ENOMEM; - - tcp_get_available_congestion_control(tbl.data, tbl.maxlen); - ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); - if (ret == 1 && newval && newlen) - ret = tcp_set_allowed_congestion_control(tbl.data); - kfree(tbl.data); - - return ret; - -} - static struct ctl_table ipv4_table[] = { { - .ctl_name = NET_IPV4_TCP_TIMESTAMPS, .procname = "tcp_timestamps", .data = &sysctl_tcp_timestamps, .maxlen = sizeof(int), @@ -198,7 +128,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_WINDOW_SCALING, .procname = "tcp_window_scaling", .data = &sysctl_tcp_window_scaling, .maxlen = sizeof(int), @@ -206,7 +135,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_SACK, .procname = "tcp_sack", .data = &sysctl_tcp_sack, .maxlen = sizeof(int), @@ -214,7 +142,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE, .procname = "tcp_retrans_collapse", .data = &sysctl_tcp_retrans_collapse, .maxlen = sizeof(int), @@ -222,17 +149,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_DEFAULT_TTL, .procname = "ip_default_ttl", .data = &sysctl_ip_default_ttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipv4_doint_and_flush, - .strategy = ipv4_doint_and_flush_strategy, .extra2 = &init_net, }, { - .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", .data = &ipv4_config.no_pmtu_disc, .maxlen = sizeof(int), @@ -240,7 +164,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_NONLOCAL_BIND, .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -248,7 +171,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_SYN_RETRIES, .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, .maxlen = sizeof(int), @@ -256,7 +178,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_SYNACK_RETRIES, .procname = "tcp_synack_retries", .data = &sysctl_tcp_synack_retries, .maxlen = sizeof(int), @@ -264,7 +185,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_ORPHANS, .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -272,7 +192,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_TW_BUCKETS, .procname = "tcp_max_tw_buckets", .data = &tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), @@ -280,7 +199,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_DYNADDR, .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, .maxlen = sizeof(int), @@ -288,16 +206,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, .procname = "tcp_keepalive_time", .data = &sysctl_tcp_keepalive_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES, .procname = "tcp_keepalive_probes", .data = &sysctl_tcp_keepalive_probes, .maxlen = sizeof(int), @@ -305,26 +220,21 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL, .procname = "tcp_keepalive_intvl", .data = &sysctl_tcp_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_TCP_RETRIES1, .procname = "tcp_retries1", .data = &sysctl_tcp_retries1, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra2 = &tcp_retr1_max }, { - .ctl_name = NET_IPV4_TCP_RETRIES2, .procname = "tcp_retries2", .data = &sysctl_tcp_retries2, .maxlen = sizeof(int), @@ -332,17 +242,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT, .procname = "tcp_fin_timeout", .data = &sysctl_tcp_fin_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, #ifdef CONFIG_SYN_COOKIES { - .ctl_name = NET_TCP_SYNCOOKIES, .procname = "tcp_syncookies", .data = &sysctl_tcp_syncookies, .maxlen = sizeof(int), @@ -351,7 +258,6 @@ static struct ctl_table ipv4_table[] = { }, #endif { - .ctl_name = NET_TCP_TW_RECYCLE, .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), @@ -359,7 +265,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ABORT_ON_OVERFLOW, .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -367,7 +272,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_STDURG, .procname = "tcp_stdurg", .data = &sysctl_tcp_stdurg, .maxlen = sizeof(int), @@ -375,7 +279,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_RFC1337, .procname = "tcp_rfc1337", .data = &sysctl_tcp_rfc1337, .maxlen = sizeof(int), @@ -383,7 +286,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_SYN_BACKLOG, .procname = "tcp_max_syn_backlog", .data = &sysctl_max_syn_backlog, .maxlen = sizeof(int), @@ -391,17 +293,21 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, .procname = "ip_local_port_range", .data = &sysctl_local_ports.range, .maxlen = sizeof(sysctl_local_ports.range), .mode = 0644, .proc_handler = ipv4_local_port_range, - .strategy = ipv4_sysctl_local_port_range, + }, + { + .procname = "ip_local_reserved_ports", + .data = NULL, /* initialized in sysctl_ipv4_init */ + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, }, #ifdef CONFIG_IP_MULTICAST { - .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS, .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, .maxlen = sizeof(int), @@ -411,7 +317,6 @@ static struct ctl_table ipv4_table[] = { #endif { - .ctl_name = NET_IPV4_IGMP_MAX_MSF, .procname = "igmp_max_msf", .data = &sysctl_igmp_max_msf, .maxlen = sizeof(int), @@ -419,7 +324,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_INET_PEER_THRESHOLD, .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -427,43 +331,34 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_INET_PEER_MINTTL, .procname = "inet_peer_minttl", .data = &inet_peer_minttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_MAXTTL, .procname = "inet_peer_maxttl", .data = &inet_peer_maxttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME, .procname = "inet_peer_gc_mintime", .data = &inet_peer_gc_mintime, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME, .procname = "inet_peer_gc_maxtime", .data = &inet_peer_gc_maxtime, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_TCP_ORPHAN_RETRIES, .procname = "tcp_orphan_retries", .data = &sysctl_tcp_orphan_retries, .maxlen = sizeof(int), @@ -471,7 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FACK, .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -479,7 +373,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_REORDERING, .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -487,7 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ECN, .procname = "tcp_ecn", .data = &sysctl_tcp_ecn, .maxlen = sizeof(int), @@ -495,7 +387,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_DSACK, .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), @@ -503,7 +394,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MEM, .procname = "tcp_mem", .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), @@ -511,7 +401,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_WMEM, .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), @@ -519,7 +408,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_RMEM, .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -527,7 +415,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_APP_WIN, .procname = "tcp_app_win", .data = &sysctl_tcp_app_win, .maxlen = sizeof(int), @@ -535,7 +422,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ADV_WIN_SCALE, .procname = "tcp_adv_win_scale", .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), @@ -543,7 +429,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_TW_REUSE, .procname = "tcp_tw_reuse", .data = &sysctl_tcp_tw_reuse, .maxlen = sizeof(int), @@ -551,7 +436,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FRTO, .procname = "tcp_frto", .data = &sysctl_tcp_frto, .maxlen = sizeof(int), @@ -559,7 +443,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FRTO_RESPONSE, .procname = "tcp_frto_response", .data = &sysctl_tcp_frto_response, .maxlen = sizeof(int), @@ -567,7 +450,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_LOW_LATENCY, .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), @@ -575,7 +457,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, .maxlen = sizeof(int), @@ -583,7 +464,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), @@ -591,7 +471,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_TSO_WIN_DIVISOR, .procname = "tcp_tso_win_divisor", .data = &sysctl_tcp_tso_win_divisor, .maxlen = sizeof(int), @@ -599,15 +478,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_CONG_CONTROL, .procname = "tcp_congestion_control", .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, - .strategy = sysctl_tcp_congestion_control, }, { - .ctl_name = NET_TCP_ABC, .procname = "tcp_abc", .data = &sysctl_tcp_abc, .maxlen = sizeof(int), @@ -615,7 +491,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_MTU_PROBING, .procname = "tcp_mtu_probing", .data = &sysctl_tcp_mtu_probing, .maxlen = sizeof(int), @@ -623,7 +498,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_BASE_MSS, .procname = "tcp_base_mss", .data = &sysctl_tcp_base_mss, .maxlen = sizeof(int), @@ -631,7 +505,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, .procname = "tcp_workaround_signed_windows", .data = &sysctl_tcp_workaround_signed_windows, .maxlen = sizeof(int), @@ -640,7 +513,6 @@ static struct ctl_table ipv4_table[] = { }, #ifdef CONFIG_NET_DMA { - .ctl_name = NET_TCP_DMA_COPYBREAK, .procname = "tcp_dma_copybreak", .data = &sysctl_tcp_dma_copybreak, .maxlen = sizeof(int), @@ -649,7 +521,6 @@ static struct ctl_table ipv4_table[] = { }, #endif { - .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, .maxlen = sizeof(int), @@ -658,7 +529,6 @@ static struct ctl_table ipv4_table[] = { }, #ifdef CONFIG_NETLABEL { - .ctl_name = NET_CIPSOV4_CACHE_ENABLE, .procname = "cipso_cache_enable", .data = &cipso_v4_cache_enabled, .maxlen = sizeof(int), @@ -666,7 +536,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE, .procname = "cipso_cache_bucket_size", .data = &cipso_v4_cache_bucketsize, .maxlen = sizeof(int), @@ -674,7 +543,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_RBM_OPTFMT, .procname = "cipso_rbm_optfmt", .data = &cipso_v4_rbm_optfmt, .maxlen = sizeof(int), @@ -682,7 +550,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_RBM_STRICTVALID, .procname = "cipso_rbm_strictvalid", .data = &cipso_v4_rbm_strictvalid, .maxlen = sizeof(int), @@ -697,15 +564,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_available_congestion_control, }, { - .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL, .procname = "tcp_allowed_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0644, .proc_handler = proc_allowed_congestion_control, - .strategy = strategy_allowed_congestion_control, }, { - .ctl_name = NET_TCP_MAX_SSTHRESH, .procname = "tcp_max_ssthresh", .data = &sysctl_tcp_max_ssthresh, .maxlen = sizeof(int), @@ -713,41 +577,55 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, + .procname = "tcp_cookie_size", + .data = &sysctl_tcp_cookie_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_dupack", + .data = &sysctl_tcp_thin_dupack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, { - .ctl_name = CTL_UNNUMBERED, .procname = "udp_rmem_min", .data = &sysctl_udp_rmem_min, .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, { - .ctl_name = CTL_UNNUMBERED, .procname = "udp_wmem_min", .data = &sysctl_udp_wmem_min, .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, - { .ctl_name = 0 } + { } }; static struct ctl_table ipv4_net_table[] = { { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, .maxlen = sizeof(int), @@ -755,7 +633,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, .maxlen = sizeof(int), @@ -763,7 +640,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(int), @@ -771,7 +647,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, .maxlen = sizeof(int), @@ -779,16 +654,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_RATELIMIT, .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies }, { - .ctl_name = NET_IPV4_ICMP_RATEMASK, .procname = "icmp_ratemask", .data = &init_net.ipv4.sysctl_icmp_ratemask, .maxlen = sizeof(int), @@ -796,7 +668,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = CTL_UNNUMBERED, .procname = "rt_cache_rebuild_count", .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, .maxlen = sizeof(int), @@ -807,8 +678,8 @@ static struct ctl_table ipv4_net_table[] = { }; struct ctl_path net_ipv4_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); @@ -818,7 +689,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) struct ctl_table *table; table = ipv4_net_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -849,7 +720,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; @@ -872,6 +743,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; + struct ctl_table *i; + + for (i = ipv4_table; i->procname; i++) { + if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { + i->data = sysctl_local_reserved_ports; + break; + } + } + if (!i->procname) + return -EINVAL; hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 91145244ea63..65afeaec15b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -264,6 +264,8 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/crypto.h> +#include <linux/time.h> +#include <linux/slab.h> #include <net/icmp.h> #include <net/tcp.h> @@ -326,6 +328,43 @@ void tcp_enter_memory_pressure(struct sock *sk) EXPORT_SYMBOL(tcp_enter_memory_pressure); +/* Convert seconds to retransmits based on initial and max timeout */ +static u8 secs_to_retrans(int seconds, int timeout, int rto_max) +{ + u8 res = 0; + + if (seconds > 0) { + int period = timeout; + + res = 1; + while (seconds > period && res < 255) { + res++; + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return res; +} + +/* Convert retransmits to seconds based on initial and max timeout */ +static int retrans_to_secs(u8 retrans, int timeout, int rto_max) +{ + int period = 0; + + if (retrans > 0) { + period = timeout; + while (--retrans) { + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return period; +} + /* * Wait for a TCP event. * @@ -339,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -391,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_seq == tp->copied_seq && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) - target--; + target++; /* Potential race condition. If read of tp below will * escape above sk->sk_state, we can be illegally awaken @@ -498,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tp->nonagle &= ~TCP_NAGLE_PUSH; } -static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, - struct sk_buff *skb) +static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; @@ -508,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, static inline void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - if (tcp_send_head(sk)) { - struct sk_buff *skb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, skb); - tcp_mark_urg(tp, flags, skb); + tcp_mark_push(tp, tcp_write_queue_tail(sk)); + + tcp_mark_urg(tp, flags); __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); } @@ -570,6 +608,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ssize_t spliced; int ret; + sock_rps_record_flow(sk); /* * We can't seek on a socket input */ @@ -580,7 +619,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, lock_sock(sk); - timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) @@ -839,12 +878,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk) +static inline int select_size(struct sock *sk, int sg) { struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; - if (sk->sk_route_caps & NETIF_F_SG) { + if (sg) { if (sk_can_gso(sk)) tmp = 0; else { @@ -868,7 +907,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct sk_buff *skb; int iovlen, flags; int mss_now, size_goal; - int err, copied; + int sg, err, copied; long timeo; lock_sock(sk); @@ -896,6 +935,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; + sg = sk->sk_route_caps & NETIF_F_SG; + while (--iovlen >= 0) { int seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; @@ -921,8 +962,9 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, select_size(sk), - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, + select_size(sk, sg), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -959,9 +1001,7 @@ new_segment: /* We can extend the last page * fragment. */ merge = 1; - } else if (i == MAX_SKB_FRAGS || - (!i && - !(sk->sk_route_caps & NETIF_F_SG))) { + } else if (i == MAX_SKB_FRAGS || !sg) { /* Need to add new fragment and cannot * do this because interface is non-SG, * or because all the page slots are @@ -1146,7 +1186,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1214,6 +1256,39 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } +#ifdef CONFIG_NET_DMA +static void tcp_service_net_dma(struct sock *sk, bool wait) +{ + dma_cookie_t done, used; + dma_cookie_t last_issued; + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->ucopy.dma_chan) + return; + + last_issued = tp->ucopy.dma_cookie; + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + do { + if (dma_async_memcpy_complete(tp->ucopy.dma_chan, + last_issued, &done, + &used) == DMA_SUCCESS) { + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + break; + } else { + struct sk_buff *skb; + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + } while (wait); +} +#endif + static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1295,6 +1370,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb, 0); if (!desc->count) break; + tp->copied_seq = seq; } tp->copied_seq = seq; @@ -1393,11 +1469,13 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Now that we have two receive queues this * shouldn't happen. */ - if (before(*seq, TCP_SKB_CB(skb)->seq)) { - printk(KERN_INFO "recvmsg bug: copied %X " - "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + KERN_INFO "recvmsg bug: copied %X " + "seq %X rcvnxt %X fl %X\n", *seq, + TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) break; - } + offset = *seq - TCP_SKB_CB(skb)->seq; if (tcp_hdr(skb)->syn) offset--; @@ -1405,7 +1483,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - WARN_ON(!(flags & MSG_PEEK)); + WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " + "copied %X seq %X rcvnxt %X fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, + tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ @@ -1501,6 +1582,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1509,6 +1594,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sk_wait_data(sk, &timeo); #ifdef CONFIG_NET_DMA + tcp_service_net_dma(sk, false); /* Don't block */ tp->ucopy.wakeup = 0; #endif @@ -1588,6 +1674,9 @@ do_prequeue: copied = -EFAULT; break; } + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if ((offset + used) == skb->len) copied_early = 1; @@ -1657,27 +1746,9 @@ skip_copy: } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - dma_cookie_t done, used; - - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); - - while (dma_async_memcpy_complete(tp->ucopy.dma_chan, - tp->ucopy.dma_cookie, &done, - &used) == DMA_IN_PROGRESS) { - /* do partial cleanup of sk_async_wait_queue */ - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } + tcp_service_net_dma(sk, true); /* Wait for queue to drain */ + tp->ucopy.dma_chan = NULL; - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - tp->ucopy.dma_chan = NULL; - } if (tp->ucopy.pinned_list) { dma_unpin_iovec_pages(tp->ucopy.pinned_list); tp->ucopy.pinned_list = NULL; @@ -1839,7 +1910,7 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_KERNEL); + tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -1998,7 +2069,7 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_async_wait_queue); #endif - inet->dport = 0; + inet->inet_dport = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -2012,9 +2083,10 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; tp->packets_out = 0; - tp->snd_ssthresh = 0x7fffffff; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2022,7 +2094,7 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - WARN_ON(inet->num && !icsk->icsk_bind_hash); + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -2032,22 +2104,23 @@ int tcp_disconnect(struct sock *sk, int flags) * Socket option code for TCP. */ static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) + int optname, char __user *optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; - /* This is a string value all the others are int's */ - if (optname == TCP_CONGESTION) { + /* These are data/string values, all the others are ints */ + switch (optname) { + case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_user(name, optval, - min(TCP_CA_NAME_MAX-1, optlen)); + min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; @@ -2057,6 +2130,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = NULL; + + if (sizeof(ctd) > optlen) + return -EINVAL; + if (copy_from_user(&ctd, optval, sizeof(ctd))) + return -EFAULT; + + if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || + ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) + return -EINVAL; + + if (ctd.tcpct_cookie_desired == 0) { + /* default to global value */ + } else if ((0x1 & ctd.tcpct_cookie_desired) || + ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || + ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { + return -EINVAL; + } + + if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { + /* Supercedes all other values */ + lock_sock(sk); + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + tp->rx_opt.cookie_in_always = 0; /* false */ + tp->rx_opt.cookie_out_never = 1; /* true */ + release_sock(sk); + return err; + } + + /* Allocate ancillary memory before locking. + */ + if (ctd.tcpct_used > 0 || + (tp->cookie_values == NULL && + (sysctl_tcp_cookie_size > 0 || + ctd.tcpct_cookie_desired > 0 || + ctd.tcpct_s_data_desired > 0))) { + cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, + GFP_KERNEL); + if (cvp == NULL) + return -ENOMEM; + } + lock_sock(sk); + tp->rx_opt.cookie_in_always = + (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); + tp->rx_opt.cookie_out_never = 0; /* false */ + + if (tp->cookie_values != NULL) { + if (cvp != NULL) { + /* Changed values are recorded by a changed + * pointer, ensuring the cookie will differ, + * without separately hashing each value later. + */ + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + kref_init(&cvp->kref); + tp->cookie_values = cvp; + } else { + cvp = tp->cookie_values; + } + } + if (cvp != NULL) { + cvp->cookie_desired = ctd.tcpct_cookie_desired; + + if (ctd.tcpct_used > 0) { + memcpy(cvp->s_data_payload, ctd.tcpct_value, + ctd.tcpct_used); + cvp->s_data_desired = ctd.tcpct_used; + cvp->s_data_constant = 1; /* true */ + } else { + /* No constant payload data. */ + cvp->s_data_desired = ctd.tcpct_s_data_desired; + cvp->s_data_constant = 0; /* false */ + } + } + release_sock(sk); + return err; + } + default: + /* fallthru */ + break; + } if (optlen < sizeof(int)) return -EINVAL; @@ -2095,6 +2255,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_THIN_LINEAR_TIMEOUTS: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_lto = val; + break; + + case TCP_THIN_DUPACK: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_dupack = val; + break; + case TCP_CORK: /* When set indicates to always queue non-full frames. * Later the user clears this option and we transmit @@ -2125,7 +2299,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; + u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else @@ -2163,16 +2337,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - icsk->icsk_accept_queue.rskq_defer_accept = 0; - if (val > 0) { - /* Translate value in seconds to number of - * retransmits */ - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && - val > ((TCP_TIMEOUT_INIT / HZ) << - icsk->icsk_accept_queue.rskq_defer_accept)) - icsk->icsk_accept_queue.rskq_defer_accept++; - icsk->icsk_accept_queue.rskq_defer_accept++; - } + /* Translate value in seconds to number of retransmits */ + icsk->icsk_accept_queue.rskq_defer_accept = + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: @@ -2220,7 +2388,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, - int optlen) + unsigned int optlen) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2232,7 +2400,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, #ifdef CONFIG_COMPAT int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level != SOL_TCP) return inet_csk_compat_setsockopt(sk, level, optname, @@ -2336,13 +2504,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: - val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; + val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: - val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; + val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: - val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; + val = keepalive_probes(tp); break; case TCP_SYNCNT: val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; @@ -2353,8 +2521,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); + val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, + TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -2387,6 +2555,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; + + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = tp->cookie_values; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(ctd)) + return -EINVAL; + + memset(&ctd, 0, sizeof(ctd)); + ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? + TCP_COOKIE_IN_ALWAYS : 0) + | (tp->rx_opt.cookie_out_never ? + TCP_COOKIE_OUT_NEVER : 0); + + if (cvp != NULL) { + ctd.tcpct_flags |= (cvp->s_data_in ? + TCP_S_DATA_IN : 0) + | (cvp->s_data_out ? + TCP_S_DATA_OUT : 0); + + ctd.tcpct_cookie_desired = cvp->cookie_desired; + ctd.tcpct_s_data_desired = cvp->s_data_desired; + + memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], + cvp->cookie_pair_size); + ctd.tcpct_used = cvp->cookie_pair_size; + } + + if (put_user(sizeof(ctd), optlen)) + return -EFAULT; + if (copy_to_user(optval, &ctd, sizeof(ctd))) + return -EFAULT; + return 0; + } default: return -ENOPROTOOPT; } @@ -2518,7 +2722,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct tcphdr *th2; unsigned int len; unsigned int thlen; - unsigned int flags; + __be32 flags; unsigned int mss = 1; unsigned int hlen; unsigned int off; @@ -2568,10 +2772,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) found: flush = NAPI_GRO_CB(p)->flush; - flush |= flags & TCP_FLAG_CWR; - flush |= (flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= th->ack_seq ^ th2->ack_seq; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); @@ -2592,8 +2796,9 @@ found: out_check_final: flush = len < mss; - flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | - TCP_FLAG_SYN | TCP_FLAG_FIN); + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = head; @@ -2624,10 +2829,10 @@ EXPORT_SYMBOL(tcp_gro_complete); #ifdef CONFIG_TCP_MD5SIG static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool **tcp_md5sig_pool; +static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) { int cpu; for_each_possible_cpu(cpu) { @@ -2636,7 +2841,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) if (p->md5_desc.tfm) crypto_free_hash(p->md5_desc.tfm); kfree(p); - p = NULL; } } free_percpu(pool); @@ -2644,7 +2848,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) void tcp_free_md5sig_pool(void) { - struct tcp_md5sig_pool **pool = NULL; + struct tcp_md5sig_pool * __percpu *pool = NULL; spin_lock_bh(&tcp_md5sig_pool_lock); if (--tcp_md5sig_users == 0) { @@ -2658,10 +2862,11 @@ void tcp_free_md5sig_pool(void) EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) +static struct tcp_md5sig_pool * __percpu * +__tcp_alloc_md5sig_pool(struct sock *sk) { int cpu; - struct tcp_md5sig_pool **pool; + struct tcp_md5sig_pool * __percpu *pool; pool = alloc_percpu(struct tcp_md5sig_pool *); if (!pool) @@ -2671,7 +2876,7 @@ static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) struct tcp_md5sig_pool *p; struct crypto_hash *hash; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), sk->sk_allocation); if (!p) goto out_free; *per_cpu_ptr(pool, cpu) = p; @@ -2688,9 +2893,9 @@ out_free: return NULL; } -struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) +struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) { - struct tcp_md5sig_pool **pool; + struct tcp_md5sig_pool * __percpu *pool; int alloc = 0; retry: @@ -2709,7 +2914,9 @@ retry: if (alloc) { /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); + struct tcp_md5sig_pool * __percpu *p; + + p = __tcp_alloc_md5sig_pool(sk); spin_lock_bh(&tcp_md5sig_pool_lock); if (!p) { tcp_md5sig_users--; @@ -2731,25 +2938,40 @@ retry: EXPORT_SYMBOL(tcp_alloc_md5sig_pool); -struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) + +/** + * tcp_get_md5sig_pool - get md5sig_pool for this user + * + * We use percpu structure, so if we succeed, we exit with preemption + * and BH disabled, to make sure another thread or softirq handling + * wont try to get same context. + */ +struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { - struct tcp_md5sig_pool **p; - spin_lock_bh(&tcp_md5sig_pool_lock); + struct tcp_md5sig_pool * __percpu *p; + + local_bh_disable(); + + spin_lock(&tcp_md5sig_pool_lock); p = tcp_md5sig_pool; if (p) tcp_md5sig_users++; - spin_unlock_bh(&tcp_md5sig_pool_lock); - return (p ? *per_cpu_ptr(p, cpu) : NULL); -} + spin_unlock(&tcp_md5sig_pool_lock); -EXPORT_SYMBOL(__tcp_get_md5sig_pool); + if (p) + return *per_cpu_ptr(p, smp_processor_id()); -void __tcp_put_md5sig_pool(void) + local_bh_enable(); + return NULL; +} +EXPORT_SYMBOL(tcp_get_md5sig_pool); + +void tcp_put_md5sig_pool(void) { + local_bh_enable(); tcp_free_md5sig_pool(); } - -EXPORT_SYMBOL(__tcp_put_md5sig_pool); +EXPORT_SYMBOL(tcp_put_md5sig_pool); int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, struct tcphdr *th) @@ -2809,6 +3031,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif +/** + * Each Responder maintains up to two secret values concurrently for + * efficient secret rollover. Each secret value has 4 states: + * + * Generating. (tcp_secret_generating != tcp_secret_primary) + * Generates new Responder-Cookies, but not yet used for primary + * verification. This is a short-term state, typically lasting only + * one round trip time (RTT). + * + * Primary. (tcp_secret_generating == tcp_secret_primary) + * Used both for generation and primary verification. + * + * Retiring. (tcp_secret_retiring != tcp_secret_secondary) + * Used for verification, until the first failure that can be + * verified by the newer Generating secret. At that time, this + * cookie's state is changed to Secondary, and the Generating + * cookie's state is changed to Primary. This is a short-term state, + * typically lasting only one round trip time (RTT). + * + * Secondary. (tcp_secret_retiring == tcp_secret_secondary) + * Used for secondary verification, after primary verification + * failures. This state lasts no more than twice the Maximum Segment + * Lifetime (2MSL). Then, the secret is discarded. + */ +struct tcp_cookie_secret { + /* The secret is divided into two parts. The digest part is the + * equivalent of previously hashing a secret and saving the state, + * and serves as an initialization vector (IV). The message part + * serves as the trailing secret. + */ + u32 secrets[COOKIE_WORKSPACE_WORDS]; + unsigned long expires; +}; + +#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) +#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) +#define TCP_SECRET_LIFE (HZ * 600) + +static struct tcp_cookie_secret tcp_secret_one; +static struct tcp_cookie_secret tcp_secret_two; + +/* Essentially a circular list, without dynamic allocation. */ +static struct tcp_cookie_secret *tcp_secret_generating; +static struct tcp_cookie_secret *tcp_secret_primary; +static struct tcp_cookie_secret *tcp_secret_retiring; +static struct tcp_cookie_secret *tcp_secret_secondary; + +static DEFINE_SPINLOCK(tcp_secret_locker); + +/* Select a pseudo-random word in the cookie workspace. + */ +static inline u32 tcp_cookie_work(const u32 *ws, const int n) +{ + return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; +} + +/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. + * Called in softirq context. + * Returns: 0 for success. + */ +int tcp_cookie_generator(u32 *bakery) +{ + unsigned long jiffy = jiffies; + + if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { + spin_lock_bh(&tcp_secret_locker); + if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { + /* refreshed by another */ + memcpy(bakery, + &tcp_secret_generating->secrets[0], + COOKIE_WORKSPACE_WORDS); + } else { + /* still needs refreshing */ + get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); + + /* The first time, paranoia assumes that the + * randomization function isn't as strong. But, + * this secret initialization is delayed until + * the last possible moment (packet arrival). + * Although that time is observable, it is + * unpredictably variable. Mash in the most + * volatile clock bits available, and expire the + * secret extra quickly. + */ + if (unlikely(tcp_secret_primary->expires == + tcp_secret_secondary->expires)) { + struct timespec tv; + + getnstimeofday(&tv); + bakery[COOKIE_DIGEST_WORDS+0] ^= + (u32)tv.tv_nsec; + + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_1MSL + + (0x0f & tcp_cookie_work(bakery, 0)); + } else { + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_LIFE + + (0xff & tcp_cookie_work(bakery, 1)); + tcp_secret_primary->expires = jiffy + + TCP_SECRET_2MSL + + (0x1f & tcp_cookie_work(bakery, 2)); + } + memcpy(&tcp_secret_secondary->secrets[0], + bakery, COOKIE_WORKSPACE_WORDS); + + rcu_assign_pointer(tcp_secret_generating, + tcp_secret_secondary); + rcu_assign_pointer(tcp_secret_retiring, + tcp_secret_primary); + /* + * Neither call_rcu() nor synchronize_rcu() needed. + * Retiring data is not freed. It is replaced after + * further (locked) pointer updates, and a quiet time + * (minimum 1MSL, maximum LIFE - 2MSL). + */ + } + spin_unlock_bh(&tcp_secret_locker); + } else { + rcu_read_lock_bh(); + memcpy(bakery, + &rcu_dereference(tcp_secret_generating)->secrets[0], + COOKIE_WORKSPACE_WORDS); + rcu_read_unlock_bh(); + } + return 0; +} +EXPORT_SYMBOL(tcp_cookie_generator); + void tcp_done(struct sock *sk) { if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) @@ -2843,6 +3194,7 @@ void __init tcp_init(void) struct sk_buff *skb = NULL; unsigned long nr_pages, limit; int order, i, max_share; + unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -2862,14 +3214,13 @@ void __init tcp_init(void) alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, - &tcp_hashinfo.ehash_size, NULL, + &tcp_hashinfo.ehash_mask, thash_entries ? 0 : 512 * 1024); - tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; - for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); } @@ -2878,8 +3229,8 @@ void __init tcp_init(void) tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), - tcp_hashinfo.ehash_size, - (num_physpages >= 128 * 1024) ? + tcp_hashinfo.ehash_mask + 1, + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, &tcp_hashinfo.bhash_size, @@ -2933,10 +3284,19 @@ void __init tcp_init(void) sysctl_tcp_rmem[2] = max(87380, max_share); printk(KERN_INFO "TCP: Hash tables configured " - "(established %d bind %d)\n", - tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); + "(established %u bind %u)\n", + tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); + + memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); + memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); + tcp_secret_one.expires = jiffy; /* past due */ + tcp_secret_two.expires = jiffy; /* past due */ + tcp_secret_generating = &tcp_secret_one; + tcp_secret_primary = &tcp_secret_one; + tcp_secret_retiring = &tcp_secret_two; + tcp_secret_secondary = &tcp_secret_two; } EXPORT_SYMBOL(tcp_close); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index e92beb9e55e0..0ec9bd0ae94f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> +#include <linux/gfp.h> #include <net/tcp.h> int sysctl_tcp_max_ssthresh = 0; @@ -116,7 +117,7 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); #ifdef CONFIG_MODULES - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); @@ -246,7 +247,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) #ifdef CONFIG_MODULES /* not found attempt to autoload module */ - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index fcbcd4ff6c5f..939edb3b8e4d 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else { - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } if (info != NULL) diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 26d5c7fc7de5..7c94a4955416 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) if (icsk->icsk_ca_state == TCP_CA_Open) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; - if (ca->maxRTT < srtt - && srtt <= ca->maxRTT + msecs_to_jiffies(20)) + if (ca->maxRTT < srtt && + srtt <= ca->maxRTT + msecs_to_jiffies(20)) ca->maxRTT = srtt; } } @@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt ca->packetcount += pkts_acked; - if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) - && now - ca->lasttime >= ca->minRTT - && ca->minRTT > 0) { + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && + now - ca->lasttime >= ca->minRTT && + ca->minRTT > 0) { __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); if (htcp_ccount(ca) <= 3) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index c209e054a634..377bc9349371 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * calculate 2^fract in a <<7 value. */ is_slowstart = 1; - increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) - - 128; + increment = ((1 << min(ca->rho, 16U)) * + hybla_fraction(rho_fractions)) - 128; } else { /* * congestion avoidance diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bdb0da237e6..548d575e6cc6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -62,6 +62,7 @@ */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/kernel.h> @@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; +int sysctl_tcp_thin_dupack __read_mostly; + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; @@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); - if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows @@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); - hint = min(hint, TCP_MIN_RCVMSS); + hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; @@ -685,7 +688,7 @@ static inline void tcp_set_rto(struct sock *sk) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -696,8 +699,7 @@ static inline void tcp_set_rto(struct sock *sk) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + tcp_bound_rto(sk); } /* Save metrics learned by this TCP session. @@ -762,7 +764,7 @@ void tcp_update_metrics(struct sock *sk) set_dst_metric_rtt(dst, RTAX_RTTVAR, var); } - if (tp->snd_ssthresh >= 0xFFFF) { + if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && @@ -2301,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ -static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +static inline int tcp_dupack_heuristics(struct tcp_sock *tp) { return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } @@ -2426,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_dupack_heurestics(tp) > tp->reordering) + if (tcp_dupack_heuristics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast @@ -2448,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk) return 1; } + /* If a thin stream is detected, retransmit after first + * received dupack. Employ only if SACK is supported in order + * to avoid possible corner-case series of spurious retransmissions + * Use only if there are no unsent data. + */ + if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && + tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && + tcp_is_sack(tp) && !tcp_send_head(sk)) + return 1; + return 0; } @@ -2500,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) int err; unsigned int mss; + if (packets == 0) + return; + WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; @@ -2624,7 +2639,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) if (sk->sk_family == AF_INET) { printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, - &inet->daddr, ntohs(inet->dport), + &inet->inet_daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); @@ -2634,7 +2649,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct ipv6_pinfo *np = inet6_sk(sk); printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, - &np->daddr, ntohs(inet->dport), + &np->daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); @@ -2718,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk) } } +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. + */ +static int tcp_any_retrans_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return 1; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return 1; + + return 0; +} + /* Undo during fast recovery after partial ACK. */ static int tcp_try_undo_partial(struct sock *sk, int acked) @@ -2730,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. */ - if (tp->retrans_out == 0) + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); @@ -2789,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; - if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2804,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_verify_left_out(tp); - if (!tp->frto_counter && tp->retrans_out == 0) + if (!tp->frto_counter && !tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) @@ -3666,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(sk->sk_dst_cache); + dst_confirm(__sk_dst_get(sk)); return 1; @@ -3699,7 +3743,7 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab) + u8 **hvpp, int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); @@ -3783,6 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif + case TCPOPT_COOKIE: + /* This option is variable length. + */ + switch (opsize) { + case TCPOLEN_COOKIE_BASE: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_PAIR: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_MIN+0: + case TCPOLEN_COOKIE_MIN+2: + case TCPOLEN_COOKIE_MIN+4: + case TCPOLEN_COOKIE_MIN+6: + case TCPOLEN_COOKIE_MAX: + /* 16-bit multiple */ + opt_rx->cookie_plus = opsize; + *hvpp = ptr; + break; + default: + /* ignore option */ + break; + } + break; } ptr += opsize-2; @@ -3811,17 +3879,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) * If it is wrong it falls back on tcp_parse_options(). */ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) + struct tcp_sock *tp, u8 **hvpp) { - if (th->doff == sizeof(struct tcphdr) >> 2) { + /* In the spirit of fast parsing, compare doff directly to constant + * values. Because equality is used, short doff can be ignored here. + */ + if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && - th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) return 1; } - tcp_parse_options(skb, &tp->rx_opt, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); return 1; } @@ -4249,7 +4320,7 @@ static void tcp_ofo_queue(struct sock *sk) } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "ofo packet was already received \n"); + SOCK_DEBUG(sk, "ofo packet was already received\n"); __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; @@ -4297,6 +4368,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; + skb_dst_drop(skb); __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); @@ -4846,11 +4918,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ - && __tcp_select_window(sk) >= tp->rcv_wnd) || + __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ @@ -5071,10 +5143,12 @@ out: static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int syn_inerr) { + u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp, &hash_location) && + tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5362,11 +5436,13 @@ discard: static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { - struct tcp_sock *tp = tcp_sk(sk); + u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); if (th->ack) { /* rfc793: @@ -5463,6 +5539,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * Change state from SYN-SENT only after copied_seq * is initialized. */ tp->copied_seq = tp->rcv_nxt; + + if (cvp != NULL && + cvp->cookie_pair_size > 0 && + tp->rx_opt.cookie_plus > 0) { + int cookie_size = tp->rx_opt.cookie_plus + - TCPOLEN_COOKIE_BASE; + int cookie_pair_size = cookie_size + + cvp->cookie_desired; + + /* A cookie extension option was sent and returned. + * Note that each incoming SYNACK replaces the + * Responder cookie. The initial exchange is most + * fragile, as protection against spoofing relies + * entirely upon the sequence and timestamp (above). + * This replacement strategy allows the correct pair to + * pass through, while any others will be filtered via + * Responder verification later. + */ + if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { + memcpy(&cvp->cookie_pair[cvp->cookie_desired], + hash_location, cookie_size); + cvp->cookie_pair_size = cookie_pair_size; + } + } + smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5700,11 +5801,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. - * Fix it at least with timestamps. + * Force it here. */ - if (tp->rx_opt.saw_tstamp && - tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0); + tcp_ack_update_rtt(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -5736,7 +5835,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tp->snd_una == tp->write_seq) { tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(sk->sk_dst_cache); + dst_confirm(__sk_dst_get(sk)); if (!sock_flag(sk, SOCK_DEAD)) /* Wake up lingering close() */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6d88219c5e22..fe193e53af44 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -60,6 +60,7 @@ #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/icmp.h> @@ -165,10 +166,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = inet->opt->faddr; } - tmp = ip_route_connect(&rt, nexthop, inet->saddr, + tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - inet->sport, usin->sin_port, sk, 1); + inet->inet_sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); @@ -183,11 +184,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->opt || !inet->opt->srr) daddr = rt->rt_dst; - if (!inet->saddr) - inet->saddr = rt->rt_src; - inet->rcv_saddr = inet->saddr; + if (!inet->inet_saddr) + inet->inet_saddr = rt->rt_src; + inet->inet_rcv_saddr = inet->inet_saddr; - if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; @@ -204,20 +205,20 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. */ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { + (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } - inet->dport = usin->sin_port; - inet->daddr = daddr; + inet->inet_dport = usin->sin_port; + inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; - tp->rx_opt.mss_clamp = 536; + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket @@ -230,7 +231,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, - inet->sport, inet->dport, sk); + inet->inet_sport, inet->inet_dport, sk); if (err) goto failure; @@ -239,12 +240,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) - tp->write_seq = secure_tcp_sequence_number(inet->saddr, - inet->daddr, - inet->sport, + tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, usin->sin_port); - inet->id = tp->write_seq ^ jiffies; + inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; @@ -261,7 +262,7 @@ failure: tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; - inet->dport = 0; + inet->inet_dport = 0; return err; } @@ -328,26 +329,29 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) * */ -void tcp_v4_err(struct sk_buff *skb, u32 info) +void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { - struct iphdr *iph = (struct iphdr *)skb->data; - struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + struct iphdr *iph = (struct iphdr *)icmp_skb->data; + struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; + const int type = icmp_hdr(icmp_skb)->type; + const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; + struct sk_buff *skb; __u32 seq; + __u32 remaining; int err; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(icmp_skb->dev); - if (skb->len < (iph->ihl << 2) + 8) { + if (icmp_skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(skb)); + iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -367,6 +371,12 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) if (sk->sk_state == TCP_CLOSE) goto out; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + + icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && @@ -393,6 +403,39 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) } err = icmp_err_convert[code].errno; + /* check if icmp_skb allows revert of backoff + * (see draft-zimmermann-tcp-lcd) */ + if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) + break; + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + break; + + icsk->icsk_backoff--; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << + icsk->icsk_backoff; + tcp_bound_rto(sk); + + skb = tcp_write_queue_head(sk); + BUG_ON(!skb); + + remaining = icsk->icsk_rto - min(icsk->icsk_rto, + tcp_time_stamp - TCP_SKB_CB(skb)->when); + + if (remaining) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else if (sock_owned_by_user(sk)) { + /* RTO revert clocked out retransmission, + * but socket is locked. Will defer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + HZ/20, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now */ + tcp_retransmit_timer(sk); + } + break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; @@ -476,25 +519,31 @@ out: sock_put(sk); } -/* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +static void __tcp_v4_send_check(struct sk_buff *skb, + __be32 saddr, __be32 daddr) { - struct inet_sock *inet = inet_sk(sk); struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(len, inet->saddr, - inet->daddr, 0); + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { - th->check = tcp_v4_check(len, inet->saddr, inet->daddr, + th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum)); } } +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + + __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); +} + int tcp_v4_gso_send_check(struct sk_buff *skb) { const struct iphdr *iph; @@ -507,10 +556,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) th = tcp_hdr(skb); th->check = 0; - th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); return 0; } @@ -704,8 +751,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -715,16 +763,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req); + skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { - struct tcphdr *th = tcp_hdr(skb); - - th->check = tcp_v4_check(skb->len, - ireq->loc_addr, - ireq->rmt_addr, - csum_partial(th, skb->len, - skb->csum)); + __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, @@ -736,9 +778,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) { - return __tcp_v4_send_synack(sk, req, NULL); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -811,7 +855,7 @@ static struct tcp_md5sig_key * struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, struct sock *addr_sk) { - return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr); + return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); } EXPORT_SYMBOL(tcp_v4_md5_lookup); @@ -847,9 +891,9 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, kfree(newkey); return -ENOMEM; } - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool() == NULL) { + if (tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } @@ -886,7 +930,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add); static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, u8 *newkey, u8 newkeylen) { - return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, + return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, newkey, newkeylen); } @@ -970,16 +1014,17 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (!tcp_sk(sk)->md5sig_info) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL); + struct tcp_md5sig_info *p; + p = kzalloc(sizeof(*p), sk->sk_allocation); if (!p) return -EINVAL; tp->md5sig_info = p; - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); if (!newkey) return -ENOMEM; return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, @@ -1051,8 +1096,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, daddr; if (sk) { - saddr = inet_sk(sk)->saddr; - daddr = inet_sk(sk)->daddr; + saddr = inet_sk(sk)->inet_saddr; + daddr = inet_sk(sk)->inet_daddr; } else if (req) { saddr = inet_rsk(req)->loc_addr; daddr = inet_rsk(req)->rmt_addr; @@ -1151,14 +1196,15 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_send_synack, + .rtx_syn_ack = tcp_v4_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, }; @@ -1172,13 +1218,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct inet_request_sock *ireq; + struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; + u8 *hash_location; struct request_sock *req; + struct inet_request_sock *ireq; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1219,16 +1268,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = 536; - tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_release; - tcp_parse_options(skb, &tmp_opt, 0); + /* Secret recipe starts with IP addresses */ + *mess++ ^= (__force u32)daddr; + *mess++ ^= (__force u32)saddr; + + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. */ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_release; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); @@ -1266,7 +1349,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1295,7 +1378,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) + if (tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || + want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); @@ -1342,9 +1427,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->daddr = ireq->rmt_addr; - newinet->rcv_saddr = ireq->loc_addr; - newinet->saddr = ireq->loc_addr; + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); @@ -1352,7 +1437,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; - newinet->id = newtp->write_seq ^ jiffies; + newinet->inet_id = newtp->write_seq ^ jiffies; tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1365,7 +1450,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { + key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); + if (key != NULL) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1374,13 +1460,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); if (newkey != NULL) - tcp_v4_md5_do_add(newsk, newinet->daddr, + tcp_v4_md5_do_add(newsk, newinet->inet_daddr, newkey, key->keylen); - newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif - __inet_hash_nolisten(newsk); + __inet_hash_nolisten(newsk, NULL); __inet_inherit_port(sk, newsk); return newsk; @@ -1469,6 +1555,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + sock_rps_save_rxhash(sk, skb->rxhash); TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; @@ -1493,7 +1580,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } return 0; } - } + } else + sock_rps_save_rxhash(sk, skb->rxhash); + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { @@ -1572,6 +1661,11 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset(skb); @@ -1596,8 +1690,11 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else - sk_add_backlog(sk, skb); + } else if (unlikely(sk_add_backlog(sk, skb))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } bh_unlock_sock(sk); sock_put(sk); @@ -1673,8 +1770,8 @@ int tcp_v4_remember_stamp(struct sock *sk) struct inet_peer *peer = NULL; int release_it = 0; - if (!rt || rt->rt_dst != inet->daddr) { - peer = inet_getpeer(inet->daddr, 1); + if (!rt || rt->rt_dst != inet->inet_daddr) { + peer = inet_getpeer(inet->inet_daddr, 1); release_it = 1; } else { if (!rt->peer) @@ -1684,9 +1781,9 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; } if (release_it) @@ -1705,9 +1802,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); @@ -1717,7 +1814,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct inet_connection_sock_af_ops ipv4_specific = { +const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1737,7 +1834,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_sock_af_ops tcp_sock_ipv4_specific = { +static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_add = tcp_v4_md5_add_func, @@ -1770,9 +1867,9 @@ static int tcp_v4_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_ssthresh = 0x7fffffff; /* Infinity */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1788,6 +1885,19 @@ static int tcp_v4_init_sock(struct sock *sk) tp->af_specific = &tcp_sock_ipv4_specific; #endif + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1841,6 +1951,13 @@ void tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + /* TCP Cookie Transactions */ + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + percpu_counter_dec(&tcp_sockets_allocated); } @@ -1962,7 +2079,7 @@ static void *established_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2023,10 +2140,10 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ - while (++st->bucket < tcp_hashinfo.ehash_size && + while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; - if (st->bucket >= tcp_hashinfo.ehash_size) + if (st->bucket > tcp_hashinfo.ehash_mask) return NULL; spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2187,7 +2304,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", i, ireq->loc_addr, - ntohs(inet_sk(sk)->sport), + ntohs(inet_sk(sk)->inet_sport), ireq->rmt_addr, ntohs(ireq->rmt_port), TCP_SYN_RECV, @@ -2210,10 +2327,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - __be32 dest = inet->daddr; - __be32 src = inet->rcv_saddr; - __u16 destp = ntohs(inet->dport); - __u16 srcp = ntohs(inet->sport); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; @@ -2229,12 +2347,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) timer_expires = jiffies; } + if (sk->sk_state == TCP_LISTEN) + rx_queue = sk->sk_ack_backlog; + else + /* + * because we dont lock socket, we might find a transient negative value + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, - sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : - (tp->rcv_nxt - tp->copied_seq), + rx_queue, timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -2246,7 +2371,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh, + tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, len); } @@ -2316,12 +2441,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = { }, }; -static int tcp4_proc_init_net(struct net *net) +static int __net_init tcp4_proc_init_net(struct net *net) { return tcp_proc_register(net, &tcp4_seq_afinfo); } -static void tcp4_proc_exit_net(struct net *net) +static void __net_exit tcp4_proc_exit_net(struct net *net) { tcp_proc_unregister(net, &tcp4_seq_afinfo); } @@ -2425,12 +2550,17 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); +} + +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { - .init = tcp_sk_init, - .exit = tcp_sk_exit, + .init = tcp_sk_init, + .exit = tcp_sk_exit, + .exit_batch = tcp_sk_exit_batch, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ce3c41ff50b2..de870377fbba 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) goto out; /* we can't calc remote HZ with no different!! */ - if (tp->rx_opt.rcv_tsval == lp->remote_ref_time - || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time || + tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; m = HZ * (tp->rx_opt.rcv_tsval - diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f8d67ccc64f3..794c2e122a41 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,19 +20,14 @@ #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> -#ifdef CONFIG_SYSCTL -#define SYNC_INIT 0 /* let the user enable it */ -#else -#define SYNC_INIT 1 -#endif - -int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; +int sysctl_tcp_syncookies __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_syncookies); int sysctl_tcp_abort_on_overflow __read_mostly; @@ -96,13 +91,14 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { - struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; + u8 *hash_location; + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); int paws_reject = 0; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -322,7 +318,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (key != NULL) { memcpy(&tcptw->tw_md5_key, key->key, key->keylen); tcptw->tw_md5_keylen = key->keylen; - if (tcp_alloc_md5sig_pool() == NULL) + if (tcp_alloc_md5sig_pool(sk) == NULL) BUG(); } } while (0); @@ -363,7 +359,7 @@ void tcp_twsk_destructor(struct sock *sk) #ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_keylen) - tcp_put_md5sig_pool(); + tcp_free_md5sig_pool(); #endif } @@ -389,14 +385,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp; + struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + + /* TCP Cookie Transactions require space for the cookie pair, + * as it differs for each connection. There is no need to + * copy any s_data_payload stored at the original socket. + * Failure will prevent resuming the connection. + * + * Presumed copied, in order of appearance: + * cookie_in_always, cookie_out_never + */ + if (oldcvp != NULL) { + struct tcp_cookie_values *newcvp = + kzalloc(sizeof(*newtp->cookie_values), + GFP_ATOMIC); + + if (newcvp != NULL) { + kref_init(&newcvp->kref); + newcvp->cookie_desired = + oldcvp->cookie_desired; + newtp->cookie_values = newcvp; + } else { + /* Not Yet Implemented */ + newtp->cookie_values = NULL; + } + } /* Now setup tcp_sock */ - newtp = tcp_sk(newsk); newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_up = treq->snt_isn + 1; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); @@ -410,7 +435,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->retrans_out = 0; newtp->sacked_out = 0; newtp->fackets_out = 0; - newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -429,8 +454,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = treq->snt_isn + 1; - newtp->pushed_seq = newtp->write_seq; + newtp->write_seq = newtp->pushed_seq = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); newtp->rx_opt.saw_tstamp = 0; @@ -476,7 +501,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->af_specific->md5_lookup(sk, newsk)) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); @@ -495,15 +520,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct request_sock **prev) { + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; - struct tcp_options_received tmp_opt; - struct sock *child; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -537,7 +563,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->rsk_ops->rtx_syn_ack(sk, req); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -596,7 +622,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Invalid ACK: reset will be sent by listening socket */ if ((flg & TCP_FLAG_ACK) && - (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) + (TCP_SKB_CB(skb)->ack_seq != + tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which @@ -641,10 +668,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ + if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } @@ -657,29 +685,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; -#ifdef CONFIG_TCP_MD5SIG - else { - /* Copy over the MD5 key from the original socket */ - struct tcp_md5sig_key *key; - struct tcp_sock *tp = tcp_sk(sk); - key = tp->af_specific->md5_lookup(sk, child); - if (key != NULL) { - /* - * We're using one, so create a matching key on the - * newsk structure. If we fail to get memory then we - * end up not copying the key across. Shucks. - */ - char *newkey = kmemdup(key->key, key->keylen, - GFP_ATOMIC); - if (newkey) { - if (!tcp_alloc_md5sig_pool()) - BUG(); - tp->af_specific->md5_add(child, child, newkey, - key->keylen); - } - } - } -#endif inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); @@ -725,7 +730,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, * in main socket hash table and lock on listening * socket does not protect us more. */ - sk_add_backlog(child, skb); + __sk_add_backlog(child, skb); } bh_unlock_sock(child); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bd62712848fa..7ed9dc1042d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,7 @@ #include <net/tcp.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <linux/module.h> /* People can turn this off for buggy TCP's found in printers etc. */ @@ -59,6 +60,11 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ +EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); + + +/* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -142,6 +148,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } +/* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { @@ -161,6 +168,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } +/* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); @@ -176,7 +184,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale) + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); @@ -225,7 +234,13 @@ void tcp_select_initial_window(int __space, __u32 mss, init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd * mss) + /* when initializing use the value from init_rcv_wnd + * rather than the default from above + */ + if (init_rcv_wnd && + (*rcv_wnd > init_rcv_wnd * mss)) + *rcv_wnd = init_rcv_wnd * mss; + else if (*rcv_wnd > init_cwnd * mss) *rcv_wnd = init_cwnd * mss; } @@ -276,6 +291,7 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } +/* Packet ECN state for a SYN-ACK */ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; @@ -283,6 +299,7 @@ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } +/* Packet ECN state for a SYN. */ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -301,6 +318,9 @@ TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) th->ece = 1; } +/* Set up ECN state for a packet on a ESTABLISHED socket that is about to + * be sent. + */ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { @@ -330,6 +350,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { + skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; TCP_SKB_CB(skb)->flags = flags; @@ -353,16 +374,49 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SACK_ADVERTISE (1 << 0) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) +#define OPTION_WSCALE (1 << 3) +#define OPTION_COOKIE_EXTENSION (1 << 4) struct tcp_out_options { u8 options; /* bit field of OPTION_* */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 hash_size; /* bytes in hash_location */ u16 mss; /* 0 to disable */ __u32 tsval, tsecr; /* need to include OPTION_TS */ + __u8 *hash_location; /* temporary pointer, overloaded */ }; -/* Beware: Something in the Internet is very sensitive to the ordering of +/* The sysctl int routines are generic, so check consistency here. + */ +static u8 tcp_cookie_size_check(u8 desired) +{ + if (desired > 0) { + /* previously specified */ + return desired; + } + if (sysctl_tcp_cookie_size <= 0) { + /* no default specified */ + return 0; + } + if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + /* value too small, specify minimum */ + return TCP_COOKIE_MIN; + } + if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + /* value too large, specify maximum */ + return TCP_COOKIE_MAX; + } + if (0x1 & sysctl_tcp_cookie_size) { + /* 8-bit multiple, illegal, fix it */ + return (u8)(sysctl_tcp_cookie_size + 0x1); + } + return (u8)sysctl_tcp_cookie_size; +} + +/* Write previously computed TCP options to the packet. + * + * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operatibility perspective it seems that we're somewhat stuck with @@ -374,17 +428,34 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - const struct tcp_out_options *opts, - __u8 **md5_hash) { - if (unlikely(OPTION_MD5 & opts->options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + struct tcp_out_options *opts) +{ + u8 options = opts->options; /* mungable copy */ + + /* Having both authentication and cookies for security is redundant, + * and there's certainly not enough room. Instead, the cookie-less + * extension variant is proposed. + * + * Consider the pessimal case with authentication. The options + * could look like: + * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_MD5 & options)) { + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + *ptr++ = htonl((TCPOPT_COOKIE << 24) | + (TCPOLEN_COOKIE_BASE << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } + options &= ~OPTION_COOKIE_EXTENSION; + /* overload cookie hash location */ + opts->hash_location = (__u8 *)ptr; ptr += 4; - } else { - *md5_hash = NULL; } if (unlikely(opts->mss)) { @@ -393,12 +464,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, opts->mss); } - if (likely(OPTION_TS & opts->options)) { - if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { + if (likely(OPTION_TS & options)) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -409,15 +481,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - if (unlikely(OPTION_SACK_ADVERTISE & opts->options && - !(OPTION_TS & opts->options))) { + /* Specification requires after timestamp, so do it now. + * + * Consider the pessimal case without authentication. The options + * could look like: + * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + __u8 *cookie_copy = opts->hash_location; + u8 cookie_size = opts->hash_size; + + /* 8-bit multiple handled in tcp_cookie_size_check() above, + * and elsewhere. + */ + if (0x2 & cookie_size) { + __u8 *p = (__u8 *)ptr; + + /* 16-bit multiple */ + *p++ = TCPOPT_COOKIE; + *p++ = TCPOLEN_COOKIE_BASE + cookie_size; + *p++ = *cookie_copy++; + *p++ = *cookie_copy++; + ptr++; + cookie_size -= 2; + } else { + /* 32-bit multiple */ + *ptr++ = htonl(((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_COOKIE << 8) | + TCPOLEN_COOKIE_BASE) + + cookie_size); + } + + if (cookie_size > 0) { + memcpy(ptr, cookie_copy, cookie_size); + ptr += (cookie_size / 4); + } + } + + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } - if (unlikely(opts->ws)) { + if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | @@ -445,17 +554,24 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } } +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. + */ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + struct tcp_cookie_values *cvp = tp->cookie_values; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? + tcp_cookie_size_check(cvp->cookie_desired) : + 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; } #else *md5 = NULL; @@ -471,76 +587,154 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = tp->rx_opt.ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale; - if (likely(opts->ws)) - size += TCPOLEN_WSCALE_ALIGNED; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Note that timestamps are required by the specification. + * + * Odd numbers of bytes are prohibited by the specification, ensuring + * that the cookie is 16-bit aligned, and the resulting cookie pair is + * 32-bit aligned. + */ + if (*md5 == NULL && + (OPTION_TS & opts->options) && + cookie_size > 0) { + int need = TCPOLEN_COOKIE_BASE + cookie_size; + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + + if (need > remaining) { + /* try shrinking cookie to fit */ + cookie_size -= 2; + need -= 4; + } + } + while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { + cookie_size -= 4; + need -= 4; + } + if (TCP_COOKIE_MIN <= cookie_size) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; + opts->hash_size = cookie_size; + + /* Remember for future incarnations. */ + cvp->cookie_desired = cookie_size; + + if (cvp->cookie_desired != cvp->cookie_pair_size) { + /* Currently use random bytes as a nonce, + * assuming these are completely unpredictable + * by hostile users of the same system. + */ + get_random_bytes(&cvp->cookie_pair[0], + cookie_size); + cvp->cookie_pair_size = cookie_size; + } + + remaining -= need; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } +/* Set up TCP options for SYN-ACKs. */ static unsigned tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { - unsigned size = 0; + struct tcp_md5sig_key **md5, + struct tcp_extend_values *xvp) +{ struct inet_request_sock *ireq = inet_rsk(req); - char doing_ts; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? + xvp->cookie_plus : + 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + + /* We can't fit any SACK blocks in a packet with MD5 + TS + * options. There was discussion about disabling SACK + * rather than TS in order to fit in better with old, + * buggy kernels, but that was deemed to be unnecessary. + */ + ireq->tstamp_ok &= !ireq->sack_ok; } #else *md5 = NULL; #endif - /* we can't fit any SACK blocks in a packet with MD5 + TS - options. There was discussion about disabling SACK rather than TS in - order to fit in better with old, buggy kernels, but that was deemed - to be unnecessary. */ - doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); - + /* We always send an MSS option. */ opts->mss = mss; - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; - if (likely(opts->ws)) - size += TCPOLEN_WSCALE_ALIGNED; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(doing_ts)) { + if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = req->ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; - if (unlikely(!doing_ts)) - size += TCPOLEN_SACKPERM_ALIGNED; + if (unlikely(!ireq->tstamp_ok)) + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Similar rationale to tcp_syn_options() applies here, too. + * If the <SYN> options fit, the same options should fit now! + */ + if (*md5 == NULL && + ireq->tstamp_ok && + cookie_plus > TCPOLEN_COOKIE_BASE) { + int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + } + if (need <= remaining) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; + remaining -= need; + } else { + /* There's no error return, so flag it. */ + xvp->cookie_out_never = 1; /* true */ + opts->hash_size = 0; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { @@ -601,7 +795,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_out_options opts; unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; struct tcphdr *th; int err; @@ -643,8 +836,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Build TCP header and checksum it. */ th = tcp_hdr(skb); - th->source = inet->sport; - th->dest = inet->dport; + th->source = inet->inet_sport; + th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | @@ -667,25 +860,25 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { - th->urg_ptr = 0xFFFF; + th->urg_ptr = htons(0xFFFF); th->urg = 1; } } - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - tp->af_specific->calc_md5_hash(md5_hash_location, + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, NULL, skb); } #endif - icsk->icsk_af_ops->send_check(sk, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -694,9 +887,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, + tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb); if (likely(err <= 0)) return err; @@ -705,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return net_xmit_eval(err); } -/* This routine just queue's the buffer +/* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. @@ -722,6 +916,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); } +/* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { @@ -909,6 +1104,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) skb->len = skb->data_len; } +/* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -937,7 +1133,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Not accounting for SACKs here. */ +/* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -981,6 +1177,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) return mtu; } +/* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1143,7 +1340,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, return 0; } -/* This must be invoked the first time we consider transmitting +/* Intialize TSO state of a skb. + * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, @@ -1158,6 +1356,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, return tso_segs; } +/* Minshall's variant of the Nagle send check. */ static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && @@ -1242,6 +1441,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return cwnd_quota; } +/* Test if sending is allowed right now. */ int tcp_may_send_now(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1378,6 +1578,10 @@ send_now: } /* Create a new MTU probe if we are ready. + * MTU probe is regularly attempting to increase the path MTU by + * deliberately sending larger packets. This discovers routing + * changes resulting in larger path MTUs. + * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise @@ -1599,11 +1803,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { - struct sk_buff *skb = tcp_send_head(sk); - - if (!skb) - return; - /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. @@ -1790,6 +1989,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) sk_wmem_free_skb(sk, next_skb); } +/* Check if coalescing SKBs is legal. */ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) @@ -1808,6 +2008,9 @@ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) return 1; } +/* Collapse packets in the retransmit queue to make to create + * less packets on the wire. This is only done on retransmission. + */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { @@ -1886,8 +2089,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) - && TCP_SKB_CB(skb)->seq != tp->snd_una) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; if (skb->len > cur_mss) { @@ -1957,6 +2160,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } +/* Check if we forward retransmits are possible in the current + * window/congestion state. + */ static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2002,6 +2208,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) int mib_idx; int fwd_rexmitting = 0; + if (!tp->packets_out) + return; + if (!tp->lost_out) tp->retransmit_high = tp->snd_una; @@ -2101,7 +2310,8 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk->sk_allocation); if (skb) break; yield(); @@ -2145,7 +2355,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } -/* WARNING: This routine must only be called when we have already sent +/* Send a crossed SYN-ACK during socket establishment. + * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. @@ -2180,23 +2391,26 @@ int tcp_send_synack(struct sock *sk) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* - * Prepare a SYN-ACK. - */ +/* Prepare a SYN-ACK. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) + struct request_sock *req, + struct request_values *rvp) { + struct tcp_out_options opts; + struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; - int tcp_header_size; - struct tcp_out_options opts; struct sk_buff *skb; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; + int tcp_header_size; int mss; + int s_data_desired = 0; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) + s_data_desired = cvp->s_data_desired; + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); if (skb == NULL) return NULL; @@ -2219,7 +2433,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; } @@ -2231,8 +2446,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5) + - sizeof(struct tcphdr); + skb, &opts, &md5, xvp) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2249,19 +2464,54 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + + if (OPTION_COOKIE_EXTENSION & opts.options) { + if (s_data_desired) { + u8 *buf = skb_put(skb, s_data_desired); + + /* copy data directly from the listening socket. */ + memcpy(buf, cvp->s_data_payload, s_data_desired); + TCP_SKB_CB(skb)->end_seq += s_data_desired; + } + + if (opts.hash_size > 0) { + __u32 workspace[SHA_WORKSPACE_WORDS]; + u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; + u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; + + /* Secret recipe depends on the Timestamp, (future) + * Sequence and Acknowledgment Numbers, Initiator + * Cookie, and others handled by IP variant caller. + */ + *tail-- ^= opts.tsval; + *tail-- ^= tcp_rsk(req)->rcv_isn + 1; + *tail-- ^= TCP_SKB_CB(skb)->seq + 1; + + /* recommended */ + *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); + *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ + + sha_transform((__u32 *)&xvp->cookie_bakery[0], + (char *)mess, + &workspace[0]); + opts.hash_location = + (__u8 *)&xvp->cookie_bakery[0]; + } + } + th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { - tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, + tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, md5, NULL, req, skb); } #endif @@ -2269,9 +2519,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, return skb; } -/* - * Do all connect socket setups that can be done AF independent. - */ +/* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); @@ -2309,7 +2557,8 @@ static void tcp_connect_init(struct sock *sk) &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; @@ -2330,9 +2579,7 @@ static void tcp_connect_init(struct sock *sk) tcp_clear_retrans(tp); } -/* - * Build a SYN and send it off. - */ +/* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2359,7 +2606,7 @@ int tcp_connect(struct sock *sk) sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. @@ -2493,6 +2740,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } +/* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 59f5b5e7c566..f8efada580e8 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -22,6 +22,7 @@ #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/tcp.h> +#include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/ktime.h> @@ -39,9 +40,9 @@ static int port __read_mostly = 0; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); -static int bufsize __read_mostly = 4096; +static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, int, 0); +module_param(bufsize, uint, 0); static int full __read_mostly; MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); @@ -75,12 +76,12 @@ static struct { static inline int tcp_probe_used(void) { - return (tcp_probe.head - tcp_probe.tail) % bufsize; + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); } static inline int tcp_probe_avail(void) { - return bufsize - tcp_probe_used(); + return bufsize - tcp_probe_used() - 1; } /* @@ -94,8 +95,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct inet_sock *inet = inet_sk(sk); /* Only update if port matches */ - if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) - && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { + if ((port == 0 || ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port) && + (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { spin_lock(&tcp_probe.lock); /* If log fills, just silently drop */ @@ -103,10 +105,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_log *p = tcp_probe.log + tcp_probe.head; p->tstamp = ktime_get(); - p->saddr = inet->saddr; - p->sport = inet->sport; - p->daddr = inet->daddr; - p->dport = inet->dport; + p->saddr = inet->inet_saddr; + p->sport = inet->inet_sport; + p->daddr = inet->inet_daddr; + p->dport = inet->inet_dport; p->length = skb->len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; @@ -115,7 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->ssthresh = tcp_current_ssthresh(sk); p->srtt = tp->srtt >> 3; - tcp_probe.head = (tcp_probe.head + 1) % bufsize; + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } tcp_probe.lastcwnd = tp->snd_cwnd; spin_unlock(&tcp_probe.lock); @@ -148,7 +150,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file) static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail % bufsize; + = tcp_probe.log + tcp_probe.tail; struct timespec tv = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); @@ -191,7 +193,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, width = tcpprobe_sprint(tbuf, sizeof(tbuf)); if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) % bufsize; + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); spin_unlock_bh(&tcp_probe.lock); @@ -221,9 +223,10 @@ static __init int tcpprobe_init(void) init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); - if (bufsize < 0) + if (bufsize == 0) return -EINVAL; + bufsize = roundup_pow_of_two(bufsize); tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); if (!tcp_probe.log) goto err0; @@ -235,7 +238,7 @@ static __init int tcpprobe_init(void) if (ret) goto err1; - pr_info("TCP probe registered (port=%d)\n", port); + pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); return 0; err1: proc_net_remove(&init_net, procname); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b144a26359bc..440a5c6004f6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -19,6 +19,7 @@ */ #include <linux/module.h> +#include <linux/gfp.h> #include <net/tcp.h> int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; @@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; +int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); @@ -132,22 +134,52 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } } +/* This function calculates a "timeout" which is equivalent to the timeout of a + * TCP connection after "boundary" unsuccessful, exponentially backed-off + * retransmissions with an initial RTO of TCP_RTO_MIN. + */ +static bool retransmits_timed_out(struct sock *sk, + unsigned int boundary) +{ + unsigned int timeout, linear_backoff_thresh; + unsigned int start_ts; + + if (!inet_csk(sk)->icsk_retransmits) + return false; + + if (unlikely(!tcp_sk(sk)->retrans_stamp)) + start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; + else + start_ts = tcp_sk(sk)->retrans_stamp; + + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + else + timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + + return (tcp_time_stamp - start_ts) >= timeout; +} + /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; + bool do_reset; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) - dst_negative_advice(&sk->sk_dst_cache); + dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(&sk->sk_dst_cache); + dst_negative_advice(sk); } retry_until = sysctl_tcp_retries2; @@ -155,13 +187,15 @@ static int tcp_write_timeout(struct sock *sk) const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); + do_reset = alive || + !retransmits_timed_out(sk, retry_until); - if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) + if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (icsk->icsk_retransmits >= retry_until) { + if (retransmits_timed_out(sk, retry_until)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -279,7 +313,7 @@ static void tcp_probe_timer(struct sock *sk) * The TCP retransmit timer. */ -static void tcp_retransmit_timer(struct sock *sk) +void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -300,15 +334,15 @@ static void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &inet->daddr, ntohs(inet->dport), - inet->num, tp->snd_una, tp->snd_nxt); + &inet->inet_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &np->daddr, ntohs(inet->dport), - inet->num, tp->snd_una, tp->snd_nxt); + &np->daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); } #endif #endif @@ -383,9 +417,27 @@ static void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; out_reset_timer: - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (sk->sk_state == TCP_ESTABLISHED && + (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(tp) && + icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + icsk->icsk_backoff = 0; + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + } else { + /* Use normal (exponential) backoff */ + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_tcp_retries1) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk); out:; @@ -442,6 +494,12 @@ static void tcp_synack_timer(struct sock *sk) TCP_TIMEOUT_INIT, TCP_RTO_MAX); } +void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +{ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); +} +EXPORT_SYMBOL(tcp_syn_ack_timeout); + void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) @@ -459,7 +517,7 @@ static void tcp_keepalive_timer (unsigned long data) struct sock *sk = (struct sock *) data; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __u32 elapsed; + u32 elapsed; /* Only process if socket is not in use. */ bh_lock_sock(sk); @@ -496,11 +554,10 @@ static void tcp_keepalive_timer (unsigned long data) if (tp->packets_out || tcp_send_head(sk)) goto resched; - elapsed = tcp_time_stamp - tp->rcv_tstamp; + elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || - (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { + if (icsk->icsk_probes_out >= keepalive_probes(tp)) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index e9bbff746488..b612acf76183 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * every other rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (veno->inc - && tp->snd_cwnd < - tp->snd_cwnd_clamp) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; veno->inc = 0; } else diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 66b6821b984e..a0f240358892 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { - if (queue > TCP_YEAH_ALPHA - && tp->snd_cwnd > yeah->reno_count) { + if (queue > TCP_YEAH_ALPHA && + tp->snd_cwnd > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , tp->snd_cwnd >> TCP_YEAH_EPSILON); diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index cb1f0e83830b..3b3813cc80b9 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> @@ -132,7 +133,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) } #endif -static struct net_protocol tunnel4_protocol = { +static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, @@ -140,7 +141,7 @@ static struct net_protocol tunnel4_protocol = { }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct net_protocol tunnel64_protocol = { +static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 80e3812837ad..eec4ff456e33 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -95,6 +95,7 @@ #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> @@ -106,42 +107,45 @@ #include <net/xfrm.h> #include "udp_impl.h" -struct udp_table udp_table; +struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); int sysctl_udp_mem[3] __read_mostly; -int sysctl_udp_rmem_min __read_mostly; -int sysctl_udp_wmem_min __read_mostly; - EXPORT_SYMBOL(sysctl_udp_mem); + +int sysctl_udp_rmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_rmem_min); + +int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); -#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) +#define MAX_UDP_PORTS 65536 +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2), + unsigned int log) { struct sock *sk2; struct hlist_nulls_node *node; sk_nulls_for_each(sk2, node, &hslot->head) - if (net_eq(sock_net(sk2), net) && - sk2 != sk && - (bitmap || sk2->sk_hash == num) && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (bitmap || udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2)) { if (bitmap) - __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); else return 1; @@ -149,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, return 0; } +/* + * Note: we still hold spinlock of primary hash chain, so no other writer + * can insert/delete a socket with local_port == num + */ +static int udp_lib_lport_inuse2(struct net *net, __u16 num, + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + int res = 0; + + spin_lock(&hslot2->lock); + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { + res = 1; + break; + } + spin_unlock(&hslot2->lock); + return res; +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @saddr_comp: AF-dependent comparison of bound local IP addresses + * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, + * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2 ) ) + const struct sock *sk2), + unsigned int hash2_nulladdr) { - struct udp_hslot *hslot; + struct udp_hslot *hslot, *hslot2; struct udp_table *udptable = sk->sk_prot->h.udp_table; int error = 1; struct net *net = sock_net(sk); @@ -179,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ - rand = (rand | 1) * UDP_HTABLE_SIZE; - for (last = first + UDP_HTABLE_SIZE; first != last; first++) { - hslot = &udptable->hash[udp_hashfn(net, first)]; + rand = (rand | 1) * (udptable->mask + 1); + last = first + udptable->mask + 1; + do { + hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, - saddr_comp); + saddr_comp, udptable->log); snum = first; /* @@ -195,25 +233,60 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, */ do { if (low <= snum && snum <= high && - !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) + !test_bit(snum >> udptable->log, bitmap) && + !inet_is_reserved_local_port(snum)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); - } + } while (++first != last); goto fail; } else { - hslot = &udptable->hash[udp_hashfn(net, snum)]; + hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) + if (hslot->count > 10) { + int exist; + unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; + + slot2 &= udptable->mask; + hash2_nulladdr &= udptable->mask; + + hslot2 = udp_hashslot2(udptable, slot2); + if (hslot->count < hslot2->count) + goto scan_primary_hash; + + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + if (!exist && (hash2_nulladdr != slot2)) { + hslot2 = udp_hashslot2(udptable, hash2_nulladdr); + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + } + if (exist) + goto fail_unlock; + else + goto found; + } +scan_primary_hash: + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, + saddr_comp, 0)) goto fail_unlock; } found: - inet_sk(sk)->num = snum; - sk->sk_hash = snum; + inet_sk(sk)->inet_num = snum; + udp_sk(sk)->udp_port_hash = snum; + udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { sk_nulls_add_node_rcu(sk, &hslot->head); + hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + spin_lock(&hslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + hslot2->count++; + spin_unlock(&hslot2->lock); } error = 0; fail_unlock: @@ -221,19 +294,33 @@ fail_unlock: fail: return error; } +EXPORT_SYMBOL(udp_lib_get_port); static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return ( !ipv6_only_sock(sk2) && - (!inet1->rcv_saddr || !inet2->rcv_saddr || - inet1->rcv_saddr == inet2->rcv_saddr )); + return (!ipv6_only_sock(sk2) && + (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || + inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); +} + +static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, + unsigned int port) +{ + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } int udp_v4_get_port(struct sock *sk, unsigned short snum) { - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); + unsigned int hash2_nulladdr = + udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + unsigned int hash2_partial = + udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, @@ -242,23 +329,23 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && !ipv6_only_sock(sk)) { struct inet_sock *inet = inet_sk(sk); score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) return -1; score += 2; } - if (inet->daddr) { - if (inet->daddr != saddr) + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; score += 2; } - if (inet->dport) { - if (inet->dport != sport) + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; score += 2; } @@ -271,6 +358,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, return score; } +/* + * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) + */ +#define SCORE2_MAX (1 + 2 + 2 + 2) +static inline int compute_score2(struct sock *sk, struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->inet_rcv_saddr != daddr) + return -1; + if (inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 2; + } + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp4_lib_lookup2(struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -281,11 +451,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, struct sock *sk, *result; struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); - unsigned int hash = udp_hashfn(net, hnum); - struct udp_hslot *hslot = &udptable->hash[hash]; + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness; rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + htonl(INADDR_ANY), hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } begin: result = NULL; badness = -1; @@ -302,7 +496,7 @@ begin: * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ - if (get_nulls_value(node) != hash) + if (get_nulls_value(node) != slot) goto begin; if (result) { @@ -352,12 +546,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); - if (!net_eq(sock_net(s), net) || - s->sk_hash != hnum || - (inet->daddr && inet->daddr != rmt_addr) || - (inet->dport != rmt_port && inet->dport) || - (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || - ipv6_only_sock(s) || + if (!net_eq(sock_net(s), net) || + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && + inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) @@ -383,8 +578,8 @@ found: void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; - struct iphdr *iph = (struct iphdr*)skb->data; - struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); + struct iphdr *iph = (struct iphdr *)skb->data; + struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -438,9 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; - } else { - ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); - } + } else + ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); + sk->sk_err = err; sk->sk_error_report(sk); out: @@ -474,7 +669,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * (checksum field must be zeroed out) */ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len ) + __be32 src, __be32 dst, int len) { unsigned int offset; struct udphdr *uh = udp_hdr(skb); @@ -545,7 +740,7 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len); + udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); goto send; } else /* `normal' UDP */ @@ -553,18 +748,24 @@ static int udp_push_pending_frames(struct sock *sk) /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, - sk->sk_protocol, csum ); + sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_push_pending_frames(sk); + if (err) { + if (err == -ENOBUFS && !inet->recverr) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); out: up->len = 0; up->pending = 0; - if (!err) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -592,7 +793,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Check the flags. */ - if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; ipc.opt = NULL; @@ -619,7 +820,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -634,14 +835,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - daddr = inet->daddr; - dport = inet->dport; + daddr = inet->inet_daddr; + dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } - ipc.addr = inet->saddr; + ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; err = sock_tx_timestamp(msg, sk, &ipc.shtx); @@ -684,10 +885,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (connected) - rt = (struct rtable*)sk_dst_check(sk, 0); + rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { struct flowi fl = { .oif = ipc.oif, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = faddr, .saddr = saddr, @@ -695,7 +897,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet->sport, + { .sport = inet->inet_sport, .dport = dport } } }; struct net *net = sock_net(sk); @@ -739,7 +941,7 @@ back_from_confirm: inet->cork.fl.fl4_dst = daddr; inet->cork.fl.fl_ip_dport = dport; inet->cork.fl.fl4_src = saddr; - inet->cork.fl.fl_ip_sport = inet->sport; + inet->cork.fl.fl_ip_sport = inet->inet_sport; up->pending = AF_INET; do_append_data: @@ -782,6 +984,7 @@ do_confirm: err = 0; goto out; } +EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) @@ -831,6 +1034,44 @@ out: return ret; } + +/** + * first_packet_length - return length of first packet in receive queue + * @sk: socket + * + * Drops all bad checksum frames, until a valid one is found. + * Returns the length of found skb, or 0 if none is found. + */ +static unsigned int first_packet_length(struct sock *sk) +{ + struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue; + struct sk_buff *skb; + unsigned int res; + + __skb_queue_head_init(&list_kill); + + spin_lock_bh(&rcvq->lock); + while ((skb = skb_peek(rcvq)) != NULL && + udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + __skb_unlink(skb, rcvq); + __skb_queue_tail(&list_kill, skb); + } + res = skb ? skb->len : 0; + spin_unlock_bh(&rcvq->lock); + + if (!skb_queue_empty(&list_kill)) { + bool slow = lock_sock_fast(sk); + + __skb_queue_purge(&list_kill); + sk_mem_reclaim_partial(sk); + unlock_sock_fast(sk, slow); + } + return res; +} + /* * IOCTL requests applicable to the UDP protocol */ @@ -847,21 +1088,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) case SIOCINQ: { - struct sk_buff *skb; - unsigned long amount; + unsigned int amount = first_packet_length(sk); - amount = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { + if (amount) /* * We will only return the amount * of this packet since that is all * that will be read. */ - amount = skb->len - sizeof(struct udphdr); - } - spin_unlock_bh(&sk->sk_receive_queue.lock); + amount -= sizeof(struct udphdr); + return put_user(amount, (int __user *)arg); } @@ -871,6 +1107,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return 0; } +EXPORT_SYMBOL(udp_ioctl); /* * This should be easy, if there is something there we @@ -883,16 +1120,17 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - unsigned int ulen, copied; + unsigned int ulen; int peeked; int err; int is_udplite = IS_UDPLITE(sk); + bool slow; /* * Check any passed addresses */ if (addr_len) - *addr_len=sizeof(*sin); + *addr_len = sizeof(*sin); if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); @@ -904,10 +1142,9 @@ try_again: goto out; ulen = skb->len - sizeof(struct udphdr); - copied = len; - if (copied > ulen) - copied = ulen; - else if (copied < ulen) + if (len > ulen) + len = ulen; + else if (len < ulen) msg->msg_flags |= MSG_TRUNC; /* @@ -916,16 +1153,18 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { if (udp_lib_checksum_complete(skb)) goto csum_copy_err; } if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, len); else { - err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + err = skb_copy_and_csum_datagram_iovec(skb, + sizeof(struct udphdr), + msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; @@ -938,11 +1177,10 @@ try_again: UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); /* Copy the address. */ - if (sin) - { + if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; @@ -951,22 +1189,20 @@ try_again: if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); - err = copied; + err = len; if (flags & MSG_TRUNC) err = ulen; out_free: - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + skb_free_datagram_locked(sk, skb); out: return err; csum_copy_err: - lock_sock(sk); + slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - release_sock(sk); + unlock_sock_fast(sk, slow); if (noblock) return -EAGAIN; @@ -982,31 +1218,42 @@ int udp_disconnect(struct sock *sk, int flags) */ sk->sk_state = TCP_CLOSE; - inet->daddr = 0; - inet->dport = 0; + inet->inet_daddr = 0; + inet->inet_dport = 0; + sock_rps_save_rxhash(sk, 0); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); - inet->sport = 0; + inet->inet_sport = 0; } sk_dst_reset(sk); return 0; } +EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = sk->sk_prot->h.udp_table; - unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); - struct udp_hslot *hslot = &udptable->hash[hash]; + struct udp_hslot *hslot, *hslot2; + + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (sk_nulls_del_node_init_rcu(sk)) { - inet_sk(sk)->num = 0; + hslot->count--; + inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); } spin_unlock_bh(&hslot->lock); } @@ -1015,25 +1262,26 @@ EXPORT_SYMBOL(udp_lib_unhash); static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int is_udplite = IS_UDPLITE(sk); int rc; - if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { + if (inet_sk(sk)->inet_daddr) + sock_rps_save_rxhash(sk, skb->rxhash); + + rc = ip_queue_rcv_skb(sk, skb); + if (rc < 0) { + int is_udplite = IS_UDPLITE(sk); + /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) { + if (rc == -ENOMEM) UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); - atomic_inc(&sk->sk_drops); - } - goto drop; + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; } return 0; -drop: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb(skb); - return -1; } /* returns: @@ -1044,7 +1292,7 @@ drop: * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; @@ -1127,66 +1375,107 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) goto drop; } + + if (sk_rcvqueues_full(sk, skb)) + goto drop; + rc = 0; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); + else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + goto drop; + } bh_unlock_sock(sk); return rc; drop: UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; } + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sk_buff *skb1 = NULL; + struct sock *sk; + + for (i = 0; i < count; i++) { + sk = stack[i]; + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + if (!skb1) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + } + + if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; + } + if (unlikely(skb1)) + kfree_skb(skb1); +} + /* * Multicasts and broadcasts go to each listener. * - * Note: called only from the BH handler context, - * so we don't need to lock the hashes. + * Note: called only from the BH handler context. */ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct udp_table *udptable) { - struct sock *sk; - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; + struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); int dif; + unsigned int i, count = 0; spin_lock(&hslot->lock); sk = sk_nulls_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + while (sk) { + stack[count++] = sk; + sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, + daddr, uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing chain lock, we must take a reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); - do { - struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, - dif); - if (sknext) - skb1 = skb_clone(skb, GFP_ATOMIC); - - if (skb1) { - int ret = udp_queue_rcv_skb(sk, skb1); - if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ - kfree_skb(skb1); - } - sk = sknext; - } while (sknext); - } else - consume_skb(skb); spin_unlock(&hslot->lock); + + /* + * do the slow work with no lock held + */ + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } return 0; } @@ -1214,7 +1503,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -1250,6 +1539,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, uh = udp_hdr(skb); ulen = ntohs(uh->len); + saddr = ip_hdr(skb)->saddr; + daddr = ip_hdr(skb)->daddr; + if (ulen > skb->len) goto short_packet; @@ -1263,9 +1555,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - saddr = ip_hdr(skb)->saddr; - daddr = ip_hdr(skb)->daddr; - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable); @@ -1338,16 +1627,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { - lock_sock(sk); + bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); - release_sock(sk); + unlock_sock_fast(sk, slow); } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen, + char __user *optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); @@ -1355,7 +1644,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); - if (optlen<sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -1399,8 +1688,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; - else if (val > USHORT_MAX) - val = USHORT_MAX; + else if (val > USHRT_MAX) + val = USHRT_MAX; up->pcslen = val; up->pcflag |= UDPLITE_SEND_CC; break; @@ -1413,8 +1702,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; - else if (val > USHORT_MAX) - val = USHORT_MAX; + else if (val > USHRT_MAX) + val = USHRT_MAX; up->pcrlen = val; up->pcflag |= UDPLITE_RECV_CC; break; @@ -1426,9 +1715,10 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, optval, optlen, @@ -1438,7 +1728,7 @@ int udp_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_COMPAT int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, optval, optlen, @@ -1453,7 +1743,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, len; - if (get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -1486,10 +1776,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } +EXPORT_SYMBOL(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -1525,33 +1816,16 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) { unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - int is_lite = IS_UDPLITE(sk); /* Check for false positives due to checksum errors */ - if ( (mask & POLLRDNORM) && - !(file->f_flags & O_NONBLOCK) && - !(sk->sk_shutdown & RCV_SHUTDOWN)){ - struct sk_buff_head *rcvq = &sk->sk_receive_queue; - struct sk_buff *skb; - - spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL && - udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_INERRORS, is_lite); - __skb_unlink(skb, rcvq); - kfree_skb(skb); - } - spin_unlock_bh(&rcvq->lock); - - /* nothing to see, move along */ - if (skb == NULL) - mask &= ~(POLLIN | POLLRDNORM); - } + if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) + mask &= ~(POLLIN | POLLRDNORM); return mask; } +EXPORT_SYMBOL(udp_poll); struct proto udp_prot = { .name = "UDP", @@ -1582,6 +1856,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif }; +EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS @@ -1592,9 +1867,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + for (state->bucket = start; state->bucket <= state->udp_table->mask; + ++state->bucket) { struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; + + if (hlist_nulls_empty(&hslot->head)) + continue; + spin_lock_bh(&hslot->lock); sk_nulls_for_each(sk, node, &hslot->head) { if (!net_eq(sock_net(sk), net)) @@ -1619,7 +1899,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { - if (state->bucket < UDP_HTABLE_SIZE) + if (state->bucket <= state->udp_table->mask) spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } @@ -1639,7 +1919,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) static void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; - state->bucket = UDP_HTABLE_SIZE; + state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1661,7 +1941,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; - if (state->bucket < UDP_HTABLE_SIZE) + if (state->bucket <= state->udp_table->mask) spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); } @@ -1703,23 +1983,25 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) rc = -ENOMEM; return rc; } +EXPORT_SYMBOL(udp_proc_register); void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { proc_net_remove(net, afinfo->name); } +EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); - __be32 dest = inet->daddr; - __be32 src = inet->rcv_saddr; - __u16 destp = ntohs(inet->dport); - __u16 srcp = ntohs(inet->sport); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); - seq_printf(f, "%4d: %08X:%04X %08X:%04X" + seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), @@ -1741,7 +2023,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) int len; udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len ,""); + seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } @@ -1759,12 +2041,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = { }, }; -static int udp4_proc_init_net(struct net *net) +static int __net_init udp4_proc_init_net(struct net *net) { return udp_proc_register(net, &udp4_seq_afinfo); } -static void udp4_proc_exit_net(struct net *net) +static void __net_exit udp4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udp4_seq_afinfo); } @@ -1785,21 +2067,60 @@ void udp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -void __init udp_table_init(struct udp_table *table) +static __initdata unsigned long uhash_entries; +static int __init set_uhash_entries(char *str) { - int i; + if (!str) + return 0; + uhash_entries = simple_strtoul(str, &str, 0); + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) + uhash_entries = UDP_HTABLE_SIZE_MIN; + return 1; +} +__setup("uhash_entries=", set_uhash_entries); - for (i = 0; i < UDP_HTABLE_SIZE; i++) { +void __init udp_table_init(struct udp_table *table, const char *name) +{ + unsigned int i; + + if (!CONFIG_BASE_SMALL) + table->hash = alloc_large_system_hash(name, + 2 * sizeof(struct udp_hslot), + uhash_entries, + 21, /* one slot per 2 MB */ + 0, + &table->log, + &table->mask, + 64 * 1024); + /* + * Make sure hash table has the minimum size + */ + if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { + table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * + 2 * sizeof(struct udp_hslot), GFP_KERNEL); + if (!table->hash) + panic(name); + table->log = ilog2(UDP_HTABLE_SIZE_MIN); + table->mask = UDP_HTABLE_SIZE_MIN - 1; + } + table->hash2 = table->hash + (table->mask + 1); + for (i = 0; i <= table->mask; i++) { INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } + for (i = 0; i <= table->mask; i++) { + INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + table->hash2[i].count = 0; + spin_lock_init(&table->hash2[i].lock); + } } void __init udp_init(void) { unsigned long nr_pages, limit; - udp_table_init(&udp_table); + udp_table_init(&udp_table, "UDP"); /* Set the pressure threshold up by the same strategy of TCP. It is a * fraction of global memory that is up to 1/2 at 256 MB, decreasing * toward zero with the amount of memory, with a floor of 128 pages. @@ -1816,16 +2137,64 @@ void __init udp_init(void) sysctl_udp_wmem_min = SK_MEM_QUANTUM; } -EXPORT_SYMBOL(udp_disconnect); -EXPORT_SYMBOL(udp_ioctl); -EXPORT_SYMBOL(udp_prot); -EXPORT_SYMBOL(udp_sendmsg); -EXPORT_SYMBOL(udp_lib_getsockopt); -EXPORT_SYMBOL(udp_lib_setsockopt); -EXPORT_SYMBOL(udp_poll); -EXPORT_SYMBOL(udp_lib_get_port); +int udp4_ufo_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + int offset; + __wsum csum; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb->csum_start - skb_headroom(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: + return segs; +} -#ifdef CONFIG_PROC_FS -EXPORT_SYMBOL(udp_proc_register); -EXPORT_SYMBOL(udp_proc_unregister); -#endif diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 9f4a6165f722..aaad650d47d9 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -11,13 +11,13 @@ extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); extern int udp_v4_get_port(struct sock *sk, unsigned short snum); extern int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen); + char __user *optval, unsigned int optlen); extern int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #ifdef CONFIG_COMPAT extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen); + char __user *optval, unsigned int optlen); extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #endif diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index c784891cb7e5..6610bf76369f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -12,7 +12,7 @@ */ #include "udp_impl.h" -struct udp_table udplite_table; +struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); static int udplite_rcv(struct sk_buff *skb) @@ -25,7 +25,7 @@ static void udplite_err(struct sk_buff *skb, u32 info) __udp4_lib_err(skb, info, &udplite_table); } -static struct net_protocol udplite_protocol = { +static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, @@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; @@ -82,12 +81,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = { }, }; -static int udplite4_proc_init_net(struct net *net) +static int __net_init udplite4_proc_init_net(struct net *net) { return udp_proc_register(net, &udplite4_seq_afinfo); } -static void udplite4_proc_exit_net(struct net *net) +static void __net_exit udplite4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udplite4_seq_afinfo); } @@ -110,7 +109,7 @@ static inline int udplite4_proc_init(void) void __init udplite4_register(void) { - udp_table_init(&udplite_table); + udp_table_init(&udplite_table, "UDP-Lite"); if (proto_register(&udplite_prot, 1)) goto out_register_err; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f9f922a0ba88..ad8fbb871aa0 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -9,6 +9,7 @@ * */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> @@ -26,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); - if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, - skb->dev)) + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) goto drop; } return dst_input(skb); @@ -60,7 +61,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 3444f3b34eca..6f368413eb0e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -4,6 +4,7 @@ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index c908bd99bcba..571aa96a175c 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -86,7 +86,7 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { - return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, skb_dst(skb)->dev, xfrm4_output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0071ee6f441f..23883a48ebfb 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,6 @@ #include <net/xfrm.h> #include <net/ip.h> -static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, @@ -60,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net, return 0; } -static struct dst_entry * -__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) -{ - struct dst_entry *dst; - - read_lock_bh(&policy->lock); - for (dst = policy->bundles; dst; dst = dst->next) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ - xdst->u.rt.fl.fl4_dst == fl->fl4_dst && - xdst->u.rt.fl.fl4_src == fl->fl4_src && - xdst->u.rt.fl.fl4_tos == fl->fl4_tos && - xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { - dst_clone(dst); - break; - } - } - read_unlock_bh(&policy->lock); - return dst; -} - static int xfrm4_get_tos(struct flowi *fl) { return fl->fl4_tos; @@ -92,11 +70,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, return 0; } -static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; - xdst->u.rt.fl = rt->fl; + xdst->u.rt.fl = *fl; xdst->u.dst.dev = dev; dev_hold(dev); @@ -129,6 +108,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) u8 *xprth = skb_network_header(skb) + iph->ihl * 4; memset(fl, 0, sizeof(struct flowi)); + fl->mark = skb->mark; + if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { switch (iph->protocol) { case IPPROTO_UDP: @@ -190,8 +171,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) static inline int xfrm4_garbage_collect(struct dst_ops *ops) { - xfrm4_policy_afinfo.garbage_collect(&init_net); - return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); + struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); + + xfrm4_policy_afinfo.garbage_collect(net); + return (atomic_read(&ops->entries) > ops->gc_thresh * 2); } static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -257,13 +240,27 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, - .find_bundle = __xfrm4_find_bundle, .decode_session = _decode_session4, .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm4_policy_table[] = { + { + .procname = "xfrm4_gc_thresh", + .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; +#endif + static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); @@ -271,12 +268,32 @@ static void __init xfrm4_policy_init(void) static void __exit xfrm4_policy_fini(void) { +#ifdef CONFIG_SYSCTL + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); +#endif xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); } -void __init xfrm4_init(void) +void __init xfrm4_init(int rt_max_size) { + /* + * Select a default value for the gc_thresh based on the main route + * table hash size. It seems to me the worst case scenario is when + * we have ipsec operating in transport mode, in which we create a + * dst_entry per socket. The xfrm gc algorithm starts trying to remove + * entries at gc_thresh, and prevents new allocations as 2*gc_thresh + * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. + * That will let us store an ipsec connection per route table entry, + * and start cleaning when were 1/2 full + */ + xfrm4_dst_ops.gc_thresh = rt_max_size/2; + xfrm4_state_init(); xfrm4_policy_init(); +#ifdef CONFIG_SYSCTL + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, + xfrm4_policy_table); +#endif } |