summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/nhc.c2
-rw-r--r--net/8021q/vlan.c18
-rw-r--r--net/8021q/vlan_dev.c22
-rw-r--r--net/8021q/vlan_netlink.c3
-rw-r--r--net/appletalk/ddp.c7
-rw-r--r--net/atm/clip.c4
-rw-r--r--net/atm/ioctl.c16
-rw-r--r--net/atm/lec.c6
-rw-r--r--net/atm/pvc.c1
-rw-r--r--net/atm/svc.c1
-rw-r--r--net/ax25/af_ax25.c13
-rw-r--r--net/bluetooth/af_bluetooth.c8
-rw-r--r--net/bluetooth/hci_conn.c8
-rw-r--r--net/bluetooth/hci_core.c2
-rw-r--r--net/bluetooth/hci_event.c2
-rw-r--r--net/bluetooth/hidp/sock.c1
-rw-r--r--net/bluetooth/l2cap_core.c9
-rw-r--r--net/bluetooth/l2cap_sock.c8
-rw-r--r--net/bluetooth/mgmt.c11
-rw-r--r--net/bluetooth/rfcomm/sock.c1
-rw-r--r--net/bluetooth/sco.c5
-rw-r--r--net/bpf/Makefile2
-rw-r--r--net/bpf/test_run.c149
-rw-r--r--net/bridge/br.c14
-rw-r--r--net/bridge/br_arp_nd_proxy.c18
-rw-r--r--net/bridge/br_input.c100
-rw-r--r--net/bridge/br_multicast.c4
-rw-r--r--net/bridge/br_netlink.c2
-rw-r--r--net/bridge/br_private.h29
-rw-r--r--net/bridge/br_vlan.c214
-rw-r--r--net/bridge/netfilter/ebtable_broute.c63
-rw-r--r--net/bridge/netfilter/ebtables.c10
-rw-r--r--net/caif/caif_dev.c12
-rw-r--r--net/can/af_can.c6
-rw-r--r--net/can/bcm.c1
-rw-r--r--net/can/raw.c1
-rw-r--r--net/compat.c57
-rw-r--r--net/core/datagram.c19
-rw-r--r--net/core/dev.c16
-rw-r--r--net/core/devlink.c22
-rw-r--r--net/core/failover.c6
-rw-r--r--net/core/filter.c99
-rw-r--r--net/core/gen_stats.c2
-rw-r--r--net/core/lwtunnel.c9
-rw-r--r--net/core/neighbour.c5
-rw-r--r--net/core/net-sysfs.c14
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/netprio_cgroup.c2
-rw-r--r--net/core/ptp_classifier.c7
-rw-r--r--net/core/rtnetlink.c2
-rw-r--r--net/core/skbuff.c81
-rw-r--r--net/core/sock.c55
-rw-r--r--net/dccp/ipv4.c1
-rw-r--r--net/dccp/ipv6.c1
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/dns_resolver/dns_query.c2
-rw-r--r--net/hsr/Makefile2
-rw-r--r--net/hsr/hsr_debugfs.c (renamed from net/hsr/hsr_prp_debugfs.c)43
-rw-r--r--net/hsr/hsr_device.c4
-rw-r--r--net/hsr/hsr_forward.c7
-rw-r--r--net/hsr/hsr_main.h9
-rw-r--r--net/ieee802154/socket.c6
-rw-r--r--net/ipv4/af_inet.c9
-rw-r--r--net/ipv4/bpfilter/sockopt.c3
-rw-r--r--net/ipv4/fib_frontend.c69
-rw-r--r--net/ipv4/fib_semantics.c446
-rw-r--r--net/ipv4/fou.c12
-rw-r--r--net/ipv4/inet_connection_sock.c4
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/ip_output.c13
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/Kconfig20
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c89
-rw-r--r--net/ipv4/route.c142
-rw-r--r--net/ipv4/sysctl_net_ipv4.c5
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv4/tcp_dctcp.c45
-rw-r--r--net/ipv4/tcp_input.c10
-rw-r--r--net/ipv4/udp.c19
-rw-r--r--net/ipv4/xfrm4_policy.c7
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/addrconf_core.c38
-rw-r--r--net/ipv6/addrlabel.c2
-rw-r--r--net/ipv6/af_inet6.c21
-rw-r--r--net/ipv6/fib6_rules.c53
-rw-r--r--net/ipv6/icmp.c31
-rw-r--r--net/ipv6/ip6_fib.c34
-rw-r--r--net/ipv6/ip6_gre.c20
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ndisc.c17
-rw-r--r--net/ipv6/netfilter/Kconfig19
-rw-r--r--net/ipv6/netfilter/Makefile2
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c81
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c91
-rw-r--r--net/ipv6/raw.c1
-rw-r--r--net/ipv6/route.c653
-rw-r--r--net/ipv6/udp.c12
-rw-r--r--net/l2tp/l2tp_ip.c1
-rw-r--r--net/l2tp/l2tp_ip6.c1
-rw-r--r--net/l2tp/l2tp_ppp.c3
-rw-r--r--net/llc/af_llc.c3
-rw-r--r--net/mac80211/driver-ops.h3
-rw-r--r--net/mac80211/key.c9
-rw-r--r--net/mac80211/mesh_pathtbl.c2
-rw-r--r--net/mac80211/rx.c10
-rw-r--r--net/mac80211/trace_msg.h7
-rw-r--r--net/mac80211/tx.c53
-rw-r--r--net/mpls/af_mpls.c2
-rw-r--r--net/mpls/mpls_iptunnel.c12
-rw-r--r--net/ncsi/ncsi-rsp.c6
-rw-r--r--net/netfilter/Kconfig19
-rw-r--r--net/netfilter/Makefile4
-rw-r--r--net/netfilter/core.c1
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c35
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c84
-rw-r--r--net/netfilter/nf_conntrack_core.c43
-rw-r--r--net/netfilter/nf_conntrack_expect.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c50
-rw-r--r--net/netfilter/nf_conntrack_proto.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c93
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c52
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_flow_table_ip.c10
-rw-r--r--net/netfilter/nf_internals.h3
-rw-r--r--net/netfilter/nf_nat_core.c27
-rw-r--r--net/netfilter/nf_nat_masquerade.c104
-rw-r--r--net/netfilter/nf_nat_proto.c59
-rw-r--r--net/netfilter/nf_queue.c1
-rw-r--r--net/netfilter/nf_tables_api.c18
-rw-r--r--net/netfilter/nfnetlink_log.c2
-rw-r--r--net/netfilter/nfnetlink_osf.c14
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netfilter/nft_chain_nat.c36
-rw-r--r--net/netfilter/nft_chain_route.c169
-rw-r--r--net/netfilter/nft_masq.c83
-rw-r--r--net/netfilter/nft_nat.c58
-rw-r--r--net/netfilter/nft_osf.c30
-rw-r--r--net/netfilter/nft_redir.c63
-rw-r--r--net/netfilter/x_tables.c3
-rw-r--r--net/netfilter/xt_MASQUERADE.c (renamed from net/ipv4/netfilter/ipt_MASQUERADE.c)84
-rw-r--r--net/netfilter/xt_time.c23
-rw-r--r--net/netlink/af_netlink.c3
-rw-r--r--net/netrom/af_netrom.c90
-rw-r--r--net/netrom/nr_loopback.c2
-rw-r--r--net/netrom/nr_route.c2
-rw-r--r--net/netrom/sysctl_net_netrom.c5
-rw-r--r--net/nfc/nci/hci.c8
-rw-r--r--net/openvswitch/conntrack.c18
-rw-r--r--net/packet/af_packet.c7
-rw-r--r--net/qrtr/qrtr.c4
-rw-r--r--net/rds/af_rds.c3
-rw-r--r--net/rds/bind.c2
-rw-r--r--net/rds/ib_fmr.c11
-rw-r--r--net/rds/ib_rdma.c3
-rw-r--r--net/rose/af_rose.c7
-rw-r--r--net/rose/rose_loopback.c27
-rw-r--r--net/rxrpc/af_rxrpc.c17
-rw-r--r--net/rxrpc/ar-internal.h1
-rw-r--r--net/rxrpc/conn_event.c11
-rw-r--r--net/rxrpc/input.c30
-rw-r--r--net/rxrpc/local_object.c3
-rw-r--r--net/rxrpc/peer_event.c5
-rw-r--r--net/rxrpc/sendmsg.c21
-rw-r--r--net/sched/cls_flower.c100
-rw-r--r--net/sched/sch_api.c15
-rw-r--r--net/sched/sch_cbs.c98
-rw-r--r--net/sched/sch_generic.c67
-rw-r--r--net/sched/sch_taprio.c160
-rw-r--r--net/sctp/ipv6.c1
-rw-r--r--net/sctp/protocol.c1
-rw-r--r--net/sctp/sm_statefuns.c6
-rw-r--r--net/sctp/socket.c13
-rw-r--r--net/sctp/stream_interleave.c60
-rw-r--r--net/sctp/ulpevent.c19
-rw-r--r--net/sctp/ulpqueue.c53
-rw-r--r--net/smc/af_smc.c413
-rw-r--r--net/smc/smc.h11
-rw-r--r--net/smc/smc_clc.c10
-rw-r--r--net/smc/smc_clc.h20
-rw-r--r--net/smc/smc_close.c25
-rw-r--r--net/smc/smc_close.h1
-rw-r--r--net/smc/smc_core.c93
-rw-r--r--net/smc/smc_core.h25
-rw-r--r--net/smc/smc_ism.c5
-rw-r--r--net/smc/smc_pnet.c50
-rw-r--r--net/smc/smc_pnet.h7
-rw-r--r--net/socket.c65
-rw-r--r--net/strparser/strparser.c28
-rw-r--r--net/sunrpc/cache.c3
-rw-r--r--net/sunrpc/clnt.c52
-rw-r--r--net/sunrpc/xprtrdma/verbs.c2
-rw-r--r--net/tipc/link.c2
-rw-r--r--net/tipc/name_table.c3
-rw-r--r--net/tipc/node.c5
-rw-r--r--net/tipc/socket.c3
-rw-r--r--net/tipc/sysctl.c8
-rw-r--r--net/tipc/udp_media.c10
-rw-r--r--net/tls/tls_device.c16
-rw-r--r--net/tls/tls_device_fallback.c13
-rw-r--r--net/tls/tls_main.c29
-rw-r--r--net/tls/tls_sw.c18
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/wireless/nl80211.c18
-rw-r--r--net/wireless/reg.c39
-rw-r--r--net/wireless/scan.c3
-rw-r--r--net/wireless/util.c6
-rw-r--r--net/x25/af_x25.c27
-rw-r--r--net/xdp/xsk_queue.h56
211 files changed, 4027 insertions, 2451 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 4fa2fdda174d..9e56fb98f33c 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -18,7 +18,7 @@
#include "nhc.h"
static struct rb_root rb_root = RB_ROOT;
-static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX];
+static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1];
static DEFINE_SPINLOCK(lowpan_nhc_lock);
static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index dc4411165e43..1f99678751df 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -75,6 +75,14 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
return 0;
}
+static void vlan_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev,
+ struct vlan_dev_priv *vlan)
+{
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_stacked_transfer_operstate(rootdev, dev);
+}
+
void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -180,7 +188,7 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
/* Account for reference in struct vlan_dev_priv */
dev_hold(real_dev);
- netif_stacked_transfer_operstate(real_dev, dev);
+ vlan_stacked_transfer_operstate(real_dev, dev, vlan);
linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
/* So, got the sucker initialized, now lets place
@@ -399,7 +407,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_CHANGE:
/* Propagate real device state to vlan devices */
vlan_group_for_each_dev(grp, i, vlandev)
- netif_stacked_transfer_operstate(dev, vlandev);
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
break;
case NETDEV_CHANGEADDR:
@@ -446,7 +455,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
dev_close_many(&close_list, false);
list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
- netif_stacked_transfer_operstate(dev, vlandev);
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
list_del_init(&vlandev->close_list);
}
list_del(&close_list);
@@ -463,7 +473,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
dev_change_flags(vlandev, flgs | IFF_UP,
extack);
- netif_stacked_transfer_operstate(dev, vlandev);
+ vlan_stacked_transfer_operstate(dev, vlandev, vlan);
}
break;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 8d77b6ee4477..f044ae56a313 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -223,7 +223,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
u32 old_flags = vlan->flags;
if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING))
return -EINVAL;
vlan->flags = (old_flags & ~mask) | (flags & mask);
@@ -296,7 +297,8 @@ static int vlan_dev_open(struct net_device *dev)
if (vlan->flags & VLAN_FLAG_MVRP)
vlan_mvrp_request_join(dev);
- if (netif_carrier_ok(real_dev))
+ if (netif_carrier_ok(real_dev) &&
+ !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
netif_carrier_on(dev);
return 0;
@@ -326,7 +328,8 @@ static int vlan_dev_stop(struct net_device *dev)
if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
dev_uc_del(real_dev, dev->dev_addr);
- netif_carrier_off(dev);
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_carrier_off(dev);
return 0;
}
@@ -550,7 +553,8 @@ static const struct net_device_ops vlan_netdev_ops;
static int vlan_dev_init(struct net_device *dev)
{
- struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
netif_carrier_off(dev);
@@ -561,6 +565,9 @@ static int vlan_dev_init(struct net_device *dev)
(1<<__LINK_STATE_DORMANT))) |
(1<<__LINK_STATE_PRESENT);
+ if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING)
+ dev->state |= (1 << __LINK_STATE_NOCARRIER);
+
dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
NETIF_F_GSO_ENCAP_ALL |
@@ -591,8 +598,7 @@ static int vlan_dev_init(struct net_device *dev)
#endif
dev->needed_headroom = real_dev->needed_headroom;
- if (vlan_hw_offload_capable(real_dev->features,
- vlan_dev_priv(dev)->vlan_proto)) {
+ if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) {
dev->header_ops = &vlan_passthru_header_ops;
dev->hard_header_len = real_dev->hard_header_len;
} else {
@@ -606,8 +612,8 @@ static int vlan_dev_init(struct net_device *dev)
vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
- vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
- if (!vlan_dev_priv(dev)->vlan_pcpu_stats)
+ vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
+ if (!vlan->vlan_pcpu_stats)
return -ENOMEM;
return 0;
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 9b60c1e399e2..a624dccf68fd 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -84,7 +84,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
flags = nla_data(data[IFLA_VLAN_FLAGS]);
if ((flags->flags & flags->mask) &
~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING)) {
NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
return -EINVAL;
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 709d2542f729..e2511027d19b 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1806,12 +1806,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
rc = put_user(amount, (int __user *)argp);
break;
}
- case SIOCGSTAMP:
- rc = sock_get_timestamp(sk, argp);
- break;
- case SIOCGSTAMPNS:
- rc = sock_get_timestampns(sk, argp);
- break;
/* Routing */
case SIOCADDRT:
case SIOCDELRT:
@@ -1871,6 +1865,7 @@ static const struct proto_ops atalk_dgram_ops = {
.getname = atalk_getname,
.poll = datagram_poll,
.ioctl = atalk_ioctl,
+ .gettstamp = sock_gettstamp,
#ifdef CONFIG_COMPAT
.compat_ioctl = atalk_compat_ioctl,
#endif
diff --git a/net/atm/clip.c b/net/atm/clip.c
index d795b9c5aea4..b9e67e589a7b 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -345,8 +345,8 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
rt = (struct rtable *) dst;
- if (rt->rt_gateway)
- daddr = &rt->rt_gateway;
+ if (rt->rt_gw_family == AF_INET)
+ daddr = &rt->rt_gw4;
else
daddr = &ip_hdr(skb)->daddr;
n = dst_neigh_lookup(dst, daddr);
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 2ff0e5e470e3..d955b683aa7c 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -81,22 +81,6 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
(int __user *)argp) ? -EFAULT : 0;
goto done;
}
- case SIOCGSTAMP: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
- if (compat)
- error = compat_sock_get_timestamp(sk, argp);
- else
-#endif
- error = sock_get_timestamp(sk, argp);
- goto done;
- case SIOCGSTAMPNS: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
- if (compat)
- error = compat_sock_get_timestampns(sk, argp);
- else
-#endif
- error = sock_get_timestampns(sk, argp);
- goto done;
case ATM_SETSC:
net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n",
current->comm, task_pid_nr(current));
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d7f5cf5b7594..ad4f829193f0 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -710,7 +710,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
{
- if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg])
+ if (arg < 0 || arg >= MAX_LEC_ITF)
+ return -EINVAL;
+ arg = array_index_nospec(arg, MAX_LEC_ITF);
+ if (!dev_lec[arg])
return -EINVAL;
vcc->proto_data = dev_lec[arg];
return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc);
@@ -728,6 +731,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
i = arg;
if (arg >= MAX_LEC_ITF)
return -EINVAL;
+ i = array_index_nospec(arg, MAX_LEC_ITF);
if (!dev_lec[i]) {
int size;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2cb10af16afc..02bd2a436bdf 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -118,6 +118,7 @@ static const struct proto_ops pvc_proto_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = vcc_compat_ioctl,
#endif
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = pvc_shutdown,
.setsockopt = pvc_setsockopt,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 2f91b766ac42..908cbb8654f5 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -641,6 +641,7 @@ static const struct proto_ops svc_proto_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = svc_compat_ioctl,
#endif
+ .gettstamp = sock_gettstamp,
.listen = svc_listen,
.shutdown = svc_shutdown,
.setsockopt = svc_setsockopt,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 5d01edf8d819..012c0b6fc4f6 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1714,14 +1714,6 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
- case SIOCGSTAMP:
- res = sock_get_timestamp(sk, argp);
- break;
-
- case SIOCGSTAMPNS:
- res = sock_get_timestampns(sk, argp);
- break;
-
case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */
case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */
case SIOCAX25GETUID: {
@@ -1888,8 +1880,8 @@ static int ax25_info_show(struct seq_file *seq, void *v)
* magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
*/
- seq_printf(seq, "%8.8lx %s %s%s ",
- (long) ax25,
+ seq_printf(seq, "%p %s %s%s ",
+ ax25,
ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
ax2asc(buf, &ax25->source_addr),
ax25->iamdigi? "*":"");
@@ -1950,6 +1942,7 @@ static const struct proto_ops ax25_proto_ops = {
.getname = ax25_getname,
.poll = datagram_poll,
.ioctl = ax25_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = ax25_listen,
.shutdown = ax25_shutdown,
.setsockopt = ax25_setsockopt,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 8d12198eaa94..94ddf19998c7 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -521,14 +521,6 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = put_user(amount, (int __user *) arg);
break;
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *) arg);
- break;
-
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *) arg);
- break;
-
default:
err = -ENOIOCTLCMD;
break;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index bd4978ce8c45..3cf0764d5793 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1276,6 +1276,14 @@ int hci_conn_check_link_mode(struct hci_conn *conn)
!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
return 0;
+ /* The minimum encryption key size needs to be enforced by the
+ * host stack before establishing any L2CAP connections. The
+ * specification in theory allows a minimum of 1, but to align
+ * BR/EDR and LE transports, a minimum of 7 is chosen.
+ */
+ if (conn->enc_key_size < HCI_MIN_ENC_KEY_SIZE)
+ return 0;
+
return 1;
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index d6b2540ba7f8..3d9175f130b3 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1460,8 +1460,6 @@ static int hci_dev_do_open(struct hci_dev *hdev)
hdev->set_bdaddr)
ret = hdev->set_bdaddr(hdev,
&hdev->public_addr);
- else
- ret = -EADDRNOTAVAIL;
}
setup_failed:
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 609fd6871c5a..66b631ab0d35 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5433,7 +5433,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
ev->data, ev->length);
}
- ptr += sizeof(*ev) + ev->length + 1;
+ ptr += sizeof(*ev) + ev->length;
}
hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 9f85a1943be9..2151913892ce 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -75,6 +75,7 @@ static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user
sockfd_put(csock);
return err;
}
+ ca.name[sizeof(ca.name)-1] = 0;
err = hidp_connection_add(&ca, csock, isock);
if (!err && copy_to_user(argp, &ca, sizeof(ca)))
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index f17e393b43b4..b53acd6c9a3d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -510,12 +510,12 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan)
}
EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults);
-static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
+static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
{
chan->sdu = NULL;
chan->sdu_last_frag = NULL;
chan->sdu_len = 0;
- chan->tx_credits = 0;
+ chan->tx_credits = tx_credits;
/* Derive MPS from connection MTU to stop HCI fragmentation */
chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
/* Give enough credits for a full packet */
@@ -1281,7 +1281,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
return;
- l2cap_le_flowctl_init(chan);
+ l2cap_le_flowctl_init(chan, 0);
req.psm = chan->psm;
req.scid = cpu_to_le16(chan->scid);
@@ -5532,11 +5532,10 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
chan->dcid = scid;
chan->omtu = mtu;
chan->remote_mps = mps;
- chan->tx_credits = __le16_to_cpu(req->credits);
__l2cap_chan_add(conn, chan);
- l2cap_le_flowctl_init(chan);
+ l2cap_le_flowctl_init(chan, __le16_to_cpu(req->credits));
dcid = chan->scid;
credits = chan->rx_credits;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a3a2cd55e23a..a7be8b59b3c2 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -791,10 +791,13 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
conn = chan->conn;
- /*change security for LE channels */
+ /* change security for LE channels */
if (chan->scid == L2CAP_CID_ATT) {
- if (smp_conn_security(conn->hcon, sec.level))
+ if (smp_conn_security(conn->hcon, sec.level)) {
+ err = -EINVAL;
break;
+ }
+
set_bit(FLAG_PENDING_SECURITY, &chan->flags);
sk->sk_state = BT_CONFIG;
chan->state = BT_CONFIG;
@@ -1655,6 +1658,7 @@ static const struct proto_ops l2cap_sock_ops = {
.recvmsg = l2cap_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
.shutdown = l2cap_sock_shutdown,
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 2457f408d17d..150114e33b20 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -2301,8 +2301,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + key_count *
- sizeof(struct mgmt_link_key_info);
+ expected_len = struct_size(cp, keys, key_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5030,7 +5029,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info);
+ expected_len = struct_size(cp, irks, irk_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5112,8 +5111,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + key_count *
- sizeof(struct mgmt_ltk_info);
+ expected_len = struct_size(cp, keys, key_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5847,8 +5845,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + param_count *
- sizeof(struct mgmt_conn_param);
+ expected_len = struct_size(cp, params, param_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes",
expected_len, len);
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index b1f49fcc0478..90bb53aa4bee 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1039,6 +1039,7 @@ static const struct proto_ops rfcomm_sock_ops = {
.setsockopt = rfcomm_sock_setsockopt,
.getsockopt = rfcomm_sock_getsockopt,
.ioctl = rfcomm_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.poll = bt_sock_poll,
.socketpair = sock_no_socketpair,
.mmap = sock_no_mmap
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9a580999ca57..b91d6b440fdf 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -523,12 +523,12 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr,
struct sock *sk = sock->sk;
int err = 0;
- BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
-
if (!addr || addr_len < sizeof(struct sockaddr_sco) ||
addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -1190,6 +1190,7 @@ static const struct proto_ops sco_sock_ops = {
.recvmsg = sco_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
.shutdown = sco_sock_shutdown,
diff --git a/net/bpf/Makefile b/net/bpf/Makefile
index 27b2992a0692..b0ca361742e4 100644
--- a/net/bpf/Makefile
+++ b/net/bpf/Makefile
@@ -1 +1 @@
-obj-y := test_run.o
+obj-$(CONFIG_BPF_SYSCALL) := test_run.o
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index fab142b796ef..2221573dacdb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -123,12 +123,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
return data;
}
+static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
+{
+ void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in);
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ u32 size = kattr->test.ctx_size_in;
+ void *data;
+ int err;
+
+ if (!data_in && !data_out)
+ return NULL;
+
+ data = kzalloc(max_size, GFP_USER);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (data_in) {
+ err = bpf_check_uarg_tail_zero(data_in, max_size, size);
+ if (err) {
+ kfree(data);
+ return ERR_PTR(err);
+ }
+
+ size = min_t(u32, max_size, size);
+ if (copy_from_user(data, data_in, size)) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ return data;
+}
+
+static int bpf_ctx_finish(const union bpf_attr *kattr,
+ union bpf_attr __user *uattr, const void *data,
+ u32 size)
+{
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ int err = -EFAULT;
+ u32 copy_size = size;
+
+ if (!data || !data_out)
+ return 0;
+
+ if (copy_size > kattr->test.ctx_size_out) {
+ copy_size = kattr->test.ctx_size_out;
+ err = -ENOSPC;
+ }
+
+ if (copy_to_user(data_out, data, copy_size))
+ goto out;
+ if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size)))
+ goto out;
+ if (err != -ENOSPC)
+ err = 0;
+out:
+ return err;
+}
+
+/**
+ * range_is_zero - test whether buffer is initialized
+ * @buf: buffer to check
+ * @from: check from this position
+ * @to: check up until (excluding) this position
+ *
+ * This function returns true if the there is a non-zero byte
+ * in the buf in the range [from,to).
+ */
+static inline bool range_is_zero(void *buf, size_t from, size_t to)
+{
+ return !memchr_inv((u8 *)buf + from, 0, to - from);
+}
+
+static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return 0;
+
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority)))
+ return -EINVAL;
+
+ /* priority is allowed */
+
+ if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) +
+ FIELD_SIZEOF(struct __sk_buff, priority),
+ offsetof(struct __sk_buff, cb)))
+ return -EINVAL;
+
+ /* cb is allowed */
+
+ if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) +
+ FIELD_SIZEOF(struct __sk_buff, cb),
+ sizeof(struct __sk_buff)))
+ return -EINVAL;
+
+ skb->priority = __skb->priority;
+ memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN);
+
+ return 0;
+}
+
+static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return;
+
+ __skb->priority = skb->priority;
+ memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN);
+}
+
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
bool is_l2 = false, is_direct_pkt_access = false;
u32 size = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
+ struct __sk_buff *ctx = NULL;
u32 retval, duration;
int hh_len = ETH_HLEN;
struct sk_buff *skb;
@@ -141,6 +255,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
if (IS_ERR(data))
return PTR_ERR(data);
+ ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff));
+ if (IS_ERR(ctx)) {
+ kfree(data);
+ return PTR_ERR(ctx);
+ }
+
switch (prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
@@ -158,6 +278,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
sk = kzalloc(sizeof(struct sock), GFP_USER);
if (!sk) {
kfree(data);
+ kfree(ctx);
return -ENOMEM;
}
sock_net_set(sk, current->nsproxy->net_ns);
@@ -166,6 +287,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
skb = build_skb(data, 0);
if (!skb) {
kfree(data);
+ kfree(ctx);
kfree(sk);
return -ENOMEM;
}
@@ -180,32 +302,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
__skb_push(skb, hh_len);
if (is_direct_pkt_access)
bpf_compute_data_pointers(skb);
+ ret = convert___skb_to_skb(skb, ctx);
+ if (ret)
+ goto out;
ret = bpf_test_run(prog, skb, repeat, &retval, &duration);
- if (ret) {
- kfree_skb(skb);
- kfree(sk);
- return ret;
- }
+ if (ret)
+ goto out;
if (!is_l2) {
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
- kfree_skb(skb);
- kfree(sk);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
}
memset(__skb_push(skb, hh_len), 0, hh_len);
}
+ convert_skb_to___skb(skb, ctx);
size = skb->len;
/* bpf program can never convert linear skb to non-linear */
if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
size = skb_headlen(skb);
ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, ctx,
+ sizeof(struct __sk_buff));
+out:
kfree_skb(skb);
kfree(sk);
+ kfree(ctx);
return ret;
}
@@ -220,6 +347,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
void *data;
int ret;
+ if (kattr->test.ctx_in || kattr->test.ctx_out)
+ return -EINVAL;
+
data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0);
if (IS_ERR(data))
return PTR_ERR(data);
@@ -263,6 +393,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
return -EINVAL;
+ if (kattr->test.ctx_in || kattr->test.ctx_out)
+ return -EINVAL;
+
data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
if (IS_ERR(data))
diff --git a/net/bridge/br.c b/net/bridge/br.c
index a5174e5001d8..3c8e4b38f054 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -40,10 +40,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
bool changed_addr;
int err;
- /* register of bridge completed, add sysfs entries */
- if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) {
- br_sysfs_addbr(dev);
- return NOTIFY_DONE;
+ if (dev->priv_flags & IFF_EBRIDGE) {
+ if (event == NETDEV_REGISTER) {
+ /* register of bridge completed, add sysfs entries */
+ br_sysfs_addbr(dev);
+ return NOTIFY_DONE;
+ }
+ br_vlan_bridge_event(dev, event, ptr);
}
/* not a port of a bridge */
@@ -126,6 +129,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
}
+ if (event != NETDEV_UNREGISTER)
+ br_vlan_port_event(p, event);
+
/* Events that may cause spanning tree to refresh */
if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
event == NETDEV_CHANGE || event == NETDEV_DOWN))
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 724b474ade54..15116752365a 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -131,7 +131,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
u8 *arpptr, *sha;
__be32 sip, tip;
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
if ((dev->flags & IFF_NOARP) ||
!pskb_may_pull(skb, arp_hdr_len(dev)))
@@ -161,7 +161,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
return;
if (ipv4_is_zeronet(sip) || sip == tip) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
}
@@ -181,7 +181,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
/* its our local ip, so don't proxy reply
* and don't forward to neigh suppress ports
*/
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -217,7 +217,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
*/
if (replied ||
br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
}
neigh_release(n);
@@ -393,7 +393,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
struct ipv6hdr *iphdr;
struct neighbour *n;
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
if (p && (p->flags & BR_NEIGH_SUPPRESS))
return;
@@ -401,7 +401,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
!msg->icmph.icmp6_solicited) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -414,7 +414,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -432,7 +432,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
/* its our own ip, so don't proxy reply
* and don't forward to arp suppress ports
*/
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -465,7 +465,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
*/
if (replied ||
br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
}
neigh_release(n);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 5ea7e56119c1..014af7efef25 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -16,6 +16,9 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/netfilter_bridge.h>
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+#include <net/netfilter/nf_queue.h>
+#endif
#include <linux/neighbour.h>
#include <net/arp.h>
#include <linux/export.h>
@@ -23,10 +26,6 @@
#include "br_private.h"
#include "br_private_tunnel.h"
-/* Hook for brouter */
-br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
-EXPORT_SYMBOL(br_should_route_hook);
-
static int
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -197,13 +196,63 @@ static void __br_handle_local_finish(struct sk_buff *skb)
/* note: already called with rcu_read_lock */
static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_bridge_port *p = br_port_get_rcu(skb->dev);
-
__br_handle_local_finish(skb);
- BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
- br_pass_frame_up(skb);
- return 0;
+ /* return 1 to signal the okfn() was called so it's ok to use the skb */
+ return 1;
+}
+
+static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
+{
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ struct nf_hook_entries *e = NULL;
+ struct nf_hook_state state;
+ unsigned int verdict, i;
+ struct net *net;
+ int ret;
+
+ net = dev_net(skb->dev);
+#ifdef HAVE_JUMP_LABEL
+ if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING]))
+ goto frame_finish;
+#endif
+
+ e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]);
+ if (!e)
+ goto frame_finish;
+
+ nf_hook_state_init(&state, NF_BR_PRE_ROUTING,
+ NFPROTO_BRIDGE, skb->dev, NULL, NULL,
+ net, br_handle_frame_finish);
+
+ for (i = 0; i < e->num_hook_entries; i++) {
+ verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state);
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) {
+ *pskb = skb;
+ return RX_HANDLER_PASS;
+ }
+ break;
+ case NF_DROP:
+ kfree_skb(skb);
+ return RX_HANDLER_CONSUMED;
+ case NF_QUEUE:
+ ret = nf_queue(skb, &state, e, i, verdict);
+ if (ret == 1)
+ continue;
+ return RX_HANDLER_CONSUMED;
+ default: /* STOLEN */
+ return RX_HANDLER_CONSUMED;
+ }
+ }
+frame_finish:
+ net = dev_net(skb->dev);
+ br_handle_frame_finish(net, NULL, skb);
+#else
+ br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
+#endif
+ return RX_HANDLER_CONSUMED;
}
/*
@@ -215,7 +264,6 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
- br_should_route_hook_t *rhook;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
@@ -227,6 +275,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
if (!skb)
return RX_HANDLER_CONSUMED;
+ memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+
p = br_port_get_rcu(skb->dev);
if (p->flags & BR_VLAN_TUNNEL) {
if (br_handle_ingress_vlan_tunnel(skb, p,
@@ -280,32 +330,28 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
goto forward;
}
- /* Deliver packet to local host only */
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
- NULL, skb, skb->dev, NULL, br_handle_local_finish);
- return RX_HANDLER_CONSUMED;
+ /* The else clause should be hit when nf_hook():
+ * - returns < 0 (drop/error)
+ * - returns = 0 (stolen/nf_queue)
+ * Thus return 1 from the okfn() to signal the skb is ok to pass
+ */
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ br_handle_local_finish) == 1) {
+ return RX_HANDLER_PASS;
+ } else {
+ return RX_HANDLER_CONSUMED;
+ }
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
- rhook = rcu_dereference(br_should_route_hook);
- if (rhook) {
- if ((*rhook)(skb)) {
- *pskb = skb;
- return RX_HANDLER_PASS;
- }
- dest = eth_hdr(skb)->h_dest;
- }
- /* fall through */
case BR_STATE_LEARNING:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- br_handle_frame_finish);
- break;
+ return nf_hook_bridge_pre(skb, pskb);
default:
drop:
kfree_skb(skb);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 812560d7f7a2..c2a30f79a9d0 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2013,7 +2013,8 @@ static void br_multicast_start_querier(struct net_bridge *br,
__br_multicast_open(br, query);
- list_for_each_entry(port, &br->port_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &br->port_list, list) {
if (port->state == BR_STATE_DISABLED ||
port->state == BR_STATE_BLOCKING)
continue;
@@ -2025,6 +2026,7 @@ static void br_multicast_start_querier(struct net_bridge *br,
br_multicast_enable(&port->ip6_own_query);
#endif
}
+ rcu_read_unlock();
}
int br_multicast_toggle(struct net_bridge *br, unsigned long val)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 4f9f59eba8b4..8dfcc2d285d8 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1441,7 +1441,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
- br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT)))
+ br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)))
return -EMSGSIZE;
#endif
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 7946aa3b6e09..334a8c496b50 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -321,6 +321,7 @@ enum net_bridge_opts {
BROPT_MTU_SET_BY_USER,
BROPT_VLAN_STATS_PER_PORT,
BROPT_NO_LL_LEARN,
+ BROPT_VLAN_BRIDGE_BINDING,
};
struct net_bridge {
@@ -425,15 +426,16 @@ struct br_input_skb_cb {
struct net_device *brdev;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- int igmp;
- int mrouters_only;
+ u8 igmp;
+ u8 mrouters_only:1;
#endif
-
- bool proxyarp_replied;
- bool src_port_isolated;
-
+ u8 proxyarp_replied:1;
+ u8 src_port_isolated:1;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- bool vlan_filtered;
+ u8 vlan_filtered:1;
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ u8 br_netfilter_broute:1;
#endif
#ifdef CONFIG_NET_SWITCHDEV
@@ -894,6 +896,9 @@ int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
void br_vlan_get_stats(const struct net_bridge_vlan *v,
struct br_vlan_stats *stats);
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
+void br_vlan_bridge_event(struct net_device *dev, unsigned long event,
+ void *ptr);
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
@@ -1077,6 +1082,16 @@ static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
struct br_vlan_stats *stats)
{
}
+
+static inline void br_vlan_port_event(struct net_bridge_port *p,
+ unsigned long event)
+{
+}
+
+static inline void br_vlan_bridge_event(struct net_device *dev,
+ unsigned long event, void *ptr)
+{
+}
#endif
struct nf_br_ops {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 0a02822b5667..2db63997f313 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -7,6 +7,8 @@
#include "br_private.h"
#include "br_private_tunnel.h"
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);
+
static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
@@ -293,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
__vlan_add_list(v);
__vlan_add_flags(v, flags);
+
+ if (p)
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
out:
return err;
@@ -357,6 +362,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
__vlan_del_list(v);
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
call_rcu(&v->rcu, nbp_vlan_rcu_free);
}
@@ -1264,3 +1270,211 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid,
return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_info);
+
+static int br_vlan_is_bind_vlan_dev(const struct net_device *dev)
+{
+ return is_vlan_dev(dev) &&
+ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING);
+}
+
+static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev,
+ __always_unused void *data)
+{
+ return br_vlan_is_bind_vlan_dev(dev);
+}
+
+static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev)
+{
+ int found;
+
+ rcu_read_lock();
+ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn,
+ NULL);
+ rcu_read_unlock();
+
+ return !!found;
+}
+
+struct br_vlan_bind_walk_data {
+ u16 vid;
+ struct net_device *result;
+};
+
+static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev,
+ void *data_in)
+{
+ struct br_vlan_bind_walk_data *data = data_in;
+ int found = 0;
+
+ if (br_vlan_is_bind_vlan_dev(dev) &&
+ vlan_dev_priv(dev)->vlan_id == data->vid) {
+ data->result = dev;
+ found = 1;
+ }
+
+ return found;
+}
+
+static struct net_device *
+br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid)
+{
+ struct br_vlan_bind_walk_data data = {
+ .vid = vid,
+ };
+
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn,
+ &data);
+ rcu_read_unlock();
+
+ return data.result;
+}
+
+static bool br_vlan_is_dev_up(const struct net_device *dev)
+{
+ return !!(dev->flags & IFF_UP) && netif_oper_up(dev);
+}
+
+static void br_vlan_set_vlan_dev_state(const struct net_bridge *br,
+ struct net_device *vlan_dev)
+{
+ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+ bool has_carrier = false;
+
+ if (!netif_carrier_ok(br->dev)) {
+ netif_carrier_off(vlan_dev);
+ return;
+ }
+
+ list_for_each_entry(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) {
+ has_carrier = true;
+ break;
+ }
+ }
+
+ if (has_carrier)
+ netif_carrier_on(vlan_dev);
+ else
+ netif_carrier_off(vlan_dev);
+}
+
+static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p)
+{
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+ struct net_bridge_vlan *vlan;
+ struct net_device *vlan_dev;
+
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev,
+ vlan->vid);
+ if (vlan_dev) {
+ if (br_vlan_is_dev_up(p->dev)) {
+ if (netif_carrier_ok(p->br->dev))
+ netif_carrier_on(vlan_dev);
+ } else {
+ br_vlan_set_vlan_dev_state(p->br, vlan_dev);
+ }
+ }
+ }
+}
+
+static void br_vlan_upper_change(struct net_device *dev,
+ struct net_device *upper_dev,
+ bool linking)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (!br_vlan_is_bind_vlan_dev(upper_dev))
+ return;
+
+ if (linking) {
+ br_vlan_set_vlan_dev_state(br, upper_dev);
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true);
+ } else {
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING,
+ br_vlan_has_upper_bind_vlan_dev(dev));
+ }
+}
+
+struct br_vlan_link_state_walk_data {
+ struct net_bridge *br;
+};
+
+static int br_vlan_link_state_change_fn(struct net_device *vlan_dev,
+ void *data_in)
+{
+ struct br_vlan_link_state_walk_data *data = data_in;
+
+ if (br_vlan_is_bind_vlan_dev(vlan_dev))
+ br_vlan_set_vlan_dev_state(data->br, vlan_dev);
+
+ return 0;
+}
+
+static void br_vlan_link_state_change(struct net_device *dev,
+ struct net_bridge *br)
+{
+ struct br_vlan_link_state_walk_data data = {
+ .br = br
+ };
+
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn,
+ &data);
+ rcu_read_unlock();
+}
+
+/* Must be protected by RTNL. */
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid)
+{
+ struct net_device *vlan_dev;
+
+ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
+ return;
+
+ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid);
+ if (vlan_dev)
+ br_vlan_set_vlan_dev_state(p->br, vlan_dev);
+}
+
+/* Must be protected by RTNL. */
+void br_vlan_bridge_event(struct net_device *dev, unsigned long event,
+ void *ptr)
+{
+ struct netdev_notifier_changeupper_info *info;
+ struct net_bridge *br;
+
+ switch (event) {
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ br_vlan_upper_change(dev, info->upper_dev, info->linking);
+ break;
+
+ case NETDEV_CHANGE:
+ case NETDEV_UP:
+ br = netdev_priv(dev);
+ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING))
+ return;
+ br_vlan_link_state_change(dev, br);
+ break;
+ }
+}
+
+/* Must be protected by RTNL. */
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
+{
+ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
+ return;
+
+ switch (event) {
+ case NETDEV_CHANGE:
+ case NETDEV_DOWN:
+ case NETDEV_UP:
+ br_vlan_set_all_vlan_dev_state(p);
+ break;
+ }
+}
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 276b60262981..ec2652a459da 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -15,6 +15,8 @@
#include <linux/module.h>
#include <linux/if_bridge.h>
+#include "../br_private.h"
+
/* EBT_ACCEPT means the frame will be bridged
* EBT_DROP means the frame will be routed
*/
@@ -48,30 +50,63 @@ static const struct ebt_table broute_table = {
.me = THIS_MODULE,
};
-static int ebt_broute(struct sk_buff *skb)
+static unsigned int ebt_broute(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *s)
{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
struct nf_hook_state state;
+ unsigned char *dest;
int ret;
+ if (!p || p->state != BR_STATE_FORWARDING)
+ return NF_ACCEPT;
+
nf_hook_state_init(&state, NF_BR_BROUTING,
- NFPROTO_BRIDGE, skb->dev, NULL, NULL,
- dev_net(skb->dev), NULL);
+ NFPROTO_BRIDGE, s->in, NULL, NULL,
+ s->net, NULL);
ret = ebt_do_table(skb, &state, state.net->xt.broute_table);
- if (ret == NF_DROP)
- return 1; /* route it */
- return 0; /* bridge it */
+
+ if (ret != NF_DROP)
+ return ret;
+
+ /* DROP in ebtables -t broute means that the
+ * skb should be routed, not bridged.
+ * This is awkward, but can't be changed for compatibility
+ * reasons.
+ *
+ * We map DROP to ACCEPT and set the ->br_netfilter_broute flag.
+ */
+ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1;
+
+ /* undo PACKET_HOST mangling done in br_input in case the dst
+ * address matches the logical bridge but not the port.
+ */
+ dest = eth_hdr(skb)->h_dest;
+ if (skb->pkt_type == PACKET_HOST &&
+ !ether_addr_equal(skb->dev->dev_addr, dest) &&
+ ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_OTHERHOST;
+
+ return NF_ACCEPT;
}
+static const struct nf_hook_ops ebt_ops_broute = {
+ .hook = ebt_broute,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_FIRST,
+};
+
static int __net_init broute_net_init(struct net *net)
{
- return ebt_register_table(net, &broute_table, NULL,
+ return ebt_register_table(net, &broute_table, &ebt_ops_broute,
&net->xt.broute_table);
}
static void __net_exit broute_net_exit(struct net *net)
{
- ebt_unregister_table(net, net->xt.broute_table, NULL);
+ ebt_unregister_table(net, net->xt.broute_table, &ebt_ops_broute);
}
static struct pernet_operations broute_net_ops = {
@@ -81,21 +116,11 @@ static struct pernet_operations broute_net_ops = {
static int __init ebtable_broute_init(void)
{
- int ret;
-
- ret = register_pernet_subsys(&broute_net_ops);
- if (ret < 0)
- return ret;
- /* see br_input.c */
- RCU_INIT_POINTER(br_should_route_hook,
- (br_should_route_hook_t *)ebt_broute);
- return 0;
+ return register_pernet_subsys(&broute_net_ops);
}
static void __exit ebtable_broute_fini(void)
{
- RCU_INIT_POINTER(br_should_route_hook, NULL);
- synchronize_net();
unregister_pernet_subsys(&broute_net_ops);
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index eb15891f8b9f..4e0091311d40 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1221,10 +1221,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
mutex_unlock(&ebt_mutex);
WRITE_ONCE(*res, table);
-
- if (!ops)
- return 0;
-
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret) {
__ebt_unregister_table(net, table);
@@ -1248,8 +1244,7 @@ out:
void ebt_unregister_table(struct net *net, struct ebt_table *table,
const struct nf_hook_ops *ops)
{
- if (ops)
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
__ebt_unregister_table(net, table);
}
@@ -2032,7 +2027,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
if (match_kern)
match_kern->match_size = ret;
- if (WARN_ON(type == EBT_COMPAT_TARGET && size_left))
+ /* rule should have no remaining data after target */
+ if (type == EBT_COMPAT_TARGET && size_left)
return -EINVAL;
match32 = (struct compat_ebt_entry_mwt *) buf;
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 711d7156efd8..6c6e01963aac 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -186,15 +186,19 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt)
goto noxoff;
if (likely(!netif_queue_stopped(caifd->netdev))) {
+ struct Qdisc *sch;
+
/* If we run with a TX queue, check if the queue is too long*/
txq = netdev_get_tx_queue(skb->dev, 0);
- qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc));
-
- if (likely(qlen == 0))
+ sch = rcu_dereference_bh(txq->qdisc);
+ if (likely(qdisc_is_empty(sch)))
goto noxoff;
+ /* can check for explicit qdisc len value only !NOLOCK,
+ * always set flow off otherwise
+ */
high = (caifd->netdev->tx_queue_len * q_high) / 100;
- if (likely(qlen < high))
+ if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high))
goto noxoff;
}
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1684ba5b51eb..e8fd5dc1780a 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -89,13 +89,7 @@ static atomic_t skbcounter = ATOMIC_INIT(0);
int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- struct sock *sk = sock->sk;
-
switch (cmd) {
-
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
-
default:
return -ENOIOCTLCMD;
}
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 79bb8afa9c0c..a34ee52f19ea 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1689,6 +1689,7 @@ static const struct proto_ops bcm_ops = {
.getname = sock_no_getname,
.poll = datagram_poll,
.ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/can/raw.c b/net/can/raw.c
index c70207537488..afcbff063a67 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -846,6 +846,7 @@ static const struct proto_ops raw_ops = {
.getname = raw_getname,
.poll = datagram_poll,
.ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = raw_setsockopt,
diff --git a/net/compat.c b/net/compat.c
index eeea5eb71639..a031bd333092 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -395,63 +395,6 @@ COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
return __compat_sys_setsockopt(fd, level, optname, optval, optlen);
}
-int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{
- struct compat_timeval __user *ctv;
- int err;
- struct timeval tv;
-
- if (COMPAT_USE_64BIT_TIME)
- return sock_get_timestamp(sk, userstamp);
-
- ctv = (struct compat_timeval __user *) userstamp;
- err = -ENOENT;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sock_read_timestamp(sk));
-
- if (tv.tv_sec == -1)
- return err;
- if (tv.tv_sec == 0) {
- ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- tv = ktime_to_timeval(kt);
- }
- err = 0;
- if (put_user(tv.tv_sec, &ctv->tv_sec) ||
- put_user(tv.tv_usec, &ctv->tv_usec))
- err = -EFAULT;
- return err;
-}
-EXPORT_SYMBOL(compat_sock_get_timestamp);
-
-int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct compat_timespec __user *ctv;
- int err;
- struct timespec ts;
-
- if (COMPAT_USE_64BIT_TIME)
- return sock_get_timestampns (sk, userstamp);
-
- ctv = (struct compat_timespec __user *) userstamp;
- err = -ENOENT;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sock_read_timestamp(sk));
- if (ts.tv_sec == -1)
- return err;
- if (ts.tv_sec == 0) {
- ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- ts = ktime_to_timespec(kt);
- }
- err = 0;
- if (put_user(ts.tv_sec, &ctv->tv_sec) ||
- put_user(ts.tv_nsec, &ctv->tv_nsec))
- err = -EFAULT;
- return err;
-}
-EXPORT_SYMBOL(compat_sock_get_timestampns);
-
static int __compat_sys_getsockopt(int fd, int level, int optname,
char __user *optval,
int __user *optlen)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 91bb5a083fee..45a162ef5e02 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -167,7 +167,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
- int *peeked, int *off, int *err,
+ int *off, int *err,
struct sk_buff **last)
{
bool peek_at_off = false;
@@ -194,7 +194,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
return NULL;
}
}
- *peeked = 1;
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
@@ -212,7 +211,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* @sk: socket
* @flags: MSG\_ flags
* @destructor: invoked under the receive lock on successful dequeue
- * @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
@@ -246,7 +244,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
- int *peeked, int *off, int *err,
+ int *off, int *err,
struct sk_buff **last)
{
struct sk_buff_head *queue = &sk->sk_receive_queue;
@@ -260,7 +258,6 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
- *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -270,7 +267,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
- peeked, off, &error, last);
+ off, &error, last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
@@ -294,7 +291,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
- int *peeked, int *off, int *err)
+ int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
@@ -302,8 +299,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
- off, err, &last);
+ skb = __skb_try_recv_datagram(sk, flags, destructor, off, err,
+ &last);
if (skb)
return skb;
@@ -319,10 +316,10 @@ EXPORT_SYMBOL(__skb_recv_datagram);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int noblock, int *err)
{
- int peeked, off = 0;
+ int off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- NULL, &peeked, &off, err);
+ NULL, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index b430f851f377..22f2640f559a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1184,7 +1184,21 @@ int dev_change_name(struct net_device *dev, const char *newname)
BUG_ON(!dev_net(dev));
net = dev_net(dev);
- if (dev->flags & IFF_UP)
+
+ /* Some auto-enslaved devices e.g. failover slaves are
+ * special, as userspace might rename the device after
+ * the interface had been brought up and running since
+ * the point kernel initiated auto-enslavement. Allow
+ * live name change even when these slave devices are
+ * up and running.
+ *
+ * Typically, users of these auto-enslaving devices
+ * don't actually care about slave name change, as
+ * they are supposed to operate on master interface
+ * directly.
+ */
+ if (dev->flags & IFF_UP &&
+ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
write_seqcount_begin(&devnet_rename_seq);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b2715a187a11..7b91605e75d6 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1047,14 +1047,15 @@ out:
static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
u16 pool_index, u32 size,
- enum devlink_sb_threshold_type threshold_type)
+ enum devlink_sb_threshold_type threshold_type,
+ struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink->ops;
if (ops->sb_pool_set)
return ops->sb_pool_set(devlink, sb_index, pool_index,
- size, threshold_type);
+ size, threshold_type, extack);
return -EOPNOTSUPP;
}
@@ -1082,7 +1083,8 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
return devlink_sb_pool_set(devlink, devlink_sb->index,
- pool_index, size, threshold_type);
+ pool_index, size, threshold_type,
+ info->extack);
}
static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
@@ -1243,14 +1245,15 @@ out:
static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
unsigned int sb_index, u16 pool_index,
- u32 threshold)
+ u32 threshold,
+ struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
if (ops->sb_port_pool_set)
return ops->sb_port_pool_set(devlink_port, sb_index,
- pool_index, threshold);
+ pool_index, threshold, extack);
return -EOPNOTSUPP;
}
@@ -1273,7 +1276,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
- pool_index, threshold);
+ pool_index, threshold, info->extack);
}
static int
@@ -1472,7 +1475,8 @@ out:
static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
unsigned int sb_index, u16 tc_index,
enum devlink_sb_pool_type pool_type,
- u16 pool_index, u32 threshold)
+ u16 pool_index, u32 threshold,
+ struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
@@ -1480,7 +1484,7 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
if (ops->sb_tc_pool_bind_set)
return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
tc_index, pool_type,
- pool_index, threshold);
+ pool_index, threshold, extack);
return -EOPNOTSUPP;
}
@@ -1515,7 +1519,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
tc_index, pool_type,
- pool_index, threshold);
+ pool_index, threshold, info->extack);
}
static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
diff --git a/net/core/failover.c b/net/core/failover.c
index 4a92a98ccce9..b5cd3c727285 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
goto err_upper_link;
}
- slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
if (fops && fops->slave_register &&
!fops->slave_register(slave_dev, failover_dev))
return NOTIFY_OK;
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
err_upper_link:
netdev_rx_handler_unregister(slave_dev);
done:
@@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)
netdev_rx_handler_unregister(slave_dev);
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
if (fops && fops->slave_unregister &&
!fops->slave_unregister(slave_dev, failover_dev))
diff --git a/net/core/filter.c b/net/core/filter.c
index 8904e3407163..9d28e7e8a4cb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2970,11 +2970,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
- BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2( \
+ BPF_ADJ_ROOM_ENCAP_L2_MASK))
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{
+ u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
u16 mac_len = 0, inner_net = 0, inner_trans = 0;
unsigned int gso_type = SKB_GSO_DODGY;
@@ -3009,6 +3012,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
mac_len = skb->network_header - skb->mac_header;
inner_net = skb->network_header;
+ if (inner_mac_len > len_diff)
+ return -EINVAL;
inner_trans = skb->transport_header;
}
@@ -3017,8 +3022,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
return ret;
if (encap) {
- /* inner mac == inner_net on l3 encap */
- skb->inner_mac_header = inner_net;
+ skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
skb_set_inner_protocol(skb, skb->protocol);
@@ -3032,7 +3036,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
gso_type |= SKB_GSO_GRE;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
gso_type |= SKB_GSO_IPXIP6;
- else
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
gso_type |= SKB_GSO_IPXIP4;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
@@ -3065,6 +3069,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
{
int ret;
+ if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO)
+ return -EINVAL;
+
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
/* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
@@ -4430,8 +4437,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
- if (val)
- tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+ tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}
@@ -4458,6 +4464,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
* Only binding to IP is supported.
*/
err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
@@ -4639,15 +4647,26 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
return BPF_FIB_LKUP_RET_UNSUPP_LWT;
dev = nhc->nhc_dev;
- if (nhc->nhc_has_gw)
- params->ipv4_dst = nhc->nhc_gw.ipv4;
params->rt_metric = res.fi->fib_priority;
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
*/
- neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+ if (likely(nhc->nhc_gw_family != AF_INET6)) {
+ if (nhc->nhc_gw_family)
+ params->ipv4_dst = nhc->nhc_gw.ipv4;
+
+ neigh = __ipv4_neigh_lookup_noref(dev,
+ (__force u32)params->ipv4_dst);
+ } else {
+ struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
+
+ params->family = AF_INET6;
+ *dst = nhc->nhc_gw.ipv6;
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
+ }
+
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
@@ -4661,13 +4680,13 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
{
struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct fib6_result res = {};
struct neighbour *neigh;
struct net_device *dev;
struct inet6_dev *idev;
- struct fib6_info *f6i;
struct flowi6 fl6;
int strict = 0;
- int oif;
+ int oif, err;
u32 mtu;
/* link local addresses are never forwarded */
@@ -4709,61 +4728,57 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (unlikely(!tb))
return BPF_FIB_LKUP_RET_NOT_FWDED;
- f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+ err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
+ strict);
} else {
fl6.flowi6_mark = 0;
fl6.flowi6_secid = 0;
fl6.flowi6_tun_key.tun_id = 0;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+ err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
}
- if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+ if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
+ res.f6i == net->ipv6.fib6_null_entry))
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
- switch (f6i->fib6_type) {
- case RTN_BLACKHOLE:
- return BPF_FIB_LKUP_RET_BLACKHOLE;
- case RTN_UNREACHABLE:
- return BPF_FIB_LKUP_RET_UNREACHABLE;
- case RTN_PROHIBIT:
- return BPF_FIB_LKUP_RET_PROHIBIT;
- default:
- return BPF_FIB_LKUP_RET_NOT_FWDED;
- }
- }
-
- if (f6i->fib6_type != RTN_UNICAST)
+ switch (res.fib6_type) {
+ /* only unicast is forwarded */
+ case RTN_UNICAST:
+ break;
+ case RTN_BLACKHOLE:
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ case RTN_UNREACHABLE:
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ case RTN_PROHIBIT:
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+ default:
return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
- if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
- f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
- fl6.flowi6_oif, NULL,
- strict);
+ ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
+ fl6.flowi6_oif != 0, NULL, strict);
if (check_mtu) {
- mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+ mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
if (params->tot_len > mtu)
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
- if (f6i->fib6_nh.fib_nh_lws)
+ if (res.nh->fib_nh_lws)
return BPF_FIB_LKUP_RET_UNSUPP_LWT;
- if (f6i->fib6_nh.fib_nh_has_gw)
- *dst = f6i->fib6_nh.fib_nh_gw6;
+ if (res.nh->fib_nh_gw_family)
+ *dst = res.nh->fib_nh_gw6;
- dev = f6i->fib6_nh.fib_nh_dev;
- params->rt_metric = f6i->fib6_metric;
+ dev = res.nh->fib_nh_dev;
+ params->rt_metric = res.f6i->fib6_metric;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
- * not needed here. Can not use __ipv6_neigh_lookup_noref here
- * because we need to get nd_tbl via the stub
+ * not needed here.
*/
- neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
- ndisc_hashfn, dst, dev);
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ac679f74ba47..9bf1b9ad1780 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -291,6 +291,7 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+ qstats->qlen = 0;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
@@ -306,6 +307,7 @@ void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
} else {
+ qstats->qlen = q->qlen;
qstats->backlog = q->backlog;
qstats->drops = q->drops;
qstats->requeues = q->requeues;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 19b557bd294b..94749e0e2cfd 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,7 +26,7 @@
#include <net/lwtunnel.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#ifdef CONFIG_MODULES
@@ -223,7 +223,8 @@ void lwtstate_free(struct lwtunnel_state *lws)
}
EXPORT_SYMBOL_GPL(lwtstate_free);
-int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
+ int encap_attr, int encap_type_attr)
{
const struct lwtunnel_encap_ops *ops;
struct nlattr *nest;
@@ -236,7 +237,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
lwtstate->type > LWTUNNEL_ENCAP_MAX)
return 0;
- nest = nla_nest_start(skb, RTA_ENCAP);
+ nest = nla_nest_start(skb, encap_attr);
if (!nest)
return -EMSGSIZE;
@@ -250,7 +251,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
if (ret)
goto nla_put_failure;
nla_nest_end(skb, nest);
- ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ ret = nla_put_u16(skb, encap_type_attr, lwtstate->type);
if (ret)
goto nla_put_failure;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 30f6fd8f68e0..997cfa8f99ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1920,6 +1920,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
+ if (tbl->allow_add && !tbl->allow_add(dev, extack)) {
+ err = -EINVAL;
+ goto out;
+ }
+
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
bool exempt_from_gc;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c14f0dc0157c..e4fd68389d6f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1747,20 +1747,16 @@ int netdev_register_kobject(struct net_device *ndev)
error = device_add(dev);
if (error)
- goto error_put_device;
+ return error;
error = register_queue_kobjects(ndev);
- if (error)
- goto error_device_del;
+ if (error) {
+ device_del(dev);
+ return error;
+ }
pm_runtime_set_memalloc_noio(dev, true);
- return 0;
-
-error_device_del:
- device_del(dev);
-error_put_device:
- put_device(dev);
return error;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7e6dcc625701..ebb5b6d21a13 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -839,7 +839,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
nla = tb[NETNSA_FD];
} else if (tb[NETNSA_NSID]) {
- peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID]));
+ peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
if (!peer)
peer = ERR_PTR(-ENOENT);
nla = tb[NETNSA_NSID];
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index b9057478d69c..7e3d0d99dfae 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -301,6 +301,4 @@ static int __init init_cgroup_netprio(void)
register_netdevice_notifier(&netprio_device_notifier);
return 0;
}
-
subsys_initcall(init_cgroup_netprio);
-MODULE_LICENSE("GPL v2");
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index 703cf76aa7c2..7109c168b5e0 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -185,9 +185,10 @@ void __init ptp_classifier_init(void)
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
};
- struct sock_fprog_kern ptp_prog = {
- .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
- };
+ struct sock_fprog_kern ptp_prog;
+
+ ptp_prog.len = ARRAY_SIZE(ptp_filter);
+ ptp_prog.filter = ptp_filter;
BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f9b964fd4e4d..5fa5bf3e9945 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4951,7 +4951,7 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
{
struct if_stats_msg *ifsm;
- if (nlh->nlmsg_len < sizeof(*ifsm)) {
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) {
NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
return -EINVAL;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9901f5322852..e89be6282693 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -258,6 +258,33 @@ nodata:
}
EXPORT_SYMBOL(__alloc_skb);
+/* Caller must provide SKB that is memset cleared */
+static struct sk_buff *__build_skb_around(struct sk_buff *skb,
+ void *data, unsigned int frag_size)
+{
+ struct skb_shared_info *shinfo;
+ unsigned int size = frag_size ? : ksize(data);
+
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ /* Assumes caller memset cleared SKB */
+ skb->truesize = SKB_TRUESIZE(size);
+ refcount_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+
+ return skb;
+}
+
/**
* __build_skb - build a network buffer
* @data: data buffer provided by caller
@@ -279,32 +306,15 @@ EXPORT_SYMBOL(__alloc_skb);
*/
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
- struct skb_shared_info *shinfo;
struct sk_buff *skb;
- unsigned int size = frag_size ? : ksize(data);
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
- if (!skb)
+ if (unlikely(!skb))
return NULL;
- size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-
memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->truesize = SKB_TRUESIZE(size);
- refcount_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->mac_header = (typeof(skb->mac_header))~0U;
- skb->transport_header = (typeof(skb->transport_header))~0U;
-
- /* make sure we initialize shinfo sequentially */
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
- return skb;
+ return __build_skb_around(skb, data, frag_size);
}
/* build_skb() is wrapper over __build_skb(), that specifically
@@ -325,6 +335,29 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(build_skb);
+/**
+ * build_skb_around - build a network buffer around provided skb
+ * @skb: sk_buff provide by caller, must be memset cleared
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ */
+struct sk_buff *build_skb_around(struct sk_buff *skb,
+ void *data, unsigned int frag_size)
+{
+ if (unlikely(!skb))
+ return NULL;
+
+ skb = __build_skb_around(skb, data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (page_is_pfmemalloc(virt_to_head_page(data)))
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(build_skb_around);
+
#define NAPI_SKB_CACHE_SIZE 64
struct napi_alloc_cache {
@@ -5082,7 +5115,8 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
- int mac_len;
+ int mac_len, meta_len;
+ void *meta;
if (skb_cow(skb, skb_headroom(skb)) < 0) {
kfree_skb(skb);
@@ -5094,6 +5128,13 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
mac_len - VLAN_HLEN - ETH_TLEN);
}
+
+ meta_len = skb_metadata_len(skb);
+ if (meta_len) {
+ meta = skb_metadata_end(skb) - meta_len;
+ memmove(meta + VLAN_HLEN, meta, meta_len);
+ }
+
skb->mac_header += VLAN_HLEN;
return skb;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 782343bb925b..925b84a872dd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -348,7 +348,7 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
}
- if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
*(struct old_timeval32 *)optval = tv32;
return sizeof(tv32);
@@ -372,7 +372,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool
{
struct __kernel_sock_timeval tv;
- if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32;
if (optlen < sizeof(tv32))
@@ -2977,39 +2977,44 @@ bool lock_sock_fast(struct sock *sk)
}
EXPORT_SYMBOL(lock_sock_fast);
-int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32)
{
- struct timeval tv;
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sock_read_timestamp(sk));
- if (tv.tv_sec == -1)
+ ts = ktime_to_timespec64(sock_read_timestamp(sk));
+ if (ts.tv_sec == -1)
return -ENOENT;
- if (tv.tv_sec == 0) {
+ if (ts.tv_sec == 0) {
ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- tv = ktime_to_timeval(kt);
+ sock_write_timestamp(sk, kt);;
+ ts = ktime_to_timespec64(kt);
}
- return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestamp);
-int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct timespec ts;
+ if (timeval)
+ ts.tv_nsec /= 1000;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sock_read_timestamp(sk));
- if (ts.tv_sec == -1)
- return -ENOENT;
- if (ts.tv_sec == 0) {
- ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- ts = ktime_to_timespec(sk->sk_stamp);
+#ifdef CONFIG_COMPAT_32BIT_TIME
+ if (time32)
+ return put_old_timespec32(&ts, userstamp);
+#endif
+#ifdef CONFIG_SPARC64
+ /* beware of padding in sparc64 timeval */
+ if (timeval && !in_compat_syscall()) {
+ struct __kernel_old_timeval __user tv = {
+ .tv_sec = ts.tv_sec,
+ .tv_usec = ts.tv_nsec,
+ };
+ if (copy_to_user(userstamp, &tv, sizeof(tv)))
+ return -EFAULT;
+ return 0;
}
- return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+#endif
+ return put_timespec64(&ts, userstamp);
}
-EXPORT_SYMBOL(sock_get_timestampns);
+EXPORT_SYMBOL(sock_gettstamp);
void sock_enable_timestamp(struct sock *sk, int flag)
{
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 26a21d97b6b0..004535e4c070 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -991,6 +991,7 @@ static const struct proto_ops inet_dccp_ops = {
/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
.poll = dccp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
/* FIXME: work on inet_listen to rename it to sock_common_listen */
.listen = inet_dccp_listen,
.shutdown = inet_shutdown,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 57d84e9b7b6f..c4e4d1301062 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1075,6 +1075,7 @@ static const struct proto_ops inet6_dccp_ops = {
.getname = inet6_getname,
.poll = dccp_poll,
.ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = inet_dccp_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 6cd3737593a6..7e47ffdd1412 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -42,7 +42,7 @@
#include <net/dn_fib.h>
#include <net/dn_neigh.h>
#include <net/dn_dev.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#define RT_MIN_TABLE 1
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 76338c38738a..19aa32fc1802 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -94,8 +94,6 @@ int dns_query(const char *type, const char *name, size_t namelen,
desclen += typelen + 1;
}
- if (!namelen)
- namelen = strnlen(name, 256);
if (namelen < 3 || namelen > 255)
return -EINVAL;
desclen += namelen + 1;
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
index d74d89d013b0..e45757fc477f 100644
--- a/net/hsr/Makefile
+++ b/net/hsr/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_HSR) += hsr.o
hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \
hsr_netlink.o hsr_slave.o hsr_forward.o
-hsr-$(CONFIG_DEBUG_FS) += hsr_prp_debugfs.o
+hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o
diff --git a/net/hsr/hsr_prp_debugfs.c b/net/hsr/hsr_debugfs.c
index b30e98734c61..94447974a3c0 100644
--- a/net/hsr/hsr_prp_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -1,9 +1,9 @@
/*
- * hsr_prp_debugfs code
- * Copyright (C) 2017 Texas Instruments Incorporated
+ * hsr_debugfs code
+ * Copyright (C) 2019 Texas Instruments Incorporated
*
* Author(s):
- * Murali Karicheri <m-karicheri2@ti.com?
+ * Murali Karicheri <m-karicheri2@ti.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -26,9 +26,9 @@ static void print_mac_address(struct seq_file *sfp, unsigned char *mac)
mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
}
-/* hsr_prp_node_table_show - Formats and prints node_table entries */
+/* hsr_node_table_show - Formats and prints node_table entries */
static int
-hsr_prp_node_table_show(struct seq_file *sfp, void *data)
+hsr_node_table_show(struct seq_file *sfp, void *data)
{
struct hsr_priv *priv = (struct hsr_priv *)sfp->private;
struct hsr_node *node;
@@ -52,40 +52,40 @@ hsr_prp_node_table_show(struct seq_file *sfp, void *data)
return 0;
}
-/* hsr_prp_node_table_open - Open the node_table file
+/* hsr_node_table_open - Open the node_table file
*
* Description:
* This routine opens a debugfs file node_table of specific hsr device
*/
static int
-hsr_prp_node_table_open(struct inode *inode, struct file *filp)
+hsr_node_table_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, hsr_prp_node_table_show, inode->i_private);
+ return single_open(filp, hsr_node_table_show, inode->i_private);
}
-static const struct file_operations hsr_prp_fops = {
+static const struct file_operations hsr_fops = {
.owner = THIS_MODULE,
- .open = hsr_prp_node_table_open,
+ .open = hsr_node_table_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
-/* hsr_prp_debugfs_init - create hsr-prp node_table file for dumping
+/* hsr_debugfs_init - create hsr node_table file for dumping
* the node table
*
* Description:
* When debugfs is configured this routine sets up the node_table file per
- * hsr/prp device for dumping the node_table entries
+ * hsr device for dumping the node_table entries
*/
-int hsr_prp_debugfs_init(struct hsr_priv *priv)
+int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
{
int rc = -1;
struct dentry *de = NULL;
- de = debugfs_create_dir("hsr", NULL);
+ de = debugfs_create_dir(hsr_dev->name, NULL);
if (!de) {
- pr_err("Cannot create hsr-prp debugfs root\n");
+ pr_err("Cannot create hsr debugfs root\n");
return rc;
}
@@ -93,25 +93,24 @@ int hsr_prp_debugfs_init(struct hsr_priv *priv)
de = debugfs_create_file("node_table", S_IFREG | 0444,
priv->node_tbl_root, priv,
- &hsr_prp_fops);
+ &hsr_fops);
if (!de) {
- pr_err("Cannot create hsr-prp node_table directory\n");
+ pr_err("Cannot create hsr node_table directory\n");
return rc;
}
priv->node_tbl_file = de;
- rc = 0;
- return rc;
+ return 0;
}
-/* hsr_prp_debugfs_term - Tear down debugfs intrastructure
+/* hsr_debugfs_term - Tear down debugfs intrastructure
*
* Description:
* When Debufs is configured this routine removes debugfs file system
- * elements that are specific to hsr-prp
+ * elements that are specific to hsr
*/
void
-hsr_prp_debugfs_term(struct hsr_priv *priv)
+hsr_debugfs_term(struct hsr_priv *priv)
{
debugfs_remove(priv->node_tbl_file);
priv->node_tbl_file = NULL;
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index b47a621e3f4e..15c72065df79 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -354,7 +354,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev)
hsr = netdev_priv(hsr_dev);
- hsr_prp_debugfs_term(hsr);
+ hsr_debugfs_term(hsr);
rtnl_lock();
hsr_for_each_port(hsr, port)
@@ -485,7 +485,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
goto fail;
mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
- res = hsr_prp_debugfs_init(hsr);
+ res = hsr_debugfs_init(hsr, hsr_dev);
if (res)
goto fail;
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 0cac992192d0..ddd9605bad04 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -359,6 +359,13 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port)
goto out_drop;
hsr_register_frame_in(frame.node_src, port, frame.sequence_nr);
hsr_forward_do(&frame);
+ /* Gets called for ingress frames as well as egress from master port.
+ * So check and increment stats for master port only here.
+ */
+ if (port->type == HSR_PT_MASTER) {
+ port->dev->stats.tx_packets++;
+ port->dev->stats.tx_bytes += skb->len;
+ }
if (frame.skb_hsr)
kfree_skb(frame.skb_hsr);
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 778213f07fe0..96fac696a1e1 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -184,15 +184,16 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb)
}
#if IS_ENABLED(CONFIG_DEBUG_FS)
-int hsr_prp_debugfs_init(struct hsr_priv *priv);
-void hsr_prp_debugfs_term(struct hsr_priv *priv);
+int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev);
+void hsr_debugfs_term(struct hsr_priv *priv);
#else
-static inline int hsr_prp_debugfs_init(struct hsr_priv *priv)
+static inline int hsr_debugfs_init(struct hsr_priv *priv,
+ struct net_device *hsr_dev)
{
return 0;
}
-static inline void hsr_prp_debugfs_term(struct hsr_priv *priv)
+static inline void hsr_debugfs_term(struct hsr_priv *priv)
{}
#endif
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index bc6b912603f1..ce2dfb997537 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -164,10 +164,6 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
struct sock *sk = sock->sk;
switch (cmd) {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
case SIOCGIFADDR:
case SIOCSIFADDR:
return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg,
@@ -426,6 +422,7 @@ static const struct proto_ops ieee802154_raw_ops = {
.getname = sock_no_getname,
.poll = datagram_poll,
.ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -988,6 +985,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
.getname = sock_no_getname,
.poll = datagram_poll,
.ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 08a8430f5647..5183a2daba64 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -915,12 +915,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct rtentry rt;
switch (cmd) {
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
- break;
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
- break;
case SIOCADDRT:
case SIOCDELRT:
if (copy_from_user(&rt, p, sizeof(struct rtentry)))
@@ -992,6 +986,7 @@ const struct proto_ops inet_stream_ops = {
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -1027,6 +1022,7 @@ const struct proto_ops inet_dgram_ops = {
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -1059,6 +1055,7 @@ static const struct proto_ops inet_sockraw_ops = {
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 1e976bb93d99..15427163a041 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -77,5 +77,4 @@ static int __init bpfilter_sockopt_init(void)
return 0;
}
-
-module_init(bpfilter_sockopt_init);
+device_initcall(bpfilter_sockopt_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 15f779bd26b3..d4b63f94f7be 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (rt->rt_gateway.sa_family == AF_INET && addr) {
unsigned int addr_type;
- cfg->fc_gw = addr;
+ cfg->fc_gw4 = addr;
+ cfg->fc_gw_family = AF_INET;
addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
if (rt->rt_flags & RTF_GATEWAY &&
addr_type == RTN_UNICAST)
@@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (cmd == SIOCDELRT)
return 0;
- if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
return -EINVAL;
if (cfg->fc_scope == RT_SCOPE_NOWHERE)
@@ -664,10 +665,55 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_DPORT] = { .type = NLA_U16 },
};
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct rtvia *via;
+ int alen;
+
+ if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
+ return -EINVAL;
+ }
+
+ via = nla_data(nla);
+ alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
+
+ switch (via->rtvia_family) {
+ case AF_INET:
+ if (alen != sizeof(__be32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET;
+ cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
+ break;
+ case AF_INET6:
+#ifdef CONFIG_IPV6
+ if (alen != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET6;
+ cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
+#else
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EINVAL;
+#endif
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
struct nlmsghdr *nlh, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
+ bool has_gw = false, has_via = false;
struct nlattr *attr;
int err, remaining;
struct rtmsg *rtm;
@@ -708,12 +754,17 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_oif = nla_get_u32(attr);
break;
case RTA_GATEWAY:
- cfg->fc_gw = nla_get_be32(attr);
+ has_gw = true;
+ cfg->fc_gw4 = nla_get_be32(attr);
+ if (cfg->fc_gw4)
+ cfg->fc_gw_family = AF_INET;
break;
case RTA_VIA:
- NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute");
- err = -EINVAL;
- goto errout;
+ has_via = true;
+ err = fib_gw_from_via(cfg, attr, extack);
+ if (err)
+ goto errout;
+ break;
case RTA_PRIORITY:
cfg->fc_priority = nla_get_u32(attr);
break;
@@ -752,6 +803,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
}
}
+ if (has_gw && has_via) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ goto errout;
+ }
+
return 0;
errout:
return err;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8e0cb1687a74..4336f1ec8ab0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -41,8 +41,9 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/ip6_fib.h>
#include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>
#include <net/addrconf.h>
@@ -276,7 +277,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
for_nexthops(fi) {
if (nh->fib_nh_oif != onh->fib_nh_oif ||
- nh->fib_nh_gw4 != onh->fib_nh_gw4 ||
+ nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->fib_nh_weight != onh->fib_nh_weight ||
@@ -287,6 +288,15 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET &&
+ nh->fib_nh_gw4 != onh->fib_nh_gw4)
+ return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
+ return -1;
+
onh++;
} endfor_nexthops(fi);
return 0;
@@ -447,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order,
struct fib_info **last_resort, int *last_idx,
int dflt)
{
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
struct neighbour *n;
int state = NUD_NONE;
- n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev);
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
+ else if (nhc->nhc_gw_family == AF_INET6)
+ n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
+ nhc->nhc_dev);
+ else
+ n = NULL;
+
if (n) {
state = n->nud_state;
neigh_release(n);
@@ -511,10 +529,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
goto init_failure;
nh->fib_nh_oif = cfg->fc_oif;
- if (cfg->fc_gw) {
- nh->fib_nh_gw4 = cfg->fc_gw;
- nh->fib_nh_has_gw = 1;
- }
+ nh->fib_nh_gw_family = cfg->fc_gw_family;
+ if (cfg->fc_gw_family == AF_INET)
+ nh->fib_nh_gw4 = cfg->fc_gw4;
+ else if (cfg->fc_gw_family == AF_INET6)
+ nh->fib_nh_gw6 = cfg->fc_gw6;
+
nh->fib_nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -586,11 +606,24 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla)
- fib_cfg.fc_gw = nla_get_in_addr(nla);
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
+ if (nla) {
+ fib_cfg.fc_gw4 = nla_get_in_addr(nla);
+ if (fib_cfg.fc_gw4)
+ fib_cfg.fc_gw_family = AF_INET;
+ } else if (nlav) {
+ ret = fib_gw_from_via(&fib_cfg, nlav, extack);
+ if (ret)
+ goto errout;
+ }
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla)
@@ -616,10 +649,16 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
"Nexthop device index does not match RTA_OIF");
goto errout;
}
- if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) {
- NL_SET_ERR_MSG(extack,
- "Nexthop gateway does not match RTA_GATEWAY");
- goto errout;
+ if (cfg->fc_gw_family) {
+ if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
+ (cfg->fc_gw_family == AF_INET &&
+ fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) ||
+ (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
+ goto errout;
+ }
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
@@ -719,7 +758,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
return 1;
- if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_oif || cfg->fc_gw_family) {
if (cfg->fc_encap) {
if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
fi->fib_nh, cfg, extack))
@@ -730,10 +769,20 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
cfg->fc_flow != fi->fib_nh->nh_tclassid)
return 1;
#endif
- if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) &&
- (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4))
- return 0;
- return 1;
+ if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
+ (cfg->fc_gw_family &&
+ cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET &&
+ cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6))
+ return 1;
+
+ return 0;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -754,11 +803,43 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4)
- return 1;
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
+
+ if (nla) {
+ if (nh->fib_nh_gw_family != AF_INET ||
+ nla_get_in_addr(nla) != nh->fib_nh_gw4)
+ return 1;
+ } else if (nlav) {
+ struct fib_config cfg2;
+ int err;
+
+ err = fib_gw_from_via(&cfg2, nlav, extack);
+ if (err)
+ return err;
+
+ switch (nh->fib_nh_gw_family) {
+ case AF_INET:
+ if (cfg2.fc_gw_family != AF_INET ||
+ cfg2.fc_gw4 != nh->fib_nh_gw4)
+ return 1;
+ break;
+ case AF_INET6:
+ if (cfg2.fc_gw_family != AF_INET6 ||
+ ipv6_addr_cmp(&cfg2.fc_gw6,
+ &nh->fib_nh_gw6))
+ return 1;
+ break;
+ }
+ }
+
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
@@ -812,6 +893,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
return true;
}
+static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
+ u32 table, struct netlink_ext_ack *extack)
+{
+ struct fib6_config cfg = {
+ .fc_table = table,
+ .fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
+ .fc_ifindex = nh->fib_nh_oif,
+ .fc_gateway = nh->fib_nh_gw6,
+ };
+ struct fib6_nh fib6_nh = {};
+ int err;
+
+ err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
+ if (!err) {
+ nh->fib_nh_dev = fib6_nh.fib_nh_dev;
+ dev_hold(nh->fib_nh_dev);
+ nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
+ nh->fib_nh_scope = RT_SCOPE_LINK;
+
+ ipv6_stub->fib6_nh_release(&fib6_nh);
+ }
+
+ return err;
+}
/*
* Picture
@@ -856,134 +961,152 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
* |
* |-> {local prefix} (terminal node)
*/
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
- struct netlink_ext_ack *extack)
+static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
+ u8 scope, struct netlink_ext_ack *extack)
{
- int err = 0;
- struct net *net;
struct net_device *dev;
+ struct fib_result res;
+ int err;
- net = cfg->fc_nlinfo.nl_net;
- if (nh->fib_nh_gw4) {
- struct fib_result res;
-
- if (nh->fib_nh_flags & RTNH_F_ONLINK) {
- unsigned int addr_type;
+ if (nh->fib_nh_flags & RTNH_F_ONLINK) {
+ unsigned int addr_type;
- if (cfg->fc_scope >= RT_SCOPE_LINK) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid scope");
- return -EINVAL;
- }
- dev = __dev_get_by_index(net, nh->fib_nh_oif);
- if (!dev) {
- NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
- return -ENODEV;
- }
- if (!(dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop device is not up");
- return -ENETDOWN;
- }
- addr_type = inet_addr_type_dev_table(net, dev,
- nh->fib_nh_gw4);
- if (addr_type != RTN_UNICAST) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway");
- return -EINVAL;
- }
- if (!netif_carrier_ok(dev))
- nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- nh->fib_nh_dev = dev;
- dev_hold(dev);
- nh->fib_nh_scope = RT_SCOPE_LINK;
- return 0;
+ if (scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
+ return -EINVAL;
}
- rcu_read_lock();
- {
- struct fib_table *tbl = NULL;
- struct flowi4 fl4 = {
- .daddr = nh->fib_nh_gw4,
- .flowi4_scope = cfg->fc_scope + 1,
- .flowi4_oif = nh->fib_nh_oif,
- .flowi4_iif = LOOPBACK_IFINDEX,
- };
-
- /* It is not necessary, but requires a bit of thinking */
- if (fl4.flowi4_scope < RT_SCOPE_LINK)
- fl4.flowi4_scope = RT_SCOPE_LINK;
-
- if (cfg->fc_table)
- tbl = fib_get_table(net, cfg->fc_table);
-
- if (tbl)
- err = fib_table_lookup(tbl, &fl4, &res,
- FIB_LOOKUP_IGNORE_LINKSTATE |
- FIB_LOOKUP_NOREF);
-
- /* on error or if no table given do full lookup. This
- * is needed for example when nexthops are in the local
- * table rather than the given table
- */
- if (!tbl || err) {
- err = fib_lookup(net, &fl4, &res,
- FIB_LOOKUP_IGNORE_LINKSTATE);
- }
-
- if (err) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway");
- rcu_read_unlock();
- return err;
- }
+ dev = __dev_get_by_index(net, nh->fib_nh_oif);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
+ return -ENODEV;
}
- err = -EINVAL;
- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
- NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
- goto out;
+ if (!(dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ return -ENETDOWN;
}
- nh->fib_nh_scope = res.scope;
- nh->fib_nh_oif = FIB_RES_OIF(res);
- nh->fib_nh_dev = dev = FIB_RES_DEV(res);
- if (!dev) {
- NL_SET_ERR_MSG(extack,
- "No egress device for nexthop gateway");
- goto out;
+ addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
+ if (addr_type != RTN_UNICAST) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ return -EINVAL;
}
- dev_hold(dev);
if (!netif_carrier_ok(dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
- } else {
- struct in_device *in_dev;
-
- if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
- NL_SET_ERR_MSG(extack,
- "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
- return -EINVAL;
+ nh->fib_nh_dev = dev;
+ dev_hold(dev);
+ nh->fib_nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ rcu_read_lock();
+ {
+ struct fib_table *tbl = NULL;
+ struct flowi4 fl4 = {
+ .daddr = nh->fib_nh_gw4,
+ .flowi4_scope = scope + 1,
+ .flowi4_oif = nh->fib_nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+
+ if (table)
+ tbl = fib_get_table(net, table);
+
+ if (tbl)
+ err = fib_table_lookup(tbl, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE |
+ FIB_LOOKUP_NOREF);
+
+ /* on error or if no table given do full lookup. This
+ * is needed for example when nexthops are in the local
+ * table rather than the given table
+ */
+ if (!tbl || err) {
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
}
- rcu_read_lock();
- err = -ENODEV;
- in_dev = inetdev_by_index(net, nh->fib_nh_oif);
- if (!in_dev)
- goto out;
- err = -ENETDOWN;
- if (!(in_dev->dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
goto out;
}
- nh->fib_nh_dev = in_dev->dev;
- dev_hold(nh->fib_nh_dev);
- nh->fib_nh_scope = RT_SCOPE_HOST;
- if (!netif_carrier_ok(nh->fib_nh_dev))
- nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- err = 0;
}
+
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ goto out;
+ }
+ nh->fib_nh_scope = res.scope;
+ nh->fib_nh_oif = FIB_RES_OIF(res);
+ nh->fib_nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack,
+ "No egress device for nexthop gateway");
+ goto out;
+ }
+ dev_hold(dev);
+ if (!netif_carrier_ok(dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct in_device *in_dev;
+ int err;
+
+ if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->fib_nh_oif);
+ if (!in_dev)
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+ goto out;
+ }
+
+ nh->fib_nh_dev = in_dev->dev;
+ dev_hold(nh->fib_nh_dev);
+ nh->fib_nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->fib_nh_dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = 0;
out:
rcu_read_unlock();
return err;
}
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ u32 table = cfg->fc_table;
+ int err;
+
+ if (nh->fib_nh_gw_family == AF_INET)
+ err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ err = fib_check_nh_v6_gw(net, nh, table, extack);
+ else
+ err = fib_check_nh_nongw(net, nh, extack);
+
+ return err;
+}
+
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
unsigned int mask = (fib_info_hash_size - 1);
@@ -1204,7 +1327,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
if (fib_props[cfg->fc_type].error) {
- if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) {
+ if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
NL_SET_ERR_MSG(extack,
"Gateway, device and multipath can not be specified for this route type");
goto err_inval;
@@ -1238,7 +1361,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
"Route with host scope can not have multiple nexthops");
goto err_inval;
}
- if (nh->fib_nh_gw4) {
+ if (nh->fib_nh_gw_family) {
NL_SET_ERR_MSG(extack,
"Route with host scope can not have a gateway");
goto err_inval;
@@ -1269,6 +1392,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+ fi->fib_nh_is_v6 = true;
} endfor_nexthops(fi)
fib_rebalance(fi);
@@ -1319,7 +1444,7 @@ failure:
}
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
- unsigned int *flags, bool skip_oif)
+ unsigned char *flags, bool skip_oif)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
*flags |= RTNH_F_DEAD;
@@ -1341,18 +1466,32 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
rcu_read_unlock();
}
- if (nhc->nhc_has_gw) {
- switch (nhc->nhc_family) {
- case AF_INET:
- if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
- goto nla_put_failure;
- break;
- case AF_INET6:
- if (nla_put_in6_addr(skb, RTA_GATEWAY,
- &nhc->nhc_gw.ipv6) < 0)
+ switch (nhc->nhc_gw_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ /* if gateway family does not match nexthop family
+ * gateway is encoded as RTA_VIA
+ */
+ if (nhc->nhc_gw_family != nhc->nhc_family) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
goto nla_put_failure;
- break;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
+ } else if (nla_put_in6_addr(skb, RTA_GATEWAY,
+ &nhc->nhc_gw.ipv6) < 0) {
+ goto nla_put_failure;
}
+ break;
}
*flags |= (nhc->nhc_flags & RTNH_F_ONLINK);
@@ -1364,7 +1503,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
goto nla_put_failure;
if (nhc->nhc_lwtstate &&
- lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0)
+ lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+ RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
return 0;
@@ -1380,7 +1520,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
{
const struct net_device *dev = nhc->nhc_dev;
struct rtnexthop *rtnh;
- unsigned int flags = 0;
+ unsigned char flags = 0;
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
if (!rtnh)
@@ -1479,7 +1619,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
if (fi->fib_nhs == 1) {
struct fib_nh *nh = &fi->fib_nh[0];
- unsigned int flags = 0;
+ unsigned char flags = 0;
if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
goto nla_put_failure;
@@ -1762,7 +1902,7 @@ out:
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
*/
-int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct fib_info *prev_fi;
unsigned int hash;
@@ -1832,8 +1972,14 @@ static bool fib_good_nh(const struct fib_nh *nh)
rcu_read_lock_bh();
- n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
- (__force u32)nh->fib_nh_gw4);
+ if (likely(nh->fib_nh_gw_family == AF_INET))
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+ &nh->fib_nh_gw6);
+ else
+ n = NULL;
if (n)
state = n->nud_state;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 100e63f57ea6..1ca1586a7e46 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -121,6 +121,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
struct guehdr *guehdr;
void *data;
u16 doffset = 0;
+ u8 proto_ctype;
if (!fou)
return 1;
@@ -136,7 +137,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
break;
case 1: {
- /* Direct encasulation of IPv4 or IPv6 */
+ /* Direct encapsulation of IPv4 or IPv6 */
int prot;
@@ -170,9 +171,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
/* guehdr may change after pull */
guehdr = (struct guehdr *)&udp_hdr(skb)[1];
- hdrlen = sizeof(struct guehdr) + optlen;
-
- if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
+ if (validate_gue_flags(guehdr, optlen))
goto drop;
hdrlen = sizeof(struct guehdr) + optlen;
@@ -212,13 +211,14 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
if (unlikely(guehdr->control))
return gue_control_message(skb, guehdr);
+ proto_ctype = guehdr->proto_ctype;
__skb_pull(skb, sizeof(struct udphdr) + hdrlen);
skb_reset_transport_header(skb);
if (iptunnel_pull_offloads(skb))
goto drop;
- return -guehdr->proto_ctype;
+ return -proto_ctype;
drop:
kfree_skb(skb);
@@ -1137,7 +1137,7 @@ static int gue_err(struct sk_buff *skb, u32 info)
case 0: /* Full GUE header present */
break;
case 1: {
- /* Direct encasulation of IPv4 or IPv6 */
+ /* Direct encapsulation of IPv4 or IPv6 */
skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
switch (((struct iphdr *)guehdr)->version) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6ea523d71947..a175e3e7ae97 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
goto route_err;
rcu_read_unlock();
return &rt->dst;
@@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
goto route_err;
return &rt->dst;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 00ec819f949b..06f6f280b9ff 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb)
rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_uses_gateway)
+ if (opt->is_strictroute && rt->rt_gw_family)
goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fd219f7bd3ea..4b0526441476 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
struct erspan_base_hdr *ershdr;
- struct erspan_metadata *pkt_md;
struct ip_tunnel_net *itn;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -282,9 +281,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
- ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
- pkt_md = (struct erspan_metadata *)(ershdr + 1);
-
if (__iptunnel_pull_header(skb,
len,
htons(ETH_P_TEB),
@@ -292,8 +288,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
goto drop;
if (tunnel->collect_md) {
+ struct erspan_metadata *pkt_md, *md;
struct ip_tunnel_info *info;
- struct erspan_metadata *md;
+ unsigned char *gh;
__be64 tun_id;
__be16 flags;
@@ -306,6 +303,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!tun_dst)
return PACKET_REJECT;
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
md->version = ver;
md2 = &md->u.md2;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 10b35328cfbc..4e42c1974ba2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
- u32 nexthop;
+ bool is_v6gw = false;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
@@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
}
rcu_read_lock_bh();
- nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
- neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
- if (unlikely(!neigh))
- neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
sock_confirm_neigh(skb, neigh);
- res = neigh_output(neigh, skb);
-
+ /* if crossing protocols, can not use the cached header */
+ res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock_bh();
return res;
}
@@ -472,7 +469,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9a3f13edc98e..a8eb97777c0a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -66,7 +66,7 @@
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#include <linux/nospec.h>
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c98391d49200..1412b029f37f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,14 +27,6 @@ config NF_TABLES_IPV4
if NF_TABLES_IPV4
-config NFT_CHAIN_ROUTE_IPV4
- tristate "IPv4 nf_tables route chain support"
- help
- This option enables the "route" chain for IPv4 in nf_tables. This
- chain type is used to force packet re-routing after mangling header
- fields such as the source, destination, type of service and
- the packet mark.
-
config NFT_REJECT_IPV4
select NF_REJECT_IPV4
default NFT_REJECT
@@ -232,16 +224,10 @@ if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- select NF_NAT_MASQUERADE
- default m if NETFILTER_ADVANCED=n
+ select NETFILTER_XT_TARGET_MASQUERADE
help
- Masquerading is a special case of NAT: all outgoing connections are
- changed to seem to come from a particular interface's address, and
- if the interface goes down, those connections are lost. This is
- only useful for dialup accounts with dynamic IP address (ie. your IP
- address will be different on next dialup).
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
config IP_NF_TARGET_NETMAP
tristate "NETMAP target support"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index e241f5188ebe..c50e0ec095d2 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -24,7 +24,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
@@ -49,7 +48,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
# targets
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
deleted file mode 100644
index 7d82934c46f4..000000000000
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- struct nft_pktinfo pkt;
- u32 mark;
- __be32 saddr, daddr;
- u_int8_t tos;
- const struct iphdr *iph;
- int err;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
-
- mark = skb->mark;
- iph = ip_hdr(skb);
- saddr = iph->saddr;
- daddr = iph->daddr;
- tos = iph->tos;
-
- ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_STOLEN) {
- iph = ip_hdr(skb);
-
- if (iph->saddr != saddr ||
- iph->daddr != daddr ||
- skb->mark != mark ||
- iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
- return ret;
-}
-
-static const struct nft_chain_type nft_chain_route_ipv4 = {
- .name = "route",
- .type = NFT_CHAIN_T_ROUTE,
- .family = NFPROTO_IPV4,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_LOCAL_OUT),
- .hooks = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
-};
-
-static int __init nft_chain_route_init(void)
-{
- nft_register_chain_type(&nft_chain_route_ipv4);
-
- return 0;
-}
-
-static void __exit nft_chain_route_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_route_ipv4);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3f2adf630d4..4950adeb05c0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -434,37 +434,46 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
struct net_device *dev = dst->dev;
- const __be32 *pkey = daddr;
- const struct rtable *rt;
struct neighbour *n;
- rt = (const struct rtable *) dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *) &rt->rt_gateway;
- else if (skb)
- pkey = &ip_hdr(skb)->daddr;
+ rcu_read_lock_bh();
+
+ if (likely(rt->rt_gw_family == AF_INET)) {
+ n = ip_neigh_gw4(dev, rt->rt_gw4);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ n = ip_neigh_gw6(dev, &rt->rt_gw6);
+ } else {
+ __be32 pkey;
+
+ pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+ n = ip_neigh_gw4(dev, pkey);
+ }
+
+ if (n && !refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
+
+ rcu_read_unlock_bh();
- n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
- if (n)
- return n;
- return neigh_create(&arp_tbl, pkey, dev);
+ return n;
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
- const struct rtable *rt;
- rt = (const struct rtable *)dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *)&rt->rt_gateway;
- else if (!daddr ||
+ if (rt->rt_gw_family == AF_INET) {
+ pkey = (const __be32 *)&rt->rt_gw4;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
+ } else if (!daddr ||
(rt->rt_flags &
- (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
return;
-
+ }
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
@@ -629,8 +638,8 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
- rt->rt_gateway = fnhe->fnhe_gw;
- rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = fnhe->fnhe_gw;
}
}
@@ -747,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
return;
}
- if (rt->rt_gateway != old_gw)
+ if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
return;
in_dev = __in_dev_get_rcu(dev);
@@ -1189,11 +1198,39 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
return dst;
}
+static void ipv4_send_dest_unreach(struct sk_buff *skb)
+{
+ struct ip_options opt;
+ int res;
+
+ /* Recompile ip options since IPCB may not be valid anymore.
+ * Also check we have a reasonable ipv4 header.
+ */
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
+ ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
+ return;
+
+ memset(&opt, 0, sizeof(opt));
+ if (ip_hdr(skb)->ihl > 5) {
+ if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
+ return;
+ opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+
+ rcu_read_lock();
+ res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
+ return;
+ }
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+}
+
static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ ipv4_send_dest_unreach(skb);
rt = skb_rtable(skb);
if (rt)
@@ -1282,7 +1319,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = READ_ONCE(dst->dev->mtu);
if (unlikely(ip_mtu_locked(dst))) {
- if (rt->rt_uses_gateway && mtu > 576)
+ if (rt->rt_gw_family && mtu > 576)
mtu = 576;
}
@@ -1410,8 +1447,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
orig = NULL;
}
fill_route_from_fnhe(rt, fnhe);
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw4 = daddr;
+ rt->rt_gw_family = AF_INET;
+ }
if (do_cache) {
dst_hold(&rt->dst);
@@ -1535,14 +1574,20 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
if (fi) {
struct fib_nh_common *nhc = FIB_RES_NHC(*res);
- struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
+ struct fib_nh *nh;
- if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
- rt->rt_gateway = nh->fib_nh_gw4;
- rt->rt_uses_gateway = 1;
+ if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+ rt->rt_gw_family = nhc->nhc_gw_family;
+ /* only INET and INET6 are supported */
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ rt->rt_gw4 = nhc->nhc_gw.ipv4;
+ else
+ rt->rt_gw6 = nhc->nhc_gw.ipv6;
}
+
ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+ nh = container_of(nhc, struct fib_nh, nh_common);
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
@@ -1557,8 +1602,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = daddr;
+ }
rt_add_uncached_list(rt);
}
} else
@@ -1591,8 +1638,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
- rt->rt_gateway = 0;
- rt->rt_uses_gateway = 0;
+ rt->rt_gw_family = 0;
+ rt->rt_gw4 = 0;
INIT_LIST_HEAD(&rt->rt_uncached);
rt->dst.output = ip_output;
@@ -1734,8 +1781,9 @@ static int __mkroute_input(struct sk_buff *skb,
do_cache = res->fi && !itag;
if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
skb->protocol == htons(ETH_P_IP)) {
- __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+ __be32 gw;
+ gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
if (IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, gw))
IPCB(skb)->flags |= IPSKB_DOREDIRECT;
@@ -2284,7 +2332,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
} else {
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
- !(nhc->nhc_has_gw &&
+ !(nhc->nhc_gw_family &&
nhc->nhc_scope == RT_SCOPE_LINK))) {
do_cache = false;
goto add;
@@ -2594,8 +2642,11 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
- rt->rt_gateway = ort->rt_gateway;
- rt->rt_uses_gateway = ort->rt_uses_gateway;
+ rt->rt_gw_family = ort->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ rt->rt_gw4 = ort->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ rt->rt_gw6 = ort->rt_gw6;
INIT_LIST_HEAD(&rt->rt_uncached);
}
@@ -2674,9 +2725,22 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_uses_gateway &&
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
goto nla_put_failure;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
expires = rt->dst.expires;
if (expires) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2316c08e9591..875867b64d6a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255;
static u32 u32_max_div_HZ = UINT_MAX / HZ;
+static int one_day_secs = 24 * 3600;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1160,7 +1161,9 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_day_secs
},
{
.procname = "tcp_autocorking",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 603e770d59b3..f7567a3698eb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -868,7 +868,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
if (likely(!size)) {
skb = sk->sk_tx_skb_cache;
if (skb && !skb_cloned(skb)) {
- skb->truesize -= skb->data_len;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
sk->sk_tx_skb_cache = NULL;
pskb_trim(skb, 0);
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 359da68d7c06..477cb4aa456c 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -49,9 +49,8 @@
#define DCTCP_MAX_ALPHA 1024U
struct dctcp {
- u32 acked_bytes_ecn;
- u32 acked_bytes_total;
- u32 prior_snd_una;
+ u32 old_delivered;
+ u32 old_delivered_ce;
u32 prior_rcv_nxt;
u32 dctcp_alpha;
u32 next_seq;
@@ -73,8 +72,8 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
{
ca->next_seq = tp->snd_nxt;
- ca->acked_bytes_ecn = 0;
- ca->acked_bytes_total = 0;
+ ca->old_delivered = tp->delivered;
+ ca->old_delivered_ce = tp->delivered_ce;
}
static void dctcp_init(struct sock *sk)
@@ -86,7 +85,6 @@ static void dctcp_init(struct sock *sk)
sk->sk_state == TCP_CLOSE)) {
struct dctcp *ca = inet_csk_ca(sk);
- ca->prior_snd_una = tp->snd_una;
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
@@ -118,37 +116,25 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
- u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
-
- /* If ack did not advance snd_una, count dupack as MSS size.
- * If ack did update window, do not count it at all.
- */
- if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
- acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
- if (acked_bytes) {
- ca->acked_bytes_total += acked_bytes;
- ca->prior_snd_una = tp->snd_una;
-
- if (flags & CA_ACK_ECE)
- ca->acked_bytes_ecn += acked_bytes;
- }
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
- u64 bytes_ecn = ca->acked_bytes_ecn;
+ u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
- if (bytes_ecn) {
+ if (delivered_ce) {
+ u32 delivered = tp->delivered - ca->old_delivered;
+
/* If dctcp_shift_g == 1, a 32bit value would overflow
- * after 8 Mbytes.
+ * after 8 M packets.
*/
- bytes_ecn <<= (10 - dctcp_shift_g);
- do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+ delivered_ce <<= (10 - dctcp_shift_g);
+ delivered_ce /= max(1U, delivered);
- alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+ alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
}
/* dctcp_alpha can be read from dctcp_get_info() without
* synchro, so we ask compiler to not use dctcp_alpha
@@ -200,6 +186,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
{
const struct dctcp *ca = inet_csk_ca(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Fill it also in case of VEGASINFO due to req struct limits.
* We can still correctly retrieve it later.
@@ -211,8 +198,10 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
info->dctcp.dctcp_enabled = 1;
info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
info->dctcp.dctcp_alpha = ca->dctcp_alpha;
- info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
- info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
+ info->dctcp.dctcp_ab_ecn = tp->mss_cache *
+ (tp->delivered_ce - ca->old_delivered_ce);
+ info->dctcp.dctcp_ab_tot = tp->mss_cache *
+ (tp->delivered - ca->old_delivered);
}
*attr = INET_DIAG_DCTCPINFO;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6660ce2a7333..97671bff597a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -402,11 +402,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int room;
+
+ room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
/* Check #1 */
- if (tp->rcv_ssthresh < tp->window_clamp &&
- (int)tp->rcv_ssthresh < tcp_space(sk) &&
- !tcp_under_memory_pressure(sk)) {
+ if (room > 0 && !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -419,8 +420,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
if (incr) {
incr = max_t(int, incr, 2 * skb->len);
- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
- tp->window_clamp);
+ tp->rcv_ssthresh += min(room, incr);
inet_csk(sk)->icsk_ack.quick |= 1;
}
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 372fdc5381a9..3c58ba02af7d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1631,7 +1631,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
EXPORT_SYMBOL(udp_ioctl);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
- int noblock, int *peeked, int *off, int *err)
+ int noblock, int *off, int *err)
{
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
struct sk_buff_head *queue;
@@ -1650,13 +1650,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
break;
error = -EAGAIN;
- *peeked = 0;
do {
spin_lock_bh(&queue->lock);
skb = __skb_try_recv_from_queue(sk, queue, flags,
udp_skb_destructor,
- peeked, off, err,
- &last);
+ off, err, &last);
if (skb) {
spin_unlock_bh(&queue->lock);
return skb;
@@ -1677,8 +1675,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
skb = __skb_try_recv_from_queue(sk, queue, flags,
udp_skb_dtor_locked,
- peeked, off, err,
- &last);
+ off, err, &last);
spin_unlock(&sk_queue->lock);
spin_unlock_bh(&queue->lock);
if (skb)
@@ -1713,8 +1710,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
+ int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -1722,9 +1718,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
- peeking = flags & MSG_PEEK;
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
if (!skb)
return err;
@@ -1762,7 +1757,7 @@ try_again:
}
if (unlikely(err)) {
- if (!peeked) {
+ if (!peeking) {
atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
@@ -1771,7 +1766,7 @@ try_again:
return err;
}
- if (!peeked)
+ if (!peeking)
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS, is_udplite);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d73a6d6652f6..72d19b1838ed 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -97,8 +97,11 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
RTCF_LOCAL);
xdst->u.rt.rt_type = rt->rt_type;
- xdst->u.rt.rt_gateway = rt->rt_gateway;
- xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ xdst->u.rt.rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ xdst->u.rt.rt_gw6 = rt->rt_gw6;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2e8d1d2d8d3d..340a0f06f974 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2421,7 +2421,7 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
continue;
- if (no_gw && rt->fib6_nh.fib_nh_has_gw)
+ if (no_gw && rt->fib6_nh.fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 945b66e3008f..763a947e0d14 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -144,43 +144,53 @@ static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
return NULL;
}
-static struct fib6_info *
+static int
eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6, int flags)
+ int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- return NULL;
+ return -EAFNOSUPPORT;
}
-static struct fib6_info *
+static int
eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+ struct fib6_result *res, int flags)
{
- return NULL;
+ return -EAFNOSUPPORT;
}
-static struct fib6_info *
-eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb, int strict)
+static void
+eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
{
- return f6i;
}
static u32
-eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
- struct in6_addr *saddr)
+eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
return 0;
}
+static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
.fib6_lookup = eafnosupport_fib6_lookup,
- .fib6_multipath_select = eafnosupport_fib6_multipath_select,
+ .fib6_select_path = eafnosupport_fib6_select_path,
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
+ .fib6_nh_init = eafnosupport_fib6_nh_init,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index d43d076c98f5..1766325423b5 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -476,7 +476,7 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
}
if (nlmsg_attrlen(nlh, sizeof(*ifal))) {
- NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst");
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request");
return -EINVAL;
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1789bf99c419..c04ae282f4e4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -547,12 +547,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct net *net = sock_net(sk);
switch (cmd) {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
-
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
-
case SIOCADDRT:
case SIOCDELRT:
@@ -585,6 +579,7 @@ const struct proto_ops inet6_stream_ops = {
.getname = inet6_getname,
.poll = tcp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = inet_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
@@ -618,6 +613,7 @@ const struct proto_ops inet6_dgram_ops = {
.getname = inet6_getname,
.poll = udp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
@@ -850,6 +846,15 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+
+ /* By default, rate limit error messages.
+ * Except for pmtu discovery, it would break it.
+ * proc_do_large_bitmap needs pointer to the bitmap.
+ */
+ bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1);
+ bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1);
+ net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask;
+
net->ipv6.sysctl.flowlabel_consistency = 1;
net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
net->ipv6.sysctl.idgen_retries = 3;
@@ -917,8 +922,10 @@ static const struct ipv6_stub ipv6_stub_impl = {
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
.fib6_lookup = fib6_lookup,
- .fib6_multipath_select = fib6_multipath_select,
+ .fib6_select_path = fib6_select_path,
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
+ .fib6_nh_init = fib6_nh_init,
+ .fib6_nh_release = fib6_nh_release,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index f590446595d8..06d1b7763600 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -61,16 +61,16 @@ unsigned int fib6_rules_seq_read(struct net *net)
}
/* called with rcu lock held; no reference taken on fib6_info */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- struct fib6_info *f6i;
int err;
if (net->ipv6.fib6_has_custom_rules) {
struct fib_lookup_arg arg = {
.lookup_ptr = fib6_table_lookup,
.lookup_data = &oif,
+ .result = res,
.flags = FIB_LOOKUP_NOREF,
};
@@ -78,19 +78,15 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- if (err)
- return ERR_PTR(err);
-
- f6i = arg.result ? : net->ipv6.fib6_null_entry;
} else {
- f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
- oif, fl6, flags);
- if (!f6i || f6i == net->ipv6.fib6_null_entry)
- f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
- oif, fl6, flags);
+ err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif,
+ fl6, res, flags);
+ if (err || res->f6i == net->ipv6.fib6_null_entry)
+ err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, res, flags);
}
- return f6i;
+ return err;
}
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
@@ -98,9 +94,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
if (net->ipv6.fib6_has_custom_rules) {
+ struct fib6_result res = {};
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
.lookup_data = skb,
+ .result = &res,
.flags = FIB_LOOKUP_NOREF,
};
@@ -110,8 +108,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- if (arg.result)
- return arg.result;
+ if (res.rt6)
+ return &res.rt6->dst;
} else {
struct rt6_info *rt;
@@ -157,11 +155,11 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct fib6_result *res = arg->result;
struct flowi6 *flp6 = &flp->u.ip6;
struct net *net = rule->fr_net;
struct fib6_table *table;
- struct fib6_info *f6i;
- int err = -EAGAIN, *oif;
+ int err, *oif;
u32 tb_id;
switch (rule->action) {
@@ -182,14 +180,12 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
return -EAGAIN;
oif = (int *)arg->lookup_data;
- f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
- if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_table_lookup(net, table, *oif, flp6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
err = fib6_rule_saddr(net, rule, flags, flp6,
- fib6_info_nh_dev(f6i));
-
- if (likely(!err))
- arg->result = f6i;
- }
+ res->nh->fib_nh_dev);
+ else
+ err = -EAGAIN;
return err;
}
@@ -197,6 +193,7 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct fib6_result *res = arg->result;
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
struct fib6_table *table;
@@ -251,7 +248,7 @@ again:
discard_pkt:
dst_hold(&rt->dst);
out:
- arg->result = rt;
+ res->rt6 = rt;
return err;
}
@@ -266,9 +263,13 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
- struct rt6_info *rt = (struct rt6_info *) arg->result;
+ struct fib6_result *res = arg->result;
+ struct rt6_info *rt = res->rt6;
struct net_device *dev = NULL;
+ if (!rt)
+ return false;
+
if (rt->rt6i_idev)
dev = rt->rt6i_idev->dev;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index cc14b9998941..afb915807cd0 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -168,22 +168,21 @@ static bool is_ineligible(const struct sk_buff *skb)
return false;
}
-static bool icmpv6_mask_allow(int type)
+static bool icmpv6_mask_allow(struct net *net, int type)
{
- /* Informational messages are not limited. */
- if (type & ICMPV6_INFOMSG_MASK)
+ if (type > ICMPV6_MSG_MAX)
return true;
- /* Do not limit pmtu discovery, it would break it. */
- if (type == ICMPV6_PKT_TOOBIG)
+ /* Limit if icmp type is set in ratemask. */
+ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask))
return true;
return false;
}
-static bool icmpv6_global_allow(int type)
+static bool icmpv6_global_allow(struct net *net, int type)
{
- if (icmpv6_mask_allow(type))
+ if (icmpv6_mask_allow(net, type))
return true;
if (icmp_global_allow())
@@ -202,7 +201,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
struct dst_entry *dst;
bool res = false;
- if (icmpv6_mask_allow(type))
+ if (icmpv6_mask_allow(net, type))
return true;
/*
@@ -511,7 +510,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type))
+ if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
goto out_bh_enable;
mip6_addr_swap(skb);
@@ -731,6 +730,11 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (IS_ERR(dst))
goto out;
+ /* Check the ratelimit */
+ if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) ||
+ !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6))
+ goto out_dst_release;
+
idev = __in6_dev_get(skb->dev);
msg.skb = skb;
@@ -751,6 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
skb->len + sizeof(struct icmp6hdr));
}
+out_dst_release:
dst_release(dst);
out:
icmpv6_xmit_unlock(sk);
@@ -1137,6 +1142,13 @@ static struct ctl_table ipv6_icmp_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "ratemask",
+ .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
+ .maxlen = ICMPV6_MSG_MAX + 1,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
{ },
};
@@ -1153,6 +1165,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
+ table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
}
return table;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 8c00609a1513..a8919c217cc2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -162,7 +162,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
}
INIT_LIST_HEAD(&f6i->fib6_siblings);
- atomic_inc(&f6i->fib6_ref);
+ refcount_set(&f6i->fib6_ref, 1);
return f6i;
}
@@ -175,10 +175,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
WARN_ON(f6i->fib6_node);
bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
- if (bucket) {
- f6i->rt6i_exception_bucket = NULL;
- kfree(bucket);
- }
+ kfree(bucket);
if (f6i->rt6i_pcpu) {
int cpu;
@@ -354,10 +351,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
}
/* called with rcu lock held; no reference taken on fib6_info */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
+ res, flags);
}
static void __net_init fib6_tables_init(struct net *net)
@@ -848,8 +846,8 @@ insert_above:
RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
- atomic_inc(&rcu_dereference_protected(in->leaf,
- lockdep_is_held(&table->tb6_lock))->fib6_ref);
+ fib6_info_hold(rcu_dereference_protected(in->leaf,
+ lockdep_is_held(&table->tb6_lock)));
/* update parent pointer */
if (dir)
@@ -931,7 +929,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
{
struct fib6_table *table = rt->fib6_table;
- if (atomic_read(&rt->fib6_ref) != 1) {
+ if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
@@ -944,7 +942,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct fib6_info *new_leaf;
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
- atomic_inc(&new_leaf->fib6_ref);
+ fib6_info_hold(new_leaf);
rcu_assign_pointer(fn->leaf, new_leaf);
fib6_info_release(rt);
@@ -1110,7 +1108,7 @@ add:
return err;
rcu_assign_pointer(rt->fib6_next, iter);
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
@@ -1138,7 +1136,7 @@ add:
if (err)
return err;
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
rt->fib6_next = iter->fib6_next;
rcu_assign_pointer(*ins, rt);
@@ -1280,7 +1278,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
if (!sfn)
goto failure;
- atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
+ fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
rcu_assign_pointer(sfn->leaf,
info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
@@ -1323,7 +1321,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
rcu_assign_pointer(fn->leaf,
info->nl_net->ipv6.fib6_null_entry);
} else {
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(fn->leaf, rt);
}
}
@@ -2304,7 +2302,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->fib6_nh.fib_nh_has_gw) {
+ if (rt->fib6_nh.fib_nh_gw_family) {
flags |= RTF_GATEWAY;
seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
} else {
@@ -2313,7 +2311,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
dev = rt->fib6_nh.fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
- rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+ rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
flags, dev ? dev->name : "");
iter->w.leaf = NULL;
return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b32c95f02128..655e46b227f9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -525,10 +525,10 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
}
static int ip6erspan_rcv(struct sk_buff *skb,
- struct tnl_ptk_info *tpi)
+ struct tnl_ptk_info *tpi,
+ int gre_hdr_len)
{
struct erspan_base_hdr *ershdr;
- struct erspan_metadata *pkt_md;
const struct ipv6hdr *ipv6h;
struct erspan_md2 *md2;
struct ip6_tnl *tunnel;
@@ -547,18 +547,16 @@ static int ip6erspan_rcv(struct sk_buff *skb,
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
- ershdr = (struct erspan_base_hdr *)skb->data;
- pkt_md = (struct erspan_metadata *)(ershdr + 1);
-
if (__iptunnel_pull_header(skb, len,
htons(ETH_P_TEB),
false, false) < 0)
return PACKET_REJECT;
if (tunnel->parms.collect_md) {
+ struct erspan_metadata *pkt_md, *md;
struct metadata_dst *tun_dst;
struct ip_tunnel_info *info;
- struct erspan_metadata *md;
+ unsigned char *gh;
__be64 tun_id;
__be16 flags;
@@ -571,6 +569,14 @@ static int ip6erspan_rcv(struct sk_buff *skb,
if (!tun_dst)
return PACKET_REJECT;
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
info = &tun_dst->u.tun_info;
md = ip_tunnel_info_opts(info);
md->version = ver;
@@ -607,7 +613,7 @@ static int gre_rcv(struct sk_buff *skb)
if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
tpi.proto == htons(ETH_P_ERSPAN2))) {
- if (ip6erspan_rcv(skb, &tpi) == PACKET_RCVD)
+ if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
goto out;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e51f3c648b09..adef2236abe2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -117,7 +117,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
sock_confirm_neigh(skb, neigh);
- ret = neigh_output(neigh, skb);
+ ret = neigh_output(neigh, skb, false);
rcu_read_unlock_bh();
return ret;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 66c8b294e02b..4c8e2ea8bf19 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -77,6 +77,8 @@ static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -117,6 +119,7 @@ struct neigh_table nd_tbl = {
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
+ .allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
.tbl = &nd_tbl,
@@ -392,6 +395,20 @@ static void pndisc_destructor(struct pneigh_entry *n)
ipv6_dev_mc_dec(dev, &maddr);
}
+/* called with rtnl held */
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev || idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device");
+ return false;
+ }
+
+ return true;
+}
+
static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
int len)
{
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ddc99a1653aa..086fc669279e 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -23,14 +23,6 @@ config NF_TABLES_IPV6
if NF_TABLES_IPV6
-config NFT_CHAIN_ROUTE_IPV6
- tristate "IPv6 nf_tables route chain support"
- help
- This option enables the "route" chain for IPv6 in nf_tables. This
- chain type is used to force packet re-routing after mangling header
- fields such as the source, destination, flowlabel, hop-limit and
- the packet mark.
-
config NFT_REJECT_IPV6
select NF_REJECT_IPV6
default NFT_REJECT
@@ -278,15 +270,10 @@ if IP6_NF_NAT
config IP6_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- select NF_NAT_MASQUERADE
+ select NETFILTER_XT_TARGET_MASQUERADE
help
- Masquerading is a special case of NAT: all outgoing connections are
- changed to seem to come from a particular interface's address, and
- if the interface goes down, those connections are lost. This is
- only useful for dialup accounts with dynamic IP address (ie. your IP
- address will be different on next dialup).
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
config IP6_NF_TARGET_NPT
tristate "NPT (Network Prefix translation) target support"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 3853c648ebaa..731a74c60dca 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -27,7 +27,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
# nf_tables
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
@@ -47,7 +46,6 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o
# targets
-obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o
obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
deleted file mode 100644
index 29c7f1915a96..000000000000
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6
- * NAT funded by Astaro.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/addrconf.h>
-#include <net/ipv6.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
-
-static unsigned int
-masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
-{
- return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
-}
-
-static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_range2 *range = par->targinfo;
-
- if (range->flags & NF_NAT_RANGE_MAP_IPS)
- return -EINVAL;
- return nf_ct_netns_get(par->net, par->family);
-}
-
-static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par)
-{
- nf_ct_netns_put(par->net, par->family);
-}
-
-static struct xt_target masquerade_tg6_reg __read_mostly = {
- .name = "MASQUERADE",
- .family = NFPROTO_IPV6,
- .checkentry = masquerade_tg6_checkentry,
- .destroy = masquerade_tg6_destroy,
- .target = masquerade_tg6,
- .targetsize = sizeof(struct nf_nat_range),
- .table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
- .me = THIS_MODULE,
-};
-
-static int __init masquerade_tg6_init(void)
-{
- int err;
-
- err = xt_register_target(&masquerade_tg6_reg);
- if (err)
- return err;
-
- err = nf_nat_masquerade_ipv6_register_notifier();
- if (err)
- xt_unregister_target(&masquerade_tg6_reg);
-
- return err;
-}
-static void __exit masquerade_tg6_exit(void)
-{
- nf_nat_masquerade_ipv6_unregister_notifier();
- xt_unregister_target(&masquerade_tg6_reg);
-}
-
-module_init(masquerade_tg6_init);
-module_exit(masquerade_tg6_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("Xtables: automatic address SNAT");
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
deleted file mode 100644
index da3f1f8cb325..000000000000
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-#include <net/route.h>
-
-static unsigned int nf_route_table_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- struct nft_pktinfo pkt;
- struct in6_addr saddr, daddr;
- u_int8_t hop_limit;
- u32 mark, flowlabel;
- int err;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
-
- /* save source/dest address, mark, hoplimit, flowlabel, priority */
- memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
- memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
- mark = skb->mark;
- hop_limit = ipv6_hdr(skb)->hop_limit;
-
- /* flowlabel and prio (includes version, which shouldn't change either */
- flowlabel = *((u32 *)ipv6_hdr(skb));
-
- ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
- memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
- skb->mark != mark ||
- ipv6_hdr(skb)->hop_limit != hop_limit ||
- flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(state->net, skb);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-
- return ret;
-}
-
-static const struct nft_chain_type nft_chain_route_ipv6 = {
- .name = "route",
- .type = NFT_CHAIN_T_ROUTE,
- .family = NFPROTO_IPV6,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_LOCAL_OUT),
- .hooks = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
-};
-
-static int __init nft_chain_route_init(void)
-{
- nft_register_chain_type(&nft_chain_route_ipv6);
-
- return 0;
-}
-
-static void __exit nft_chain_route_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_route_ipv6);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5a426226c762..84dbe21b71e5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1356,6 +1356,7 @@ const struct proto_ops inet6_sockraw_ops = {
.getname = inet6_getname,
.poll = datagram_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6e89151693d0..9c0127a44f9f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -59,7 +59,7 @@
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
@@ -102,14 +102,15 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
+ int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
struct in6_addr *daddr,
struct in6_addr *saddr);
@@ -295,7 +296,7 @@ static const struct fib6_info fib6_null_entry_template = {
.fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
.fib6_protocol = RTPROT_KERNEL,
.fib6_metric = ~(u32)0,
- .fib6_ref = ATOMIC_INIT(1),
+ .fib6_ref = REFCOUNT_INIT(1),
.fib6_type = RTN_UNREACHABLE,
.fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
};
@@ -427,13 +428,15 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
}
-struct fib6_info *fib6_multipath_select(const struct net *net,
- struct fib6_info *match,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb,
- int strict)
+void fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
{
struct fib6_info *sibling, *next_sibling;
+ struct fib6_info *match = res->f6i;
+
+ if (!match->fib6_nsiblings || have_oif_match)
+ goto out;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
@@ -442,60 +445,88 @@ struct fib6_info *fib6_multipath_select(const struct net *net,
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
- return match;
+ goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
+ const struct fib6_nh *nh = &sibling->fib6_nh;
int nh_upper_bound;
- nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound);
+ nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
if (fl6->mp_hash > nh_upper_bound)
continue;
- if (rt6_score_route(sibling, oif, strict) < 0)
+ if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
break;
match = sibling;
break;
}
- return match;
+out:
+ res->f6i = match;
+ res->nh = &match->fib6_nh;
}
/*
* Route lookup. rcu_read_lock() should be held.
*/
-static inline struct fib6_info *rt6_device_match(struct net *net,
- struct fib6_info *rt,
- const struct in6_addr *saddr,
- int oif,
- int flags)
+static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
+ const struct in6_addr *saddr, int oif, int flags)
{
- struct fib6_info *sprt;
+ const struct net_device *dev;
- if (!oif && ipv6_addr_any(saddr) &&
- !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
- return rt;
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ return false;
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
- const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;
+ dev = nh->fib_nh_dev;
+ if (oif) {
+ if (dev->ifindex == oif)
+ return true;
+ } else {
+ if (ipv6_chk_addr(net, saddr, dev,
+ flags & RT6_LOOKUP_F_IFACE))
+ return true;
+ }
- if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
- continue;
+ return false;
+}
- if (oif) {
- if (dev->ifindex == oif)
- return sprt;
- } else {
- if (ipv6_chk_addr(net, saddr, dev,
- flags & RT6_LOOKUP_F_IFACE))
- return sprt;
+static void rt6_device_match(struct net *net, struct fib6_result *res,
+ const struct in6_addr *saddr, int oif, int flags)
+{
+ struct fib6_info *f6i = res->f6i;
+ struct fib6_info *spf6i;
+ struct fib6_nh *nh;
+
+ if (!oif && ipv6_addr_any(saddr)) {
+ nh = &f6i->fib6_nh;
+ if (!(nh->fib_nh_flags & RTNH_F_DEAD))
+ goto out;
+ }
+
+ for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
+ nh = &spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags)) {
+ res->f6i = spf6i;
+ goto out;
}
}
- if (oif && flags & RT6_LOOKUP_F_IFACE)
- return net->ipv6.fib6_null_entry;
+ if (oif && flags & RT6_LOOKUP_F_IFACE) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = &res->f6i->fib6_nh;
+ goto out;
+ }
- return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
+ nh = &f6i->fib6_nh;
+ if (nh->fib_nh_flags & RTNH_F_DEAD) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = &res->f6i->fib6_nh;
+ }
+out:
+ res->nh = nh;
+ res->fib6_type = res->f6i->fib6_type;
+ res->fib6_flags = res->f6i->fib6_flags;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -517,7 +548,7 @@ static void rt6_probe_deferred(struct work_struct *w)
kfree(work);
}
-static void rt6_probe(struct fib6_info *rt)
+static void rt6_probe(struct fib6_nh *fib6_nh)
{
struct __rt6_probe_work *work = NULL;
const struct in6_addr *nh_gw;
@@ -533,11 +564,11 @@ static void rt6_probe(struct fib6_info *rt)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (!rt || !rt->fib6_nh.fib_nh_has_gw)
+ if (fib6_nh->fib_nh_gw_family)
return;
- nh_gw = &rt->fib6_nh.fib_nh_gw6;
- dev = rt->fib6_nh.fib_nh_dev;
+ nh_gw = &fib6_nh->fib_nh_gw6;
+ dev = fib6_nh->fib_nh_dev;
rcu_read_lock_bh();
idev = __in6_dev_get(dev);
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
@@ -554,13 +585,13 @@ static void rt6_probe(struct fib6_info *rt)
__neigh_set_probe_once(neigh);
}
write_unlock(&neigh->lock);
- } else if (time_after(jiffies, rt->last_probe +
+ } else if (time_after(jiffies, fib6_nh->last_probe +
idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
if (work) {
- rt->last_probe = jiffies;
+ fib6_nh->last_probe = jiffies;
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
dev_hold(dev);
@@ -572,7 +603,7 @@ out:
rcu_read_unlock_bh();
}
#else
-static inline void rt6_probe(struct fib6_info *rt)
+static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
@@ -580,27 +611,14 @@ static inline void rt6_probe(struct fib6_info *rt)
/*
* Default Router Selection (RFC 2461 6.3.6)
*/
-static inline int rt6_check_dev(struct fib6_info *rt, int oif)
-{
- const struct net_device *dev = rt->fib6_nh.fib_nh_dev;
-
- if (!oif || dev->ifindex == oif)
- return 2;
- return 0;
-}
-
-static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
+static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
struct neighbour *neigh;
- if (rt->fib6_flags & RTF_NONEXTHOP ||
- !rt->fib6_nh.fib_nh_has_gw)
- return RT6_NUD_SUCCEED;
-
rcu_read_lock_bh();
- neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev,
- &rt->fib6_nh.fib_nh_gw6);
+ neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
+ &fib6_nh->fib_nh_gw6);
if (neigh) {
read_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
@@ -621,43 +639,44 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
return ret;
}
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
+ int strict)
{
- int m;
+ int m = 0;
+
+ if (!oif || nh->fib_nh_dev->ifindex == oif)
+ m = 2;
- m = rt6_check_dev(rt, oif);
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
- if (strict & RT6_LOOKUP_F_REACHABLE) {
- int n = rt6_check_neigh(rt);
+ if ((strict & RT6_LOOKUP_F_REACHABLE) &&
+ !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
+ int n = rt6_check_neigh(nh);
if (n < 0)
return n;
}
return m;
}
-static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
- int *mpri, struct fib6_info *match,
- bool *do_rr)
+static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
+ int oif, int strict, int *mpri, bool *do_rr)
{
- int m;
bool match_do_rr = false;
+ bool rc = false;
+ int m;
- if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
goto out;
- if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) &&
- rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
+ if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
+ nh->fib_nh_flags & RTNH_F_LINKDOWN &&
!(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
- if (fib6_check_expired(rt))
- goto out;
-
- m = rt6_score_route(rt, oif, strict);
+ m = rt6_score_route(nh, fib6_flags, oif, strict);
if (m == RT6_NUD_FAIL_DO_RR) {
match_do_rr = true;
m = 0; /* lowest valid score */
@@ -666,67 +685,82 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
}
if (strict & RT6_LOOKUP_F_REACHABLE)
- rt6_probe(rt);
+ rt6_probe(nh);
/* note that m can be RT6_NUD_FAIL_PROBE at this point */
if (m > *mpri) {
*do_rr = match_do_rr;
*mpri = m;
- match = rt;
+ rc = true;
}
out:
- return match;
+ return rc;
}
-static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
- struct fib6_info *leaf,
- struct fib6_info *rr_head,
- u32 metric, int oif, int strict,
- bool *do_rr)
+static void __find_rr_leaf(struct fib6_info *f6i_start,
+ struct fib6_info *nomatch, u32 metric,
+ struct fib6_result *res, struct fib6_info **cont,
+ int oif, int strict, bool *do_rr, int *mpri)
{
- struct fib6_info *rt, *match, *cont;
- int mpri = -1;
+ struct fib6_info *f6i;
- match = NULL;
- cont = NULL;
- for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
- if (rt->fib6_metric != metric) {
- cont = rt;
- break;
+ for (f6i = f6i_start;
+ f6i && f6i != nomatch;
+ f6i = rcu_dereference(f6i->fib6_next)) {
+ struct fib6_nh *nh;
+
+ if (cont && f6i->fib6_metric != metric) {
+ *cont = f6i;
+ return;
}
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
- }
+ if (fib6_check_expired(f6i))
+ continue;
- for (rt = leaf; rt && rt != rr_head;
- rt = rcu_dereference(rt->fib6_next)) {
- if (rt->fib6_metric != metric) {
- cont = rt;
- break;
+ nh = &f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
+ res->f6i = f6i;
+ res->nh = nh;
+ res->fib6_flags = f6i->fib6_flags;
+ res->fib6_type = f6i->fib6_type;
}
-
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
}
+}
+
+static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
+ struct fib6_info *rr_head, int oif, int strict,
+ bool *do_rr, struct fib6_result *res)
+{
+ u32 metric = rr_head->fib6_metric;
+ struct fib6_info *cont = NULL;
+ int mpri = -1;
- if (match || !cont)
- return match;
+ __find_rr_leaf(rr_head, NULL, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
- for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ __find_rr_leaf(leaf, rr_head, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
- return match;
+ if (res->f6i || !cont)
+ return;
+
+ __find_rr_leaf(cont, NULL, metric, res, NULL,
+ oif, strict, do_rr, &mpri);
}
-static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
- int oif, int strict)
+static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
+ struct fib6_result *res, int strict)
{
struct fib6_info *leaf = rcu_dereference(fn->leaf);
- struct fib6_info *match, *rt0;
+ struct fib6_info *rt0;
bool do_rr = false;
int key_plen;
+ /* make sure this function or its helpers sets f6i */
+ res->f6i = NULL;
+
if (!leaf || leaf == net->ipv6.fib6_null_entry)
- return net->ipv6.fib6_null_entry;
+ goto out;
rt0 = rcu_dereference(fn->rr_ptr);
if (!rt0)
@@ -743,11 +777,9 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
key_plen = rt0->fib6_src.plen;
#endif
if (fn->fn_bit != key_plen)
- return net->ipv6.fib6_null_entry;
-
- match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
- &do_rr);
+ goto out;
+ find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
if (do_rr) {
struct fib6_info *next = rcu_dereference(rt0->fib6_next);
@@ -764,12 +796,19 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
}
}
- return match ? match : net->ipv6.fib6_null_entry;
+out:
+ if (!res->f6i) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ res->nh = &res->f6i->fib6_nh;
+ res->fib6_flags = res->f6i->fib6_flags;
+ res->fib6_type = res->f6i->fib6_type;
+ }
}
-static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
+static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
- return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
+ return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
+ res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -853,17 +892,17 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
*/
/* called with rcu_lock held */
-static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
+static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
- struct net_device *dev = rt->fib6_nh.fib_nh_dev;
+ struct net_device *dev = res->nh->fib_nh_dev;
- if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+ if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
/* for copies of local routes, dst->dev needs to be the
* device if it is a master device, the master device if
* device is enslaved, and the loopback as the default
*/
if (netif_is_l3_slave(dev) &&
- !rt6_need_strict(&rt->fib6_dst.addr))
+ !rt6_need_strict(&res->f6i->fib6_dst.addr))
dev = l3mdev_master_dev_rcu(dev);
else if (!netif_is_l3_master(dev))
dev = dev_net(dev)->loopback_dev;
@@ -909,11 +948,11 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
return flags;
}
-static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
- rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
+ rt->dst.error = ip6_rt_type_to_error(fib6_type);
- switch (ort->fib6_type) {
+ switch (fib6_type) {
case RTN_BLACKHOLE:
rt->dst.output = dst_discard_out;
rt->dst.input = dst_discard;
@@ -931,26 +970,28 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
}
}
-static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
+static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
- if (ort->fib6_flags & RTF_REJECT) {
- ip6_rt_init_dst_reject(rt, ort);
+ struct fib6_info *f6i = res->f6i;
+
+ if (res->fib6_flags & RTF_REJECT) {
+ ip6_rt_init_dst_reject(rt, res->fib6_type);
return;
}
rt->dst.error = 0;
rt->dst.output = ip6_output;
- if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
+ if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
rt->dst.input = ip6_input;
- } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
+ } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
rt->dst.input = ip6_mc_input;
} else {
rt->dst.input = ip6_forward;
}
- if (ort->fib6_nh.fib_nh_lws) {
- rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
+ if (res->nh->fib_nh_lws) {
+ rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
lwtunnel_set_redirect(&rt->dst);
}
@@ -965,23 +1006,25 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
-/* Caller must already hold reference to @ort */
-static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
+/* Caller must already hold reference to f6i in result */
+static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
- struct net_device *dev = fib6_info_nh_dev(ort);
+ const struct fib6_nh *nh = res->nh;
+ const struct net_device *dev = nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
- ip6_rt_init_dst(rt, ort);
+ ip6_rt_init_dst(rt, res);
- rt->rt6i_dst = ort->fib6_dst;
+ rt->rt6i_dst = f6i->fib6_dst;
rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
- rt->rt6i_flags = ort->fib6_flags;
- if (ort->fib6_nh.fib_nh_has_gw) {
- rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
+ rt->rt6i_flags = res->fib6_flags;
+ if (nh->fib_nh_gw_family) {
+ rt->rt6i_gateway = nh->fib_nh_gw6;
rt->rt6i_flags |= RTF_GATEWAY;
}
- rt6_set_from(rt, ort);
+ rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
- rt->rt6i_src = ort->fib6_src;
+ rt->rt6i_src = f6i->fib6_src;
#endif
}
@@ -1020,22 +1063,24 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
}
/* called with rcu_lock held */
-static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
+static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
- unsigned short flags = fib6_info_dst_flags(rt);
- struct net_device *dev = rt->fib6_nh.fib_nh_dev;
+ struct net_device *dev = res->nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags;
struct rt6_info *nrt;
- if (!fib6_info_hold_safe(rt))
+ if (!fib6_info_hold_safe(f6i))
goto fallback;
+ flags = fib6_info_dst_flags(f6i);
nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
if (!nrt) {
- fib6_info_release(rt);
+ fib6_info_release(f6i);
goto fallback;
}
- ip6_rt_copy_init(nrt, rt);
+ ip6_rt_copy_init(nrt, res);
return nrt;
fallback:
@@ -1050,7 +1095,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
const struct sk_buff *skb,
int flags)
{
- struct fib6_info *f6i;
+ struct fib6_result res = {};
struct fib6_node *fn;
struct rt6_info *rt;
@@ -1060,37 +1105,38 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- f6i = rcu_dereference(fn->leaf);
- if (!f6i) {
- f6i = net->ipv6.fib6_null_entry;
- } else {
- f6i = rt6_device_match(net, f6i, &fl6->saddr,
- fl6->flowi6_oif, flags);
- if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
- f6i = fib6_multipath_select(net, f6i, fl6,
- fl6->flowi6_oif, skb,
- flags);
- }
- if (f6i == net->ipv6.fib6_null_entry) {
+ res.f6i = rcu_dereference(fn->leaf);
+ if (!res.f6i)
+ res.f6i = net->ipv6.fib6_null_entry;
+ else
+ rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
+ flags);
+
+ if (res.f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
+
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ goto out;
}
- trace_fib6_table_lookup(net, f6i, table, fl6);
+ fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
+ fl6->flowi6_oif != 0, skb, flags);
/* Search through exception table */
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
- } else if (f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
} else {
- rt = ip6_create_rt_rcu(f6i);
+ rt = ip6_create_rt_rcu(&res);
}
+out:
+ trace_fib6_table_lookup(net, &res, table, fl6);
+
rcu_read_unlock();
return rt;
@@ -1156,10 +1202,11 @@ int ip6_ins_rt(struct net *net, struct fib6_info *rt)
return __ip6_ins_rt(rt, &info, NULL);
}
-static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
+static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
+ struct fib6_info *f6i = res->f6i;
struct net_device *dev;
struct rt6_info *rt;
@@ -1167,25 +1214,25 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
* Clone the route.
*/
- if (!fib6_info_hold_safe(ort))
+ if (!fib6_info_hold_safe(f6i))
return NULL;
- dev = ip6_rt_get_dev_rcu(ort);
+ dev = ip6_rt_get_dev_rcu(res);
rt = ip6_dst_alloc(dev_net(dev), dev, 0);
if (!rt) {
- fib6_info_release(ort);
+ fib6_info_release(f6i);
return NULL;
}
- ip6_rt_copy_init(rt, ort);
+ ip6_rt_copy_init(rt, res);
rt->rt6i_flags |= RTF_CACHE;
rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
- if (!rt6_is_gw_or_nonexthop(ort)) {
- if (ort->fib6_dst.plen != 128 &&
- ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
+ if (!rt6_is_gw_or_nonexthop(res)) {
+ if (f6i->fib6_dst.plen != 128 &&
+ ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
@@ -1198,34 +1245,35 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
return rt;
}
-static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
+static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
- unsigned short flags = fib6_info_dst_flags(rt);
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags = fib6_info_dst_flags(f6i);
struct net_device *dev;
struct rt6_info *pcpu_rt;
- if (!fib6_info_hold_safe(rt))
+ if (!fib6_info_hold_safe(f6i))
return NULL;
rcu_read_lock();
- dev = ip6_rt_get_dev_rcu(rt);
+ dev = ip6_rt_get_dev_rcu(res);
pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
rcu_read_unlock();
if (!pcpu_rt) {
- fib6_info_release(rt);
+ fib6_info_release(f6i);
return NULL;
}
- ip6_rt_copy_init(pcpu_rt, rt);
+ ip6_rt_copy_init(pcpu_rt, res);
pcpu_rt->rt6i_flags |= RTF_PCPU;
return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
-static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
+static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
struct rt6_info *pcpu_rt, **p;
- p = this_cpu_ptr(rt->rt6i_pcpu);
+ p = this_cpu_ptr(res->f6i->rt6i_pcpu);
pcpu_rt = *p;
if (pcpu_rt)
@@ -1235,18 +1283,18 @@ static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
- struct fib6_info *rt)
+ const struct fib6_result *res)
{
struct rt6_info *pcpu_rt, *prev, **p;
- pcpu_rt = ip6_rt_pcpu_alloc(rt);
+ pcpu_rt = ip6_rt_pcpu_alloc(res);
if (!pcpu_rt) {
dst_hold(&net->ipv6.ip6_null_entry->dst);
return net->ipv6.ip6_null_entry;
}
dst_hold(&pcpu_rt->dst);
- p = this_cpu_ptr(rt->rt6i_pcpu);
+ p = this_cpu_ptr(res->f6i->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
@@ -1389,14 +1437,15 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
return NULL;
}
-static unsigned int fib6_mtu(const struct fib6_info *rt)
+static unsigned int fib6_mtu(const struct fib6_result *res)
{
+ const struct fib6_nh *nh = res->nh;
unsigned int mtu;
- if (rt->fib6_pmtu) {
- mtu = rt->fib6_pmtu;
+ if (res->f6i->fib6_pmtu) {
+ mtu = res->f6i->fib6_pmtu;
} else {
- struct net_device *dev = fib6_info_nh_dev(rt);
+ struct net_device *dev = nh->fib_nh_dev;
struct inet6_dev *idev;
rcu_read_lock();
@@ -1407,26 +1456,27 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
- return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
- struct fib6_info *ort)
+ const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
+ struct fib6_info *f6i = res->f6i;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
- if (ort->exception_bucket_flushed) {
+ if (f6i->exception_bucket_flushed) {
err = -EINVAL;
goto out;
}
- bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+ bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
@@ -1435,24 +1485,24 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
- rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+ rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
}
#ifdef CONFIG_IPV6_SUBTREES
- /* rt6i_src.plen != 0 indicates ort is in subtree
+ /* fib6_src.plen != 0 indicates f6i is in subtree
* and exception table is indexed by a hash of
- * both rt6i_dst and rt6i_src.
+ * both fib6_dst and fib6_src.
* Otherwise, the exception table is indexed by
- * a hash of only rt6i_dst.
+ * a hash of only fib6_dst.
*/
- if (ort->fib6_src.plen)
+ if (f6i->fib6_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
- /* rt6_mtu_change() might lower mtu on ort.
+ /* rt6_mtu_change() might lower mtu on f6i.
* Only insert this exception route if its mtu
- * is less than ort's mtu value.
+ * is less than f6i's mtu value.
*/
- if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
+ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
err = -EINVAL;
goto out;
}
@@ -1481,9 +1531,9 @@ out:
/* Update fn->fn_sernum to invalidate all cached dst */
if (!err) {
- spin_lock_bh(&ort->fib6_table->tb6_lock);
- fib6_update_sernum(net, ort);
- spin_unlock_bh(&ort->fib6_table->tb6_lock);
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
@@ -1520,33 +1570,33 @@ out:
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
struct in6_addr *daddr,
struct in6_addr *saddr)
{
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct rt6_info *res = NULL;
+ struct rt6_info *ret = NULL;
- bucket = rcu_dereference(rt->rt6i_exception_bucket);
+ bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
#ifdef CONFIG_IPV6_SUBTREES
- /* rt6i_src.plen != 0 indicates rt is in subtree
+ /* fib6i_src.plen != 0 indicates f6i is in subtree
* and exception table is indexed by a hash of
- * both rt6i_dst and rt6i_src.
+ * both fib6_dst and fib6_src.
* Otherwise, the exception table is indexed by
- * a hash of only rt6i_dst.
+ * a hash of only fib6_dst.
*/
- if (rt->fib6_src.plen)
+ if (res->f6i->fib6_src.plen)
src_key = saddr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
- res = rt6_ex->rt6i;
+ ret = rt6_ex->rt6i;
- return res;
+ return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */
@@ -1794,11 +1844,10 @@ void rt6_age_exceptions(struct fib6_info *rt,
}
/* must be called with rcu lock held */
-struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6, int strict)
+int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
+ struct flowi6 *fl6, struct fib6_result *res, int strict)
{
struct fib6_node *fn, *saved_fn;
- struct fib6_info *f6i;
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
@@ -1807,8 +1856,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
oif = 0;
redo_rt6_select:
- f6i = rt6_select(net, fn, oif, strict);
- if (f6i == net->ipv6.fib6_null_entry) {
+ rt6_select(net, fn, oif, res, strict);
+ if (res->f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto redo_rt6_select;
@@ -1820,16 +1869,16 @@ redo_rt6_select:
}
}
- trace_fib6_table_lookup(net, f6i, table, fl6);
+ trace_fib6_table_lookup(net, res, table, fl6);
- return f6i;
+ return 0;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6,
const struct sk_buff *skb, int flags)
{
- struct fib6_info *f6i;
+ struct fib6_result res = {};
struct rt6_info *rt;
int strict = 0;
@@ -1840,19 +1889,18 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
- f6i = fib6_table_lookup(net, table, oif, fl6, strict);
- if (f6i->fib6_nsiblings)
- f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
-
- if (f6i == net->ipv6.fib6_null_entry) {
+ fib6_table_lookup(net, table, oif, fl6, &res, strict);
+ if (res.f6i == net->ipv6.fib6_null_entry) {
rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
dst_hold(&rt->dst);
return rt;
}
+ fib6_select_path(net, &res, fl6, oif, false, skb, strict);
+
/*Search through exception table */
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
@@ -1860,7 +1908,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_unlock();
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
- !f6i->fib6_nh.fib_nh_has_gw)) {
+ !res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
@@ -1868,7 +1916,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
*/
struct rt6_info *uncached_rt;
- uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
+ uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
rcu_read_unlock();
@@ -1890,10 +1938,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
struct rt6_info *pcpu_rt;
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(f6i);
+ pcpu_rt = rt6_get_pcpu_route(&res);
if (!pcpu_rt)
- pcpu_rt = rt6_make_pcpu_route(net, f6i);
+ pcpu_rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
rcu_read_unlock();
@@ -2312,15 +2360,23 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6->rt6i_flags & RTF_CACHE)
rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
- struct fib6_info *from;
+ struct fib6_result res = {};
struct rt6_info *nrt6;
rcu_read_lock();
- from = rcu_dereference(rt6->from);
- nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
+ res.f6i = rcu_dereference(rt6->from);
+ if (!res.f6i) {
+ rcu_read_unlock();
+ return;
+ }
+ res.nh = &res.f6i->fib6_nh;
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+
+ nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
- if (rt6_insert_exception(nrt6, from))
+ if (rt6_insert_exception(nrt6, &res))
dst_release_immediate(&nrt6->dst);
}
rcu_read_unlock();
@@ -2393,6 +2449,36 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
NULL);
}
+static bool ip6_redirect_nh_match(const struct fib6_result *res,
+ struct flowi6 *fl6,
+ const struct in6_addr *gw,
+ struct rt6_info **ret)
+{
+ const struct fib6_nh *nh = res->nh;
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
+ fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
+ return false;
+
+ /* rt_cache's gateway might be different from its 'parent'
+ * in the case of an ip redirect.
+ * So we keep searching in the exception table if the gateway
+ * is different.
+ */
+ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
+ if (rt_cache &&
+ ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
+ *ret = rt_cache;
+ return true;
+ }
+ return false;
+ }
+ return true;
+}
+
/* Handle redirects */
struct ip6rd_flowi {
struct flowi6 fl6;
@@ -2406,7 +2492,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
- struct rt6_info *ret = NULL, *rt_cache;
+ struct rt6_info *ret = NULL;
+ struct fib6_result res = {};
struct fib6_info *rt;
struct fib6_node *fn;
@@ -2424,34 +2511,15 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
- continue;
+ res.f6i = rt;
+ res.nh = &rt->fib6_nh;
+
if (fib6_check_expired(rt))
continue;
if (rt->fib6_flags & RTF_REJECT)
break;
- if (!rt->fib6_nh.fib_nh_has_gw)
- continue;
- if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
- continue;
- /* rt_cache's gateway might be different from its 'parent'
- * in the case of an ip redirect.
- * So we keep searching in the exception table if the gateway
- * is different.
- */
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
- rt_cache = rt6_find_cached_rt(rt,
- &fl6->daddr,
- &fl6->saddr);
- if (rt_cache &&
- ipv6_addr_equal(&rdfl->gateway,
- &rt_cache->rt6i_gateway)) {
- ret = rt_cache;
- break;
- }
- continue;
- }
- break;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
+ goto out;
}
if (!rt)
@@ -2467,15 +2535,20 @@ restart:
goto restart;
}
+ res.f6i = rt;
+ res.nh = &rt->fib6_nh;
out:
- if (ret)
+ if (ret) {
ip6_hold_safe(net, &ret);
- else
- ret = ip6_create_rt_rcu(rt);
+ } else {
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ ret = ip6_create_rt_rcu(&res);
+ }
rcu_read_unlock();
- trace_fib6_table_lookup(net, rt, table, fl6);
+ trace_fib6_table_lookup(net, &res, table, fl6);
return ret;
};
@@ -2593,12 +2666,15 @@ out:
* based on ip6_dst_mtu_forward and exception logic of
* rt6_find_cached_rt; called with rcu_read_lock
*/
-u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
- struct in6_addr *saddr)
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct rt6_exception_bucket *bucket;
+ const struct fib6_nh *nh = res->nh;
+ struct fib6_info *f6i = res->f6i;
+ const struct in6_addr *src_key;
struct rt6_exception *rt6_ex;
- struct in6_addr *src_key;
struct inet6_dev *idev;
u32 mtu = 0;
@@ -2620,7 +2696,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
if (likely(!mtu)) {
- struct net_device *dev = fib6_info_nh_dev(f6i);
+ struct net_device *dev = nh->fib_nh_dev;
mtu = IPV6_MIN_MTU;
idev = __in6_dev_get(dev);
@@ -2630,7 +2706,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
- return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
@@ -2964,7 +3040,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
goto out;
fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
- fib6_nh->fib_nh_has_gw = 1;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
}
err = -ENODEV;
@@ -3282,9 +3358,13 @@ static int ip6_route_del(struct fib6_config *cfg,
struct fib6_nh *nh;
if (cfg->fc_flags & RTF_CACHE) {
+ struct fib6_result res = {
+ .f6i = rt,
+ };
int rc;
- rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
+ rt_cache = rt6_find_cached_rt(&res,
+ &cfg->fc_dst,
&cfg->fc_src);
if (rt_cache) {
rc = ip6_del_cached_rt(rt_cache, cfg);
@@ -3328,10 +3408,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
{
struct netevent_redirect netevent;
struct rt6_info *rt, *nrt = NULL;
+ struct fib6_result res = {};
struct ndisc_options ndopts;
struct inet6_dev *in6_dev;
struct neighbour *neigh;
- struct fib6_info *from;
struct rd_msg *msg;
int optlen, on_link;
u8 *lladdr;
@@ -3414,14 +3494,17 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NDISC_REDIRECT, &ndopts);
rcu_read_lock();
- from = rcu_dereference(rt->from);
+ res.f6i = rcu_dereference(rt->from);
/* This fib6_info_hold() is safe here because we hold reference to rt
* and rt already holds reference to fib6_info.
*/
- fib6_info_hold(from);
+ fib6_info_hold(res.f6i);
rcu_read_unlock();
- nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
+ res.nh = &res.f6i->fib6_nh;
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -3435,7 +3518,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* a cached route because rt6_insert_exception() will
* takes care of it
*/
- if (rt6_insert_exception(nrt, from)) {
+ if (rt6_insert_exception(nrt, &res)) {
dst_release_immediate(&nrt->dst);
goto out;
}
@@ -3447,7 +3530,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
out:
- fib6_info_release(from);
+ fib6_info_release(res.f6i);
neigh_release(neigh);
}
@@ -3476,7 +3559,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
continue;
if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
- !rt->fib6_nh.fib_nh_has_gw)
+ !rt->fib6_nh.fib_nh_gw_family)
continue;
if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
continue;
@@ -3807,7 +3890,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
struct in6_addr *gateway = (struct in6_addr *)arg;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- rt->fib6_nh.fib_nh_has_gw &&
+ rt->fib6_nh.fib_nh_gw_family &&
ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
return -1;
}
@@ -3829,7 +3912,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
struct arg_netdev_event {
const struct net_device *dev;
union {
- unsigned int nh_flags;
+ unsigned char nh_flags;
unsigned long event;
};
};
@@ -3942,7 +4025,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
return 0;
}
-void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct arg_netdev_event arg = {
.dev = dev,
@@ -3999,7 +4082,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
const struct net_device *dev,
- unsigned int nh_flags)
+ unsigned char nh_flags)
{
struct fib6_info *iter;
@@ -4711,9 +4794,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
nla_nest_end(skb, mp);
} else {
+ unsigned char nh_flags = 0;
+
if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
- &rtm->rtm_flags, false) < 0)
+ &nh_flags, false) < 0)
goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
}
if (rt6_flags & RTF_EXPIRES) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b444483cdb2b..2464fba569b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -285,8 +285,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
+ int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
struct udp_mib __percpu *mib;
bool checksum_valid = false;
@@ -299,9 +298,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
- peeking = flags & MSG_PEEK;
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
if (!skb)
return err;
@@ -340,14 +338,14 @@ try_again:
goto csum_copy_err;
}
if (unlikely(err)) {
- if (!peeked) {
+ if (!peeking) {
atomic_inc(&sk->sk_drops);
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
return err;
}
- if (!peeked)
+ if (!peeking)
SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
sock_recv_ts_and_drops(msg, sk, skb);
@@ -1047,6 +1045,8 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
/* The following checks are replicated from __ip6_datagram_connect()
* and intended to prevent BPF program called below from accessing
* bytes that are out of the bound specified by user in addr_len.
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index d4c60523c549..2cac910c1cd4 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -618,6 +618,7 @@ static const struct proto_ops l2tp_ip_ops = {
.getname = l2tp_ip_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 37a69df17cab..4ec546cc1dd6 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -752,6 +752,7 @@ static const struct proto_ops l2tp_ip6_ops = {
.getname = l2tp_ip6_getname,
.poll = datagram_poll,
.ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 04d9946dcdba..f36cae785e82 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1070,7 +1070,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
{
struct pppol2tp_ioc_stats stats;
struct l2tp_session *session;
- int val;
switch (cmd) {
case PPPIOCGMRU:
@@ -1097,7 +1096,7 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
- if (get_user(val, (int __user *)arg))
+ if (!access_ok((int __user *)arg, sizeof(int)))
return -EFAULT;
break;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index b99e73a7e7e0..2017b7d780f5 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -320,14 +320,13 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
struct llc_sap *sap;
int rc = -EINVAL;
- dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
-
lock_sock(sk);
if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
goto out;
rc = -EAFNOSUPPORT;
if (unlikely(addr->sllc_family != AF_LLC))
goto out;
+ dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
rc = -ENODEV;
rcu_read_lock();
if (sk->sk_bound_dev_if) {
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 62edfa6a73ed..c2d8b5451a5e 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1200,6 +1200,9 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif);
+ if (local->in_reconfig)
+ return;
+
if (!check_sdata_in_driver(sdata))
return;
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 42d52cded4c1..20bf9db7a388 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -173,8 +173,10 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
* The driver doesn't know anything about VLAN interfaces.
* Hence, don't send GTKs for VLAN interfaces to the driver.
*/
- if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE))
+ if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) {
+ ret = 1;
goto out_unsupported;
+ }
}
ret = drv_set_key(key->local, SET_KEY, sdata,
@@ -219,11 +221,8 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
/* all of these we can do in software - if driver can */
if (ret == 1)
return 0;
- if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) {
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
- return 0;
+ if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL))
return -EINVAL;
- }
return 0;
default:
return -EINVAL;
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index a805d2acf0f7..796b069ad251 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -23,7 +23,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath);
static u32 mesh_table_hash(const void *addr, u32 len, u32 seed)
{
/* Use last four bytes of hw addr as hash index */
- return jhash_1word(*(u32 *)(addr+2), seed);
+ return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed);
}
static const struct rhashtable_params mesh_rht_params = {
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4a03c18b39a8..25577ede2986 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1588,7 +1588,15 @@ static void sta_ps_start(struct sta_info *sta)
return;
for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
- if (txq_has_queue(sta->sta.txq[tid]))
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi = to_txq_info(txq);
+
+ spin_lock(&local->active_txq_lock[txq->ac]);
+ if (!list_empty(&txqi->schedule_order))
+ list_del_init(&txqi->schedule_order);
+ spin_unlock(&local->active_txq_lock[txq->ac]);
+
+ if (txq_has_queue(txq))
set_bit(tid, &sta->txq_buffered_tids);
else
clear_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h
index 366b9e6f043e..40141df09f25 100644
--- a/net/mac80211/trace_msg.h
+++ b/net/mac80211/trace_msg.h
@@ -1,4 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions of this file
+ * Copyright (C) 2019 Intel Corporation
+ */
+
#ifdef CONFIG_MAC80211_MESSAGE_TRACING
#if !defined(__MAC80211_MSG_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
@@ -11,7 +16,7 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM mac80211_msg
-#define MAX_MSG_LEN 100
+#define MAX_MSG_LEN 120
DECLARE_EVENT_CLASS(mac80211_msg_event,
TP_PROTO(struct va_format *vaf),
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8037384fc06e..dd220b977025 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3228,6 +3228,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
u8 max_subframes = sta->sta.max_amsdu_subframes;
int max_frags = local->hw.max_tx_fragments;
int max_amsdu_len = sta->sta.max_amsdu_len;
+ int orig_truesize;
u32 flow_idx;
__be16 len;
void *data;
@@ -3272,6 +3273,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
if (!head || skb_is_gso(head))
goto out;
+ orig_truesize = head->truesize;
orig_len = head->len;
if (skb->len + head->len > max_amsdu_len)
@@ -3329,6 +3331,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
*frag_tail = skb;
out_recalc:
+ fq->memory_usage += head->truesize - orig_truesize;
if (head->len != orig_len) {
flow->backlog += head->len - orig_len;
tin->backlog_bytes += head->len - orig_len;
@@ -3669,16 +3672,17 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue);
struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
{
struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_txq *ret = NULL;
struct txq_info *txqi = NULL;
- lockdep_assert_held(&local->active_txq_lock[ac]);
+ spin_lock_bh(&local->active_txq_lock[ac]);
begin:
txqi = list_first_entry_or_null(&local->active_txqs[ac],
struct txq_info,
schedule_order);
if (!txqi)
- return NULL;
+ goto out;
if (txqi->txq.sta) {
struct sta_info *sta = container_of(txqi->txq.sta,
@@ -3695,24 +3699,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
if (txqi->schedule_round == local->schedule_round[ac])
- return NULL;
+ goto out;
list_del_init(&txqi->schedule_order);
txqi->schedule_round = local->schedule_round[ac];
- return &txqi->txq;
+ ret = &txqi->txq;
+
+out:
+ spin_unlock_bh(&local->active_txq_lock[ac]);
+ return ret;
}
EXPORT_SYMBOL(ieee80211_next_txq);
-void ieee80211_return_txq(struct ieee80211_hw *hw,
- struct ieee80211_txq *txq)
+void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
+ struct ieee80211_txq *txq,
+ bool force)
{
struct ieee80211_local *local = hw_to_local(hw);
struct txq_info *txqi = to_txq_info(txq);
- lockdep_assert_held(&local->active_txq_lock[txq->ac]);
+ spin_lock_bh(&local->active_txq_lock[txq->ac]);
if (list_empty(&txqi->schedule_order) &&
- (!skb_queue_empty(&txqi->frags) || txqi->tin.backlog_packets)) {
+ (force || !skb_queue_empty(&txqi->frags) ||
+ txqi->tin.backlog_packets)) {
/* If airtime accounting is active, always enqueue STAs at the
* head of the list to ensure that they only get moved to the
* back by the airtime DRR scheduler once they have a negative
@@ -3729,20 +3739,10 @@ void ieee80211_return_txq(struct ieee80211_hw *hw,
list_add_tail(&txqi->schedule_order,
&local->active_txqs[txq->ac]);
}
-}
-EXPORT_SYMBOL(ieee80211_return_txq);
-void ieee80211_schedule_txq(struct ieee80211_hw *hw,
- struct ieee80211_txq *txq)
- __acquires(txq_lock) __releases(txq_lock)
-{
- struct ieee80211_local *local = hw_to_local(hw);
-
- spin_lock_bh(&local->active_txq_lock[txq->ac]);
- ieee80211_return_txq(hw, txq);
spin_unlock_bh(&local->active_txq_lock[txq->ac]);
}
-EXPORT_SYMBOL(ieee80211_schedule_txq);
+EXPORT_SYMBOL(__ieee80211_schedule_txq);
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
struct ieee80211_txq *txq)
@@ -3752,7 +3752,7 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
struct sta_info *sta;
u8 ac = txq->ac;
- lockdep_assert_held(&local->active_txq_lock[ac]);
+ spin_lock_bh(&local->active_txq_lock[ac]);
if (!txqi->txq.sta)
goto out;
@@ -3782,34 +3782,27 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
sta->airtime[ac].deficit += sta->airtime_weight;
list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]);
+ spin_unlock_bh(&local->active_txq_lock[ac]);
return false;
out:
if (!list_empty(&txqi->schedule_order))
list_del_init(&txqi->schedule_order);
+ spin_unlock_bh(&local->active_txq_lock[ac]);
return true;
}
EXPORT_SYMBOL(ieee80211_txq_may_transmit);
void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac)
- __acquires(txq_lock)
{
struct ieee80211_local *local = hw_to_local(hw);
spin_lock_bh(&local->active_txq_lock[ac]);
local->schedule_round[ac]++;
-}
-EXPORT_SYMBOL(ieee80211_txq_schedule_start);
-
-void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
- __releases(txq_lock)
-{
- struct ieee80211_local *local = hw_to_local(hw);
-
spin_unlock_bh(&local->active_txq_lock[ac]);
}
-EXPORT_SYMBOL(ieee80211_txq_schedule_end);
+EXPORT_SYMBOL(ieee80211_txq_schedule_start);
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev,
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8120e04f15e4..e321a5fafb87 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -23,7 +23,7 @@
#include <net/ipv6.h>
#endif
#include <net/ipv6_stubs.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#include "internal.h"
/* max memory we will use for mpls_route */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index f3a8557494d6..2619c2fbea93 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -137,10 +137,14 @@ static int mpls_xmit(struct sk_buff *skb)
mpls_stats_inc_outucastpkts(out_dev, skb);
- if (rt)
- err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
- skb);
- else if (rt6) {
+ if (rt) {
+ if (rt->rt_gw_family == AF_INET)
+ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4,
+ skb);
+ else if (rt->rt_gw_family == AF_INET6)
+ err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6,
+ skb);
+ } else if (rt6) {
if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) {
/* 6PE (RFC 4798) */
err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3],
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index dc07fcc7938e..802db01e3075 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/ncsi.h>
@@ -667,7 +668,10 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN);
/* Increase mac address by 1 for BMC's address */
- saddr.sa_data[ETH_ALEN - 1]++;
+ eth_addr_inc((u8 *)saddr.sa_data);
+ if (!is_valid_ether_addr((const u8 *)saddr.sa_data))
+ return -ENXIO;
+
ret = ops->ndo_set_mac_address(ndev, &saddr);
if (ret < 0)
netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 6548271209a0..02b281d3c167 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -404,11 +404,6 @@ config NF_NAT
forms of full Network Address Port Translation. This can be
controlled by iptables, ip6tables or nft.
-config NF_NAT_NEEDED
- bool
- depends on NF_NAT
- default y
-
config NF_NAT_AMANDA
tristate
depends on NF_CONNTRACK && NF_NAT
@@ -1002,6 +997,20 @@ config NETFILTER_XT_TARGET_REDIRECT
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ depends on NF_NAT
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT_MASQUERADE
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4894a85cdd0b..72cca6b48960 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -77,7 +77,8 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
+ nft_chain_route.o
nf_tables_set-objs := nf_tables_set_core.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
@@ -147,6 +148,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_MASQUERADE) += xt_MASQUERADE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 93aaec3a54ec..71f06900473e 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
+#include <net/netfilter/nf_queue.h>
#include <net/sock.h>
#include "nf_internals.h"
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 43bbaa32b1d6..14457551bcb4 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
if (!cp) {
int v;
- if (!sysctl_schedule_icmp(ipvs))
+ if (ipip || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 4b933669fd83..ab119a7540db 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -831,6 +831,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
+ /* set the tunnel info */
+ dest->tun_type = udest->tun_type;
+ dest->tun_port = udest->tun_port;
+
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
@@ -987,6 +991,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}
+ if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
@@ -1051,6 +1062,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}
+ if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
@@ -2333,6 +2351,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
udest->u_threshold = udest_compat->u_threshold;
udest->l_threshold = udest_compat->l_threshold;
udest->af = AF_INET;
+ udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
}
static int
@@ -2890,6 +2909,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
+ [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
};
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
@@ -3193,6 +3214,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
IP_VS_CONN_F_FWD_MASK)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
atomic_read(&dest->weight)) ||
+ nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
+ dest->tun_type) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
+ dest->tun_port) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3315,12 +3340,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
- *nla_l_thresh;
+ *nla_l_thresh, *nla_tun_type, *nla_tun_port;
nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
+ nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
+ nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
@@ -3330,6 +3357,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
udest->weight = nla_get_u32(nla_weight);
udest->u_threshold = nla_get_u32(nla_u_thresh);
udest->l_threshold = nla_get_u32(nla_l_thresh);
+
+ if (nla_tun_type)
+ udest->tun_type = nla_get_u8(nla_tun_type);
+
+ if (nla_tun_port)
+ udest->tun_port = nla_get_be16(nla_tun_port);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 175349fcf91f..8d6f94b67772 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
+#include <net/gue.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
@@ -382,6 +383,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
} else {
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (!dest)
+ goto err_put;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
if (mtu < 68) {
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto err_put;
@@ -533,6 +538,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
else {
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (!dest)
+ goto err_put;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
if (mtu < IPV6_MIN_MTU) {
IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
IPV6_MIN_MTU);
@@ -989,6 +998,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
}
}
+static int
+ipvs_gue_encap(struct net *net, struct sk_buff *skb,
+ struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+ __be16 dport;
+ __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
+ struct udphdr *udph; /* Our new UDP header */
+ struct guehdr *gueh; /* Our new GUE header */
+
+ skb_push(skb, sizeof(struct guehdr));
+
+ gueh = (struct guehdr *)skb->data;
+
+ gueh->control = 0;
+ gueh->version = 0;
+ gueh->hlen = 0;
+ gueh->flags = 0;
+ gueh->proto_ctype = *next_protocol;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ udph = udp_hdr(skb);
+
+ dport = cp->dest->tun_port;
+ udph->dest = dport;
+ udph->source = sport;
+ udph->len = htons(skb->len);
+ udph->check = 0;
+
+ *next_protocol = IPPROTO_UDP;
+
+ return 0;
+}
+
/*
* IP Tunneling transmitter
*
@@ -1025,6 +1069,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
+ int tun_type, gso_type;
EnterFunction(10);
@@ -1046,6 +1091,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+ tun_type = cp->dest->tun_type;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
@@ -1054,11 +1104,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
+ gso_type = __tun_gso_type_mask(AF_INET, cp->af);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
skb->transport_header = skb->network_header;
+ skb_set_inner_ipproto(skb, next_protocol);
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ ipvs_gue_encap(net, skb, cp, &next_protocol);
+
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1102,6 +1161,8 @@ int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct netns_ipvs *ipvs = cp->ipvs;
+ struct net *net = ipvs->net;
struct rt6_info *rt; /* Route to the other host */
struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
@@ -1112,10 +1173,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
+ int tun_type, gso_type;
EnterFunction(10);
- local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
&saddr, ipvsh, 1,
IP_VS_RT_MODE_LOCAL |
@@ -1134,17 +1196,31 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+ tun_type = cp->dest->tun_type;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, &payload_len,
&dsfield, &ttl, NULL);
if (IS_ERR(skb))
goto tx_error;
- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
+ gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
skb->transport_header = skb->network_header;
+ skb_set_inner_ipproto(skb, next_protocol);
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ ipvs_gue_encap(net, skb, cp, &next_protocol);
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1167,7 +1243,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip6_local_out(cp->ipvs->net, skb->sk, skb);
+ ip6_local_out(net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 82bfbeef46af..2a714527cde1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
@@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
+/* Generate a almost-unique pseudo-id for a given conntrack.
+ *
+ * intentionally doesn't re-use any of the seeds used for hash
+ * table location, we assume id gets exposed to userspace.
+ *
+ * Following nf_conn items do not change throughout lifetime
+ * of the nf_conn after it has been committed to main hash table:
+ *
+ * 1. nf_conn address
+ * 2. nf_conn->ext address
+ * 3. nf_conn->master address (normally NULL)
+ * 4. tuple
+ * 5. the associated net namespace
+ */
+u32 nf_ct_get_id(const struct nf_conn *ct)
+{
+ static __read_mostly siphash_key_t ct_id_seed;
+ unsigned long a, b, c, d;
+
+ net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
+
+ a = (unsigned long)ct;
+ b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct));
+ c = (unsigned long)ct->ext;
+ d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash),
+ &ct_id_seed);
+#ifdef CONFIG_64BIT
+ return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
+#else
+ return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_id);
+
static void
clean_from_lists(struct nf_conn *ct)
{
@@ -982,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
- if (tstamp) {
- if (skb->tstamp == 0)
- __net_timestamp(skb);
+ if (tstamp)
+ tstamp->start = ktime_get_real_ns();
- tstamp->start = ktime_to_ns(skb->tstamp);
- }
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
@@ -1350,6 +1382,7 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
+ ct->timeout = 0;
write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 334d6e5b7762..59c18804a10a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -336,7 +336,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
exp->tuple.dst.u.all = *dst;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
#endif
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 66c596d287a5..d547a777192f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -29,6 +29,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/siphash.h>
#include <linux/netfilter.h>
#include <net/netlink.h>
@@ -45,7 +46,7 @@
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#endif
@@ -485,7 +486,9 @@ nla_put_failure:
static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
+ __be32 id = (__force __be32)nf_ct_get_id(ct);
+
+ if (nla_put_be32(skb, CTA_ID, id))
goto nla_put_failure;
return 0;
@@ -655,7 +658,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
@@ -1286,8 +1289,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
}
if (cda[CTA_ID]) {
- u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
- if (id != (u32)(unsigned long)ct) {
+ __be32 id = nla_get_be32(cda[CTA_ID]);
+
+ if (id != (__force __be32)nf_ct_get_id(ct)) {
nf_ct_put(ct);
return -ENOENT;
}
@@ -1494,7 +1498,7 @@ static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
return -EOPNOTSUPP;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
@@ -1586,7 +1590,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
static int
ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
int ret;
if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
@@ -2369,7 +2373,7 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
@@ -2692,6 +2696,25 @@ nla_put_failure:
static const union nf_inet_addr any_addr;
+static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
+{
+ static __read_mostly siphash_key_t exp_id_seed;
+ unsigned long a, b, c, d;
+
+ net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
+
+ a = (unsigned long)exp;
+ b = (unsigned long)exp->helper;
+ c = (unsigned long)exp->master;
+ d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed);
+
+#ifdef CONFIG_64BIT
+ return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed);
+#else
+ return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed);
+#endif
+}
+
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
@@ -2699,7 +2722,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
struct nf_conn *master = exp->master;
long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
struct nf_conn_help *help;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nlattr *nest_parms;
struct nf_conntrack_tuple nat_tuple = {};
#endif
@@ -2717,7 +2740,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
CTA_EXPECT_MASTER) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
exp->saved_proto.all) {
nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
@@ -2739,7 +2762,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
}
#endif
if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
- nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
goto nla_put_failure;
@@ -3044,7 +3067,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
- if (ntohl(id) != (u32)(unsigned long)exp) {
+
+ if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp);
return -ENOENT;
}
@@ -3180,7 +3204,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
struct nf_conntrack_expect *exp,
u_int8_t u3)
{
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
struct nf_conntrack_tuple nat_tuple = {};
int err;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index b9403a266a2e..37bb530d848f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb,
struct va_format vaf;
va_list args;
- if (net->ct.sysctl_log_invalid != protonum ||
+ if (net->ct.sysctl_log_invalid != protonum &&
net->ct.sysctl_log_invalid != IPPROTO_RAW)
return;
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 7df477996b16..9becac953587 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-static int
-icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
- const struct nf_hook_state *state)
+/* Check inner header is related to any of the existing connections */
+int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state,
+ u8 l4proto, union nf_inet_addr *outer_daddr)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+ union nf_inet_addr *ct_daddr;
+ enum ip_conntrack_dir dir;
+ struct nf_conn *ct;
WARN_ON(skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
- if (!nf_ct_get_tuplepr(skb,
- skb_network_offset(skb) + ip_hdrlen(skb)
- + sizeof(struct icmphdr),
- PF_INET, state->net, &origtuple)) {
- pr_debug("icmp_error_message: failed to get tuple\n");
+ if (!nf_ct_get_tuplepr(skb, dataoff,
+ state->pf, state->net, &origtuple))
return -NF_ACCEPT;
- }
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple)) {
- pr_debug("icmp_error_message: no match\n");
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple))
return -NF_ACCEPT;
- }
-
- ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(state->net, zone, &innertuple);
- if (!h) {
- pr_debug("icmp_error_message: no match\n");
+ if (!h)
+ return -NF_ACCEPT;
+
+ /* Consider: A -> T (=This machine) -> B
+ * Conntrack entry will look like this:
+ * Original: A->B
+ * Reply: B->T (SNAT case) OR A
+ *
+ * When this function runs, we got packet that looks like this:
+ * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..).
+ *
+ * Above nf_conntrack_find_get() makes lookup based on inner_hdr,
+ * so we should expect that destination of the found connection
+ * matches outer header destination address.
+ *
+ * In above example, we can consider these two cases:
+ * 1. Error coming in reply direction from B or M (middle box) to
+ * T (SNAT case) or A.
+ * Inner saddr will be B, dst will be T or A.
+ * The found conntrack will be reply tuple (B->T/A).
+ * 2. Error coming in original direction from A or M to B.
+ * Inner saddr will be A, inner daddr will be B.
+ * The found conntrack will be original tuple (A->B).
+ *
+ * In both cases, conntrack[dir].dst == inner.dst.
+ *
+ * A bogus packet could look like this:
+ * Inner: B->T
+ * Outer: B->X (other machine reachable by T).
+ *
+ * In this case, lookup yields connection A->B and will
+ * set packet from B->X as *RELATED*, even though no connection
+ * from X was ever seen.
+ */
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ dir = NF_CT_DIRECTION(h);
+ ct_daddr = &ct->tuplehash[dir].tuple.dst.u3;
+ if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) {
+ if (state->pf == AF_INET) {
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ l4proto,
+ "outer daddr %pI4 != inner %pI4",
+ &outer_daddr->ip, &ct_daddr->ip);
+ } else if (state->pf == AF_INET6) {
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ l4proto,
+ "outer daddr %pI6 != inner %pI6",
+ &outer_daddr->ip6, &ct_daddr->ip6);
+ }
+ nf_ct_put(ct);
return -NF_ACCEPT;
}
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ ctinfo = IP_CT_RELATED;
+ if (dir == IP_CT_DIR_REPLY)
ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
- nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
+ nf_ct_set(skb, ct, ctinfo);
return NF_ACCEPT;
}
@@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
const struct nf_hook_state *state)
{
+ union nf_inet_addr outer_daddr;
const struct icmphdr *icmph;
struct icmphdr _ih;
/* Not enough header? */
- icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
+ icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
if (icmph == NULL) {
icmp_error_log(skb, state, "short packet");
return -NF_ACCEPT;
@@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(tmpl, skb, state);
+ memset(&outer_daddr, 0, sizeof(outer_daddr));
+ outer_daddr.ip = ip_hdr(skb)->daddr;
+
+ dataoff += sizeof(*icmph);
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMP, &outer_daddr);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index bec4a3211658..c63ee3612855 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static int
-icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb,
- unsigned int icmp6off)
-{
- struct nf_conntrack_tuple intuple, origtuple;
- const struct nf_conntrack_tuple_hash *h;
- enum ip_conntrack_info ctinfo;
- struct nf_conntrack_zone tmp;
-
- WARN_ON(skb_nfct(skb));
-
- /* Are they talking about one of our connections? */
- if (!nf_ct_get_tuplepr(skb,
- skb_network_offset(skb)
- + sizeof(struct ipv6hdr)
- + sizeof(struct icmp6hdr),
- PF_INET6, net, &origtuple)) {
- pr_debug("icmpv6_error: Can't get tuple\n");
- return -NF_ACCEPT;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&intuple, &origtuple)) {
- pr_debug("icmpv6_error: Can't invert tuple\n");
- return -NF_ACCEPT;
- }
-
- ctinfo = IP_CT_RELATED;
-
- h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
- &intuple);
- if (!h) {
- pr_debug("icmpv6_error: no match\n");
- return -NF_ACCEPT;
- } else {
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
- return NF_ACCEPT;
-}
static void icmpv6_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
@@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
unsigned int dataoff,
const struct nf_hook_state *state)
{
+ union nf_inet_addr outer_daddr;
const struct icmp6hdr *icmp6h;
struct icmp6hdr _ih;
int type;
@@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
- return icmpv6_error_message(state->net, tmpl, skb, dataoff);
+ memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+ sizeof(outer_daddr.ip6));
+ dataoff += sizeof(*icmp6h);
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMPV6, &outer_daddr);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 39fcc1ed18f3..d5454d1031a3 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -928,7 +928,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
exp->class != class)
break;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (!direct_rtp &&
(!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||
exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 1d291a51cd45..6452550d187f 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -235,13 +235,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (tuplehash == NULL)
return NF_ACCEPT;
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ outdev = rt->dst.dev;
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
(ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
@@ -452,13 +449,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (tuplehash == NULL)
return NF_ACCEPT;
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+ outdev = rt->dst.dev;
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index e15779fd58e3..d6c43902ebd7 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -7,9 +7,6 @@
#include <linux/netdevice.h>
/* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
- const struct nf_hook_entries *entries, unsigned int index,
- unsigned int verdict);
void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index af7dc6537758..715e3d4d761b 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
case IPPROTO_ICMPV6:
/* id is same for either direction... */
keyptr = &tuple->src.u.icmp.id;
- min = range->min_proto.icmp.id;
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ min = 0;
+ range_size = 65536;
+ } else {
+ min = ntohs(range->min_proto.icmp.id);
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+ }
goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
case IPPROTO_GRE:
@@ -1009,7 +1014,7 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
.expectfn = nf_nat_follow_master,
};
-int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
struct nat_net *nat_net = net_generic(net, nat_net_id);
@@ -1019,14 +1024,12 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
struct nf_hook_ops *nat_ops;
int i, ret;
- if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
+ if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
return -EINVAL;
- nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+ nat_proto_net = &nat_net->nat_proto_net[pf];
for (i = 0; i < ops_count; i++) {
- if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
- return -EINVAL;
if (orig_nat_ops[i].hooknum == hooknum) {
hooknum = i;
break;
@@ -1086,8 +1089,8 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
return ret;
}
-void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
- unsigned int ops_count)
+void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
+ unsigned int ops_count)
{
struct nat_net *nat_net = net_generic(net, nat_net_id);
struct nf_nat_hooks_net *nat_proto_net;
@@ -1096,10 +1099,10 @@ void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
int hooknum = ops->hooknum;
int i;
- if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
+ if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
return;
- nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+ nat_proto_net = &nat_net->nat_proto_net[pf];
mutex_lock(&nf_nat_proto_mutex);
if (WARN_ON(nat_proto_net->users == 0))
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index d85c4d902e7b..8e8a65d46345 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -7,12 +7,10 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+#include <net/netfilter/nf_nat_masquerade.h>
static DEFINE_MUTEX(masq_mutex);
-static unsigned int masq_refcnt4 __read_mostly;
-static unsigned int masq_refcnt6 __read_mostly;
+static unsigned int masq_refcnt __read_mostly;
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
@@ -137,56 +135,6 @@ static struct notifier_block masq_inet_notifier = {
.notifier_call = masq_inet_event,
};
-int nf_nat_masquerade_ipv4_register_notifier(void)
-{
- int ret = 0;
-
- mutex_lock(&masq_mutex);
- if (WARN_ON_ONCE(masq_refcnt4 == UINT_MAX)) {
- ret = -EOVERFLOW;
- goto out_unlock;
- }
-
- /* check if the notifier was already set */
- if (++masq_refcnt4 > 1)
- goto out_unlock;
-
- /* Register for device down reports */
- ret = register_netdevice_notifier(&masq_dev_notifier);
- if (ret)
- goto err_dec;
- /* Register IP address change reports */
- ret = register_inetaddr_notifier(&masq_inet_notifier);
- if (ret)
- goto err_unregister;
-
- mutex_unlock(&masq_mutex);
- return ret;
-
-err_unregister:
- unregister_netdevice_notifier(&masq_dev_notifier);
-err_dec:
- masq_refcnt4--;
-out_unlock:
- mutex_unlock(&masq_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
-
-void nf_nat_masquerade_ipv4_unregister_notifier(void)
-{
- mutex_lock(&masq_mutex);
- /* check if the notifier still has clients */
- if (--masq_refcnt4 > 0)
- goto out_unlock;
-
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
-out_unlock:
- mutex_unlock(&masq_mutex);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
-
#if IS_ENABLED(CONFIG_IPV6)
static atomic_t v6_worker_count __read_mostly;
@@ -322,44 +270,68 @@ static struct notifier_block masq_inet6_notifier = {
.notifier_call = masq_inet6_event,
};
-int nf_nat_masquerade_ipv6_register_notifier(void)
+static int nf_nat_masquerade_ipv6_register_notifier(void)
+{
+ return register_inet6addr_notifier(&masq_inet6_notifier);
+}
+#else
+static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; }
+#endif
+
+int nf_nat_masquerade_inet_register_notifiers(void)
{
int ret = 0;
mutex_lock(&masq_mutex);
- if (WARN_ON_ONCE(masq_refcnt6 == UINT_MAX)) {
+ if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) {
ret = -EOVERFLOW;
goto out_unlock;
}
- /* check if the notifier is already set */
- if (++masq_refcnt6 > 1)
+ /* check if the notifier was already set */
+ if (++masq_refcnt > 1)
goto out_unlock;
- ret = register_inet6addr_notifier(&masq_inet6_notifier);
+ /* Register for device down reports */
+ ret = register_netdevice_notifier(&masq_dev_notifier);
if (ret)
goto err_dec;
+ /* Register IP address change reports */
+ ret = register_inetaddr_notifier(&masq_inet_notifier);
+ if (ret)
+ goto err_unregister;
+
+ ret = nf_nat_masquerade_ipv6_register_notifier();
+ if (ret)
+ goto err_unreg_inet;
mutex_unlock(&masq_mutex);
return ret;
+err_unreg_inet:
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+err_unregister:
+ unregister_netdevice_notifier(&masq_dev_notifier);
err_dec:
- masq_refcnt6--;
+ masq_refcnt--;
out_unlock:
mutex_unlock(&masq_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier);
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers);
-void nf_nat_masquerade_ipv6_unregister_notifier(void)
+void nf_nat_masquerade_inet_unregister_notifiers(void)
{
mutex_lock(&masq_mutex);
- /* check if the notifier still has clients */
- if (--masq_refcnt6 > 0)
+ /* check if the notifiers still have clients */
+ if (--masq_refcnt > 0)
goto out_unlock;
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
unregister_inet6addr_notifier(&masq_inet6_notifier);
+#endif
out_unlock:
mutex_unlock(&masq_mutex);
}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
-#endif
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 62743da3004f..84f5c90a7f21 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -725,7 +725,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
return ret;
}
-static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_in,
@@ -758,13 +758,14 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
- return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops,
+ ARRAY_SIZE(nf_nat_ipv4_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn);
void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
- nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn);
@@ -925,20 +926,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
return ret;
}
-static int nat_route_me_harder(struct net *net, struct sk_buff *skb)
-{
-#ifdef CONFIG_IPV6_MODULE
- const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
-
- if (!v6_ops)
- return -EHOSTUNREACH;
-
- return v6_ops->route_me_harder(net, skb);
-#else
- return ip6_route_me_harder(net, skb);
-#endif
-}
-
static unsigned int
nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -958,7 +945,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
&ct->tuplehash[!dir].tuple.src.u3)) {
- err = nat_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -977,7 +964,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
return ret;
}
-static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+const struct nf_hook_ops nf_nat_ipv6_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv6_in,
@@ -1010,14 +997,44 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
- return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops,
+ return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops,
ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn);
void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
- nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+ nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn);
#endif /* CONFIG_IPV6 */
+
+#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT)
+int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(ops->pf != NFPROTO_INET))
+ return -EINVAL;
+
+ ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops,
+ ARRAY_SIZE(nf_nat_ipv6_ops));
+ if (ret)
+ return ret;
+
+ ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,
+ ARRAY_SIZE(nf_nat_ipv4_ops));
+ if (ret)
+ nf_nat_ipv6_unregister_fn(net, ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn);
+
+void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn);
+#endif /* NFT INET NAT */
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index a36a77bae1d6..9dc1d6e04946 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -240,6 +240,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
return 0;
}
+EXPORT_SYMBOL_GPL(nf_queue);
static unsigned int nf_iterate(struct sk_buff *skb,
struct nf_hook_state *state,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 90e6b09ef2af..9d888dc6be38 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1544,7 +1544,7 @@ static int nft_chain_parse_hook(struct net *net,
if (IS_ERR(type))
return PTR_ERR(type);
}
- if (!(type->hook_mask & (1 << hook->num)))
+ if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
return -EOPNOTSUPP;
if (type->type == NFT_CHAIN_T_NAT &&
@@ -3193,9 +3193,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
static __be64 nf_jiffies64_to_msecs(u64 input)
{
- u64 ms = jiffies64_to_nsecs(input);
-
- return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+ return cpu_to_be64(jiffies64_to_msecs(input));
}
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
@@ -3438,8 +3436,7 @@ err:
return err;
}
-static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
- struct nft_set_desc *desc,
+static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
const struct nlattr *nla)
{
struct nlattr *da[NFTA_SET_DESC_MAX + 1];
@@ -3565,7 +3562,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
if (nla[NFTA_SET_DESC] != NULL) {
- err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+ err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
}
@@ -3785,8 +3782,8 @@ bind:
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding, bool event)
+static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding, bool event)
{
list_del_rcu(&binding->list);
@@ -3797,7 +3794,6 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
GFP_KERNEL);
}
}
-EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding,
@@ -7533,6 +7529,7 @@ static int __init nf_tables_module_init(void)
if (err < 0)
goto err5;
+ nft_chain_route_init();
return err;
err5:
rhltable_destroy(&nft_objname_ht);
@@ -7552,6 +7549,7 @@ static void __exit nf_tables_module_exit(void)
nfnetlink_subsys_unregister(&nf_tables_subsys);
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
+ nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
cancel_work_sync(&trans_destroy_work);
rcu_barrier();
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b1f9c5303f02..0b3347570265 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (skb->tstamp) {
+ if (hooknum <= NF_INET_FORWARD && skb->tstamp) {
struct nfulnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 1f1d90c1716b..7b827bcb412c 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -255,9 +255,9 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
}
EXPORT_SYMBOL_GPL(nf_osf_match);
-const char *nf_osf_find(const struct sk_buff *skb,
- const struct list_head *nf_osf_fingers,
- const int ttl_check)
+bool nf_osf_find(const struct sk_buff *skb,
+ const struct list_head *nf_osf_fingers,
+ const int ttl_check, struct nf_osf_data *data)
{
const struct iphdr *ip = ip_hdr(skb);
const struct nf_osf_user_finger *f;
@@ -265,24 +265,24 @@ const char *nf_osf_find(const struct sk_buff *skb,
const struct nf_osf_finger *kf;
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
- const char *genre = NULL;
memset(&ctx, 0, sizeof(ctx));
tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
if (!tcp)
- return NULL;
+ return false;
list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
f = &kf->finger;
if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
continue;
- genre = f->genre;
+ data->genre = f->genre;
+ data->version = f->version;
break;
}
- return genre;
+ return true;
}
EXPORT_SYMBOL_GPL(nf_osf_find);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 0dcc3592d053..e057b2961d31 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entskb->tstamp) {
+ if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) {
struct nfqnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
index ee4852088d50..2f89bde3c61c 100644
--- a/net/netfilter/nft_chain_nat.c
+++ b/net/netfilter/nft_chain_nat.c
@@ -74,6 +74,36 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
};
#endif
+#ifdef CONFIG_NF_TABLES_INET
+static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_inet_register_fn(net, ops);
+}
+
+static void nft_nat_inet_unreg(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_inet_unregister_fn(net, ops);
+}
+
+static const struct nft_chain_type nft_chain_nat_inet = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_INET,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ },
+ .ops_register = nft_nat_inet_reg,
+ .ops_unregister = nft_nat_inet_unreg,
+};
+#endif
+
static int __init nft_chain_nat_init(void)
{
#ifdef CONFIG_NF_TABLES_IPV6
@@ -82,6 +112,9 @@ static int __init nft_chain_nat_init(void)
#ifdef CONFIG_NF_TABLES_IPV4
nft_register_chain_type(&nft_chain_nat_ipv4);
#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_register_chain_type(&nft_chain_nat_inet);
+#endif
return 0;
}
@@ -94,6 +127,9 @@ static void __exit nft_chain_nat_exit(void)
#ifdef CONFIG_NF_TABLES_IPV6
nft_unregister_chain_type(&nft_chain_nat_ipv6);
#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_chain_type(&nft_chain_nat_inet);
+#endif
}
module_init(nft_chain_nat_init);
diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
new file mode 100644
index 000000000000..8826bbe71136
--- /dev/null
+++ b/net/netfilter/nft_chain_route.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#ifdef CONFIG_NF_TABLES_IPV4
+static unsigned int nf_route_table_hook4(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct iphdr *iph;
+ struct nft_pktinfo pkt;
+ __be32 saddr, daddr;
+ unsigned int ret;
+ u32 mark;
+ int err;
+ u8 tos;
+
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb);
+
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = nft_do_chain(&pkt, priv);
+ if (ret == NF_ACCEPT) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+ return ret;
+}
+
+static const struct nft_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook4,
+ },
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static unsigned int nf_route_table_hook6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr, daddr;
+ struct nft_pktinfo pkt;
+ u32 mark, flowlabel;
+ unsigned int ret;
+ u8 hop_limit;
+ int err;
+
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv6(&pkt, skb);
+
+ /* save source/dest address, mark, hoplimit, flowlabel, priority */
+ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+ memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+ mark = skb->mark;
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+
+ /* flowlabel and prio (includes version, which shouldn't change either)*/
+ flowlabel = *((u32 *)ipv6_hdr(skb));
+
+ ret = nft_do_chain(&pkt, priv);
+ if (ret == NF_ACCEPT &&
+ (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+ memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+ skb->mark != mark ||
+ ipv6_hdr(skb)->hop_limit != hop_limit ||
+ flowlabel != *((u32 *)ipv6_hdr(skb)))) {
+ err = nf_ip6_route_me_harder(state->net, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+
+ return ret;
+}
+
+static const struct nft_chain_type nft_chain_route_ipv6 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV6,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook6,
+ },
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_INET
+static unsigned int nf_route_table_inet(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ switch (state->pf) {
+ case NFPROTO_IPV4:
+ return nf_route_table_hook4(priv, skb, state);
+ case NFPROTO_IPV6:
+ return nf_route_table_hook6(priv, skb, state);
+ default:
+ nft_set_pktinfo(&pkt, skb, state);
+ break;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_route_inet = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_INET,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_inet,
+ },
+};
+#endif
+
+void __init nft_chain_route_init(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_register_chain_type(&nft_chain_route_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_register_chain_type(&nft_chain_route_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_register_chain_type(&nft_chain_route_inet);
+#endif
+}
+
+void __exit nft_chain_route_fini(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_chain_type(&nft_chain_route_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_chain_type(&nft_chain_route_inet);
+#endif
+}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index bee156eaa400..86fd90085eaf 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -14,8 +14,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+#include <net/netfilter/nf_nat_masquerade.h>
struct nft_masq {
u32 flags;
@@ -196,28 +195,73 @@ static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
static int __init nft_masq_module_init_ipv6(void)
{
- int ret = nft_register_expr(&nft_masq_ipv6_type);
-
- if (ret)
- return ret;
-
- ret = nf_nat_masquerade_ipv6_register_notifier();
- if (ret < 0)
- nft_unregister_expr(&nft_masq_ipv6_type);
-
- return ret;
+ return nft_register_expr(&nft_masq_ipv6_type);
}
static void nft_masq_module_exit_ipv6(void)
{
nft_unregister_expr(&nft_masq_ipv6_type);
- nf_nat_masquerade_ipv6_unregister_notifier();
}
#else
static inline int nft_masq_module_init_ipv6(void) { return 0; }
static inline void nft_masq_module_exit_ipv6(void) {}
#endif
+#ifdef CONFIG_NF_TABLES_INET
+static void nft_masq_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ return nft_masq_ipv4_eval(expr, regs, pkt);
+ case NFPROTO_IPV6:
+ return nft_masq_ipv6_eval(expr, regs, pkt);
+ }
+
+ WARN_ON_ONCE(1);
+}
+
+static void
+nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_INET);
+}
+
+static struct nft_expr_type nft_masq_inet_type;
+static const struct nft_expr_ops nft_masq_inet_ops = {
+ .type = &nft_masq_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_inet_eval,
+ .init = nft_masq_init,
+ .destroy = nft_masq_inet_destroy,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+};
+
+static struct nft_expr_type nft_masq_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "masq",
+ .ops = &nft_masq_inet_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_module_init_inet(void)
+{
+ return nft_register_expr(&nft_masq_inet_type);
+}
+
+static void nft_masq_module_exit_inet(void)
+{
+ nft_unregister_expr(&nft_masq_inet_type);
+}
+#else
+static inline int nft_masq_module_init_inet(void) { return 0; }
+static inline void nft_masq_module_exit_inet(void) {}
+#endif
+
static int __init nft_masq_module_init(void)
{
int ret;
@@ -226,15 +270,23 @@ static int __init nft_masq_module_init(void)
if (ret < 0)
return ret;
+ ret = nft_masq_module_init_inet();
+ if (ret < 0) {
+ nft_masq_module_exit_ipv6();
+ return ret;
+ }
+
ret = nft_register_expr(&nft_masq_ipv4_type);
if (ret < 0) {
+ nft_masq_module_exit_inet();
nft_masq_module_exit_ipv6();
return ret;
}
- ret = nf_nat_masquerade_ipv4_register_notifier();
+ ret = nf_nat_masquerade_inet_register_notifiers();
if (ret < 0) {
nft_masq_module_exit_ipv6();
+ nft_masq_module_exit_inet();
nft_unregister_expr(&nft_masq_ipv4_type);
return ret;
}
@@ -245,8 +297,9 @@ static int __init nft_masq_module_init(void)
static void __exit nft_masq_module_exit(void)
{
nft_masq_module_exit_ipv6();
+ nft_masq_module_exit_inet();
nft_unregister_expr(&nft_masq_ipv4_type);
- nf_nat_masquerade_ipv4_unregister_notifier();
+ nf_nat_masquerade_inet_unregister_notifiers();
}
module_init(nft_masq_module_init);
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index e93aed9bda88..d90d421826aa 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -140,7 +140,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
- if (family != ctx->family)
+ if (ctx->family != NFPROTO_INET && ctx->family != family)
return -EOPNOTSUPP;
switch (family) {
@@ -278,13 +278,67 @@ static struct nft_expr_type nft_nat_type __read_mostly = {
.owner = THIS_MODULE,
};
+#ifdef CONFIG_NF_TABLES_INET
+static void nft_nat_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ if (priv->family == nft_pf(pkt))
+ nft_nat_eval(expr, regs, pkt);
+}
+
+static const struct nft_expr_ops nft_nat_inet_ops = {
+ .type = &nft_nat_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+ .eval = nft_nat_inet_eval,
+ .init = nft_nat_init,
+ .destroy = nft_nat_destroy,
+ .dump = nft_nat_dump,
+ .validate = nft_nat_validate,
+};
+
+static struct nft_expr_type nft_inet_nat_type __read_mostly = {
+ .name = "nat",
+ .family = NFPROTO_INET,
+ .ops = &nft_nat_inet_ops,
+ .policy = nft_nat_policy,
+ .maxattr = NFTA_NAT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int nft_nat_inet_module_init(void)
+{
+ return nft_register_expr(&nft_inet_nat_type);
+}
+
+static void nft_nat_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_inet_nat_type);
+}
+#else
+static int nft_nat_inet_module_init(void) { return 0; }
+static void nft_nat_inet_module_exit(void) { }
+#endif
+
static int __init nft_nat_module_init(void)
{
- return nft_register_expr(&nft_nat_type);
+ int ret = nft_nat_inet_module_init();
+
+ if (ret)
+ return ret;
+
+ ret = nft_register_expr(&nft_nat_type);
+ if (ret)
+ nft_nat_inet_module_exit();
+
+ return ret;
}
static void __exit nft_nat_module_exit(void)
{
+ nft_nat_inet_module_exit();
nft_unregister_expr(&nft_nat_type);
}
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index b13618c764ec..87b60d6617ef 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -7,11 +7,13 @@
struct nft_osf {
enum nft_registers dreg:8;
u8 ttl;
+ u32 flags;
};
static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
[NFTA_OSF_DREG] = { .type = NLA_U32 },
[NFTA_OSF_TTL] = { .type = NLA_U8 },
+ [NFTA_OSF_FLAGS] = { .type = NLA_U32 },
};
static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
@@ -20,9 +22,10 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nft_osf *priv = nft_expr_priv(expr);
u32 *dest = &regs->data[priv->dreg];
struct sk_buff *skb = pkt->skb;
+ char os_match[NFT_OSF_MAXGENRELEN + 1];
const struct tcphdr *tcp;
+ struct nf_osf_data data;
struct tcphdr _tcph;
- const char *os_name;
tcp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(struct tcphdr), &_tcph);
@@ -35,11 +38,17 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl);
- if (!os_name)
+ if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) {
strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
- else
- strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN);
+ } else {
+ if (priv->flags & NFT_OSF_F_VERSION)
+ snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
+ data.genre, data.version);
+ else
+ strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
+
+ strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
+ }
}
static int nft_osf_init(const struct nft_ctx *ctx,
@@ -47,6 +56,7 @@ static int nft_osf_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_osf *priv = nft_expr_priv(expr);
+ u32 flags;
int err;
u8 ttl;
@@ -57,6 +67,13 @@ static int nft_osf_init(const struct nft_ctx *ctx,
priv->ttl = ttl;
}
+ if (tb[NFTA_OSF_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS]));
+ if (flags != NFT_OSF_F_VERSION)
+ return -EINVAL;
+ priv->flags = flags;
+ }
+
priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
err = nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
@@ -73,6 +90,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags)))
+ goto nla_put_failure;
+
if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
goto nla_put_failure;
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index a340cd8a751b..da74fdc4a684 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -82,7 +82,7 @@ static int nft_redir_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_redir *priv = nft_expr_priv(expr);
@@ -202,6 +202,55 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
};
#endif
+#ifdef CONFIG_NF_TABLES_INET
+static void nft_redir_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ return nft_redir_ipv4_eval(expr, regs, pkt);
+ case NFPROTO_IPV6:
+ return nft_redir_ipv6_eval(expr, regs, pkt);
+ }
+
+ WARN_ON_ONCE(1);
+}
+
+static void
+nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_INET);
+}
+
+static struct nft_expr_type nft_redir_inet_type;
+static const struct nft_expr_ops nft_redir_inet_ops = {
+ .type = &nft_redir_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_inet_eval,
+ .init = nft_redir_init,
+ .destroy = nft_redir_inet_destroy,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+};
+
+static struct nft_expr_type nft_redir_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "redir",
+ .ops = &nft_redir_inet_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_module_init_inet(void)
+{
+ return nft_register_expr(&nft_redir_inet_type);
+}
+#else
+static inline int nft_redir_module_init_inet(void) { return 0; }
+#endif
+
static int __init nft_redir_module_init(void)
{
int ret = nft_register_expr(&nft_redir_ipv4_type);
@@ -217,6 +266,15 @@ static int __init nft_redir_module_init(void)
}
#endif
+ ret = nft_redir_module_init_inet();
+ if (ret < 0) {
+ nft_unregister_expr(&nft_redir_ipv4_type);
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_expr(&nft_redir_ipv6_type);
+#endif
+ return ret;
+ }
+
return ret;
}
@@ -226,6 +284,9 @@ static void __exit nft_redir_module_exit(void)
#ifdef CONFIG_NF_TABLES_IPV6
nft_unregister_expr(&nft_redir_ipv6_type);
#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_expr(&nft_redir_inet_type);
+#endif
}
module_init(nft_redir_module_init);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e5e5c64df8d1..0a6656ed1534 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -227,7 +227,7 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)
EXPORT_SYMBOL_GPL(xt_request_find_match);
/* Find target, grabs ref. Returns ERR_PTR() on error. */
-struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
+static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
{
struct xt_target *t;
int err = -ENOENT;
@@ -255,7 +255,6 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
return ERR_PTR(err);
}
-EXPORT_SYMBOL(xt_find_target);
struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
{
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c
index fd3f9e8a74da..ece20d832adc 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/netfilter/xt_MASQUERADE.c
@@ -9,20 +9,10 @@
* published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+#include <net/netfilter/nf_nat_masquerade.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -64,38 +54,78 @@ static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_target masquerade_tg_reg __read_mostly = {
- .name = "MASQUERADE",
- .family = NFPROTO_IPV4,
- .target = masquerade_tg,
- .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
- .table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
- .checkentry = masquerade_tg_check,
- .destroy = masquerade_tg_destroy,
- .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_IPV6)
+static unsigned int
+masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
+}
+
+static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+
+ return nf_ct_netns_get(par->net, par->family);
+}
+#endif
+
+static struct xt_target masquerade_tg_reg[] __read_mostly = {
+ {
+#if IS_ENABLED(CONFIG_IPV6)
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV6,
+ .target = masquerade_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg6_checkentry,
+ .destroy = masquerade_tg_destroy,
+ .me = THIS_MODULE,
+ }, {
+#endif
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV4,
+ .target = masquerade_tg,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
+ .me = THIS_MODULE,
+ }
};
static int __init masquerade_tg_init(void)
{
int ret;
- ret = xt_register_target(&masquerade_tg_reg);
+ ret = xt_register_targets(masquerade_tg_reg,
+ ARRAY_SIZE(masquerade_tg_reg));
if (ret)
return ret;
- ret = nf_nat_masquerade_ipv4_register_notifier();
- if (ret)
- xt_unregister_target(&masquerade_tg_reg);
+ ret = nf_nat_masquerade_inet_register_notifiers();
+ if (ret) {
+ xt_unregister_targets(masquerade_tg_reg,
+ ARRAY_SIZE(masquerade_tg_reg));
+ return ret;
+ }
return ret;
}
static void __exit masquerade_tg_exit(void)
{
- xt_unregister_target(&masquerade_tg_reg);
- nf_nat_masquerade_ipv4_unregister_notifier();
+ xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg));
+ nf_nat_masquerade_inet_unregister_notifiers();
}
module_init(masquerade_tg_init);
module_exit(masquerade_tg_exit);
+#if IS_ENABLED(CONFIG_IPV6)
+MODULE_ALIAS("ip6t_MASQUERADE");
+#endif
+MODULE_ALIAS("ipt_MASQUERADE");
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index c13bcd0ab491..8dbb4d48f2ed 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
s64 stamp;
/*
- * We cannot use get_seconds() instead of __net_timestamp() here.
+ * We need real time here, but we can neither use skb->tstamp
+ * nor __net_timestamp().
+ *
+ * skb->tstamp and skb->skb_mstamp_ns overlap, however, they
+ * use different clock types (real vs monotonic).
+ *
* Suppose you have two rules:
- * 1. match before 13:00
- * 2. match after 13:00
+ * 1. match before 13:00
+ * 2. match after 13:00
+ *
* If you match against processing time (get_seconds) it
* may happen that the same packet matches both rules if
- * it arrived at the right moment before 13:00.
+ * it arrived at the right moment before 13:00, so it would be
+ * better to check skb->tstamp and set it via __net_timestamp()
+ * if needed. This however breaks outgoing packets tx timestamp,
+ * and causes them to get delayed forever by fq packet scheduler.
*/
- if (skb->tstamp == 0)
- __net_timestamp((struct sk_buff *)skb);
-
- stamp = ktime_to_ns(skb->tstamp);
- stamp = div_s64(stamp, NSEC_PER_SEC);
+ stamp = get_seconds();
if (info->flags & XT_TIME_LOCAL_TZ)
/* Adjust for local timezone */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f28e937320a3..216ab915dd54 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -988,7 +988,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
int err = 0;
- unsigned long groups = nladdr->nl_groups;
+ unsigned long groups;
bool bound;
if (addr_len < sizeof(struct sockaddr_nl))
@@ -996,6 +996,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
if (nladdr->nl_family != AF_NETLINK)
return -EINVAL;
+ groups = nladdr->nl_groups;
/* Only superuser is allowed to listen multicasts */
if (groups) {
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 1d3144d19903..167c09e1ea90 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1199,7 +1199,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
void __user *argp = (void __user *)arg;
- int ret;
switch (cmd) {
case TIOCOUTQ: {
@@ -1225,18 +1224,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return put_user(amount, (int __user *)argp);
}
- case SIOCGSTAMP:
- lock_sock(sk);
- ret = sock_get_timestamp(sk, argp);
- release_sock(sk);
- return ret;
-
- case SIOCGSTAMPNS:
- lock_sock(sk);
- ret = sock_get_timestampns(sk, argp);
- release_sock(sk);
- return ret;
-
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFDSTADDR:
@@ -1362,6 +1349,7 @@ static const struct proto_ops nr_proto_ops = {
.getname = nr_getname,
.poll = datagram_poll,
.ioctl = nr_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = nr_listen,
.shutdown = sock_no_shutdown,
.setsockopt = nr_setsockopt,
@@ -1392,18 +1380,22 @@ static int __init nr_proto_init(void)
int i;
int rc = proto_register(&nr_proto, 0);
- if (rc != 0)
- goto out;
+ if (rc)
+ return rc;
if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) {
- printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n");
- return -1;
+ pr_err("NET/ROM: %s - nr_ndevs parameter too large\n",
+ __func__);
+ rc = -EINVAL;
+ goto unregister_proto;
}
dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL);
- if (dev_nr == NULL) {
- printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n");
- return -1;
+ if (!dev_nr) {
+ pr_err("NET/ROM: %s - unable to allocate device array\n",
+ __func__);
+ rc = -ENOMEM;
+ goto unregister_proto;
}
for (i = 0; i < nr_ndevs; i++) {
@@ -1413,13 +1405,13 @@ static int __init nr_proto_init(void)
sprintf(name, "nr%d", i);
dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup);
if (!dev) {
- printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n");
+ rc = -ENOMEM;
goto fail;
}
dev->base_addr = i;
- if (register_netdev(dev)) {
- printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n");
+ rc = register_netdev(dev);
+ if (rc) {
free_netdev(dev);
goto fail;
}
@@ -1427,36 +1419,64 @@ static int __init nr_proto_init(void)
dev_nr[i] = dev;
}
- if (sock_register(&nr_family_ops)) {
- printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n");
+ rc = sock_register(&nr_family_ops);
+ if (rc)
goto fail;
- }
- register_netdevice_notifier(&nr_dev_notifier);
+ rc = register_netdevice_notifier(&nr_dev_notifier);
+ if (rc)
+ goto out_sock;
ax25_register_pid(&nr_pid);
ax25_linkfail_register(&nr_linkfail_notifier);
#ifdef CONFIG_SYSCTL
- nr_register_sysctl();
+ rc = nr_register_sysctl();
+ if (rc)
+ goto out_sysctl;
#endif
nr_loopback_init();
- proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops);
- proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops);
- proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops);
-out:
- return rc;
+ rc = -ENOMEM;
+ if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops))
+ goto proc_remove1;
+ if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net,
+ &nr_neigh_seqops))
+ goto proc_remove2;
+ if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net,
+ &nr_node_seqops))
+ goto proc_remove3;
+
+ return 0;
+
+proc_remove3:
+ remove_proc_entry("nr_neigh", init_net.proc_net);
+proc_remove2:
+ remove_proc_entry("nr", init_net.proc_net);
+proc_remove1:
+
+ nr_loopback_clear();
+ nr_rt_free();
+
+#ifdef CONFIG_SYSCTL
+ nr_unregister_sysctl();
+out_sysctl:
+#endif
+ ax25_linkfail_release(&nr_linkfail_notifier);
+ ax25_protocol_release(AX25_P_NETROM);
+ unregister_netdevice_notifier(&nr_dev_notifier);
+out_sock:
+ sock_unregister(PF_NETROM);
fail:
while (--i >= 0) {
unregister_netdev(dev_nr[i]);
free_netdev(dev_nr[i]);
}
kfree(dev_nr);
+unregister_proto:
proto_unregister(&nr_proto);
- rc = -1;
- goto out;
+ return rc;
}
module_init(nr_proto_init);
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
index 215ad22a9647..93d13f019981 100644
--- a/net/netrom/nr_loopback.c
+++ b/net/netrom/nr_loopback.c
@@ -70,7 +70,7 @@ static void nr_loopback_timer(struct timer_list *unused)
}
}
-void __exit nr_loopback_clear(void)
+void nr_loopback_clear(void)
{
del_timer_sync(&loopback_timer);
skb_queue_purge(&loopback_queue);
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index 6485f593e2f0..b76aa668a94b 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -953,7 +953,7 @@ const struct seq_operations nr_neigh_seqops = {
/*
* Free all memory associated with the nodes and routes lists.
*/
-void __exit nr_rt_free(void)
+void nr_rt_free(void)
{
struct nr_neigh *s = NULL;
struct nr_node *t = NULL;
diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c
index ba1c368b3f18..771011b84270 100644
--- a/net/netrom/sysctl_net_netrom.c
+++ b/net/netrom/sysctl_net_netrom.c
@@ -146,9 +146,12 @@ static struct ctl_table nr_table[] = {
{ }
};
-void __init nr_register_sysctl(void)
+int __init nr_register_sysctl(void)
{
nr_table_header = register_net_sysctl(&init_net, "net/netrom", nr_table);
+ if (!nr_table_header)
+ return -ENOMEM;
+ return 0;
}
void nr_unregister_sysctl(void)
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index ddfc52ac1f9b..c0d323b58e73 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -312,6 +312,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe,
create_info = (struct nci_hci_create_pipe_resp *)skb->data;
dest_gate = create_info->dest_gate;
new_pipe = create_info->pipe;
+ if (new_pipe >= NCI_HCI_MAX_PIPES) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
/* Save the new created pipe and bind with local gate,
* the description for skb->data[3] is destination gate id
@@ -336,6 +340,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe,
goto exit;
}
delete_info = (struct nci_hci_delete_pipe_noti *)skb->data;
+ if (delete_info->pipe >= NCI_HCI_MAX_PIPES) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
ndev->hci_dev->pipes[delete_info->pipe].gate =
NCI_HCI_INVALID_GATE;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 0be3ab5bde26..626629944450 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -29,7 +29,7 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/ipv6_frag.h>
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
#include <net/netfilter/nf_nat.h>
#endif
@@ -75,7 +75,7 @@ struct ovs_conntrack_info {
struct md_mark mark;
struct md_labels labels;
char timeout[CTNL_TIMEOUT_NAME_MAX];
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */
#endif
};
@@ -721,7 +721,7 @@ static bool skb_nfct_cached(struct net *net,
return ct_executed;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
/* Modelled after nf_nat_ipv[46]_fn().
* range is only used for new, uninitialized NAT state.
* Returns either NF_ACCEPT or NF_DROP.
@@ -903,7 +903,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
return err;
}
-#else /* !CONFIG_NF_NAT_NEEDED */
+#else /* !CONFIG_NF_NAT */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb, struct nf_conn *ct,
@@ -1330,7 +1330,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
return 0;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static int parse_nat(const struct nlattr *attr,
struct ovs_conntrack_info *info, bool log)
{
@@ -1467,7 +1467,7 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
.maxlen = sizeof(struct md_labels) },
[OVS_CT_ATTR_HELPER] = { .minlen = 1,
.maxlen = NF_CT_HELPER_NAME_LEN },
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
/* NAT length is checked when parsing the nested attributes. */
[OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
#endif
@@ -1547,7 +1547,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
return -EINVAL;
}
break;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
case OVS_CT_ATTR_NAT: {
int err = parse_nat(a, info, log);
@@ -1677,7 +1677,7 @@ err_free_ct:
return err;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
@@ -1783,7 +1783,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
return -EMSGSIZE;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
return -EMSGSIZE;
#endif
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 08fe8b79c0bf..5c4a118d6f96 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4075,11 +4075,6 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
-
#ifdef CONFIG_INET
case SIOCADDRT:
case SIOCDELRT:
@@ -4455,6 +4450,7 @@ static const struct proto_ops packet_ops_spkt = {
.getname = packet_getname_spkt,
.poll = datagram_poll,
.ioctl = packet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
@@ -4476,6 +4472,7 @@ static const struct proto_ops packet_ops = {
.getname = packet_getname,
.poll = packet_poll,
.ioctl = packet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index b37e6e0a1026..7c5e8292cc0a 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -968,9 +968,6 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
break;
- case SIOCGSTAMP:
- rc = sock_get_timestamp(sk, argp);
- break;
case SIOCADDRT:
case SIOCDELRT:
case SIOCSIFADDR:
@@ -1033,6 +1030,7 @@ static const struct proto_ops qrtr_proto_ops = {
.recvmsg = qrtr_recvmsg,
.getname = qrtr_getname,
.ioctl = qrtr_ioctl,
+ .gettstamp = sock_gettstamp,
.poll = datagram_poll,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index d6cc97fbbbb0..2b969f99ef13 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -543,6 +543,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
struct rds_sock *rs = rds_sk_to_rs(sk);
int ret = 0;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
+
lock_sock(sk);
switch (uaddr->sa_family) {
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 17c9d9f0c848..0f4398e7f2a7 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -173,6 +173,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* We allow an RDS socket to be bound to either IPv4 or IPv6
* address.
*/
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_INET) {
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
index 31cf37da4510..93c0437e6a5f 100644
--- a/net/rds/ib_fmr.c
+++ b/net/rds/ib_fmr.c
@@ -44,6 +44,17 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
else
pool = rds_ibdev->mr_1m_pool;
+ if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+ /* Switch pools if one of the pool is reaching upper limit */
+ if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ pool = rds_ibdev->mr_1m_pool;
+ else
+ pool = rds_ibdev->mr_8k_pool;
+ }
+
ibmr = rds_ib_try_reuse_ibmr(pool);
if (ibmr)
return ibmr;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 63c8d107adcf..d664e9ade74d 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -454,9 +454,6 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
struct rds_ib_mr *ibmr = NULL;
int iter = 0;
- if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10)
- queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
-
while (1) {
ibmr = rds_ib_reuse_mr(pool);
if (ibmr)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index c96f63ffe31e..e274bc6e1458 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1301,12 +1301,6 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return put_user(amount, (unsigned int __user *) argp);
}
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *) argp);
-
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *) argp);
-
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFDSTADDR:
@@ -1474,6 +1468,7 @@ static const struct proto_ops rose_proto_ops = {
.getname = rose_getname,
.poll = datagram_poll,
.ioctl = rose_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = rose_listen,
.shutdown = sock_no_shutdown,
.setsockopt = rose_setsockopt,
diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c
index 7af4f99c4a93..094a6621f8e8 100644
--- a/net/rose/rose_loopback.c
+++ b/net/rose/rose_loopback.c
@@ -16,6 +16,7 @@
#include <linux/init.h>
static struct sk_buff_head loopback_queue;
+#define ROSE_LOOPBACK_LIMIT 1000
static struct timer_list loopback_timer;
static void rose_set_loopback_timer(void);
@@ -35,29 +36,27 @@ static int rose_loopback_running(void)
int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh)
{
- struct sk_buff *skbn;
+ struct sk_buff *skbn = NULL;
- skbn = skb_clone(skb, GFP_ATOMIC);
+ if (skb_queue_len(&loopback_queue) < ROSE_LOOPBACK_LIMIT)
+ skbn = skb_clone(skb, GFP_ATOMIC);
- kfree_skb(skb);
-
- if (skbn != NULL) {
+ if (skbn) {
+ consume_skb(skb);
skb_queue_tail(&loopback_queue, skbn);
if (!rose_loopback_running())
rose_set_loopback_timer();
+ } else {
+ kfree_skb(skb);
}
return 1;
}
-
static void rose_set_loopback_timer(void)
{
- del_timer(&loopback_timer);
-
- loopback_timer.expires = jiffies + 10;
- add_timer(&loopback_timer);
+ mod_timer(&loopback_timer, jiffies + 10);
}
static void rose_loopback_timer(struct timer_list *unused)
@@ -68,8 +67,12 @@ static void rose_loopback_timer(struct timer_list *unused)
struct sock *sk;
unsigned short frametype;
unsigned int lci_i, lci_o;
+ int count;
- while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+ for (count = 0; count < ROSE_LOOPBACK_LIMIT; count++) {
+ skb = skb_dequeue(&loopback_queue);
+ if (!skb)
+ return;
if (skb->len < ROSE_MIN_LEN) {
kfree_skb(skb);
continue;
@@ -106,6 +109,8 @@ static void rose_loopback_timer(struct timer_list *unused)
kfree_skb(skb);
}
}
+ if (!skb_queue_empty(&loopback_queue))
+ mod_timer(&loopback_timer, jiffies + 1);
}
void __exit rose_loopback_clear(void)
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 96f2952bbdfd..ae8c5d7f3bf1 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -135,7 +135,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
struct rxrpc_local *local;
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- u16 service_id = srx->srx_service;
+ u16 service_id;
int ret;
_enter("%p,%p,%d", rx, saddr, len);
@@ -143,6 +143,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
ret = rxrpc_validate_address(rx, srx, len);
if (ret < 0)
goto error;
+ service_id = srx->srx_service;
lock_sock(&rx->sk);
@@ -370,18 +371,22 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call);
* rxrpc_kernel_check_life - Check to see whether a call is still alive
* @sock: The socket the call is on
* @call: The call to check
+ * @_life: Where to store the life value
*
* Allow a kernel service to find out whether a call is still alive - ie. we're
- * getting ACKs from the server. Returns a number representing the life state
- * which can be compared to that returned by a previous call.
+ * getting ACKs from the server. Passes back in *_life a number representing
+ * the life state which can be compared to that returned by a previous call and
+ * return true if the call is still alive.
*
* If the life state stalls, rxrpc_kernel_probe_life() should be called and
* then 2RTT waited.
*/
-u32 rxrpc_kernel_check_life(const struct socket *sock,
- const struct rxrpc_call *call)
+bool rxrpc_kernel_check_life(const struct socket *sock,
+ const struct rxrpc_call *call,
+ u32 *_life)
{
- return call->acks_latest;
+ *_life = call->acks_latest;
+ return call->state != RXRPC_CALL_COMPLETE;
}
EXPORT_SYMBOL(rxrpc_kernel_check_life);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 4b1a534d290a..062ca9dc29b8 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -654,6 +654,7 @@ struct rxrpc_call {
u8 ackr_reason; /* reason to ACK */
u16 ackr_skew; /* skew on packet being ACK'd */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
+ rxrpc_serial_t ackr_first_seq; /* first sequence number received */
rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index b6fca8ebb117..8d31fb4c51e1 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -153,7 +153,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
* pass a connection-level abort onto all calls on that connection
*/
static void rxrpc_abort_calls(struct rxrpc_connection *conn,
- enum rxrpc_call_completion compl)
+ enum rxrpc_call_completion compl,
+ rxrpc_serial_t serial)
{
struct rxrpc_call *call;
int i;
@@ -173,6 +174,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
call->call_id, 0,
conn->abort_code,
conn->error);
+ else
+ trace_rxrpc_rx_abort(call, serial,
+ conn->abort_code);
if (rxrpc_set_call_completion(call, compl,
conn->abort_code,
conn->error))
@@ -213,8 +217,6 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
conn->state = RXRPC_CONN_LOCALLY_ABORTED;
spin_unlock_bh(&conn->state_lock);
- rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED);
-
msg.msg_name = &conn->params.peer->srx.transport;
msg.msg_namelen = conn->params.peer->srx.transport_len;
msg.msg_control = NULL;
@@ -242,6 +244,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
len = iov[0].iov_len + iov[1].iov_len;
serial = atomic_inc_return(&conn->serial);
+ rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial);
whdr.serial = htonl(serial);
_proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code);
@@ -321,7 +324,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
conn->error = -ECONNABORTED;
conn->abort_code = abort_code;
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
- rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED);
+ rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial);
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 9128aa0e40aa..c2c35cf4e308 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -837,7 +837,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
u8 acks[RXRPC_MAXACKS];
} buf;
rxrpc_serial_t acked_serial;
- rxrpc_seq_t first_soft_ack, hard_ack;
+ rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
int nr_acks, offset, ioffset;
_enter("");
@@ -851,13 +851,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
acked_serial = ntohl(buf.ack.serial);
first_soft_ack = ntohl(buf.ack.firstPacket);
+ prev_pkt = ntohl(buf.ack.previousPacket);
hard_ack = first_soft_ack - 1;
nr_acks = buf.ack.nAcks;
summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
buf.ack.reason : RXRPC_ACK__INVALID);
trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
- first_soft_ack, ntohl(buf.ack.previousPacket),
+ first_soft_ack, prev_pkt,
summary.ack_reason, nr_acks);
if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
@@ -878,8 +879,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_propose_ack_respond_to_ack);
}
- /* Discard any out-of-order or duplicate ACKs. */
- if (before_eq(sp->hdr.serial, call->acks_latest))
+ /* Discard any out-of-order or duplicate ACKs (outside lock). */
+ if (before(first_soft_ack, call->ackr_first_seq) ||
+ before(prev_pkt, call->ackr_prev_seq))
return;
buf.info.rxMTU = 0;
@@ -890,12 +892,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
spin_lock(&call->input_lock);
- /* Discard any out-of-order or duplicate ACKs. */
- if (before_eq(sp->hdr.serial, call->acks_latest))
+ /* Discard any out-of-order or duplicate ACKs (inside lock). */
+ if (before(first_soft_ack, call->ackr_first_seq) ||
+ before(prev_pkt, call->ackr_prev_seq))
goto out;
call->acks_latest_ts = skb->tstamp;
call->acks_latest = sp->hdr.serial;
+ call->ackr_first_seq = first_soft_ack;
+ call->ackr_prev_seq = prev_pkt;
+
/* Parse rwind and mtu sizes if provided. */
if (buf.info.rxMTU)
rxrpc_input_ackinfo(call, skb, &buf.info);
@@ -1155,19 +1161,19 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
* handle data received on the local endpoint
* - may be called in interrupt context
*
- * The socket is locked by the caller and this prevents the socket from being
- * shut down and the local endpoint from going away, thus sk_user_data will not
- * be cleared until this function returns.
+ * [!] Note that as this is called from the encap_rcv hook, the socket is not
+ * held locked by the caller and nothing prevents sk_user_data on the UDP from
+ * being cleared in the middle of processing this function.
*
* Called with the RCU read lock held from the IP layer via UDP.
*/
int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
{
+ struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk);
struct rxrpc_connection *conn;
struct rxrpc_channel *chan;
struct rxrpc_call *call = NULL;
struct rxrpc_skb_priv *sp;
- struct rxrpc_local *local = udp_sk->sk_user_data;
struct rxrpc_peer *peer = NULL;
struct rxrpc_sock *rx = NULL;
unsigned int channel;
@@ -1175,6 +1181,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
_enter("%p", udp_sk);
+ if (unlikely(!local)) {
+ kfree_skb(skb);
+ return 0;
+ }
if (skb->tstamp == 0)
skb->tstamp = ktime_get_real();
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 9157fd00dce3..b67dec945498 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -304,7 +304,8 @@ nomem:
ret = -ENOMEM;
sock_error:
mutex_unlock(&rxnet->local_mutex);
- kfree(local);
+ if (local)
+ call_rcu(&local->rcu, rxrpc_local_rcu);
_leave(" = %d", ret);
return ERR_PTR(ret);
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index bc05af89fc38..6e84d878053c 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -157,6 +157,11 @@ void rxrpc_error_report(struct sock *sk)
_enter("%p{%d}", sk, local->debug_id);
+ /* Clear the outstanding error value on the socket so that it doesn't
+ * cause kernel_sendmsg() to return it later.
+ */
+ sock_error(sk);
+
skb = sock_dequeue_err_skb(sk);
if (!skb) {
_leave("UDP socket errqueue empty");
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 46c9312085b1..bec64deb7b0a 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -152,12 +152,13 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call,
}
/*
- * Queue a DATA packet for transmission, set the resend timeout and send the
- * packet immediately
+ * Queue a DATA packet for transmission, set the resend timeout and send
+ * the packet immediately. Returns the error from rxrpc_send_data_packet()
+ * in case the caller wants to do something with it.
*/
-static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
- struct sk_buff *skb, bool last,
- rxrpc_notify_end_tx_t notify_end_tx)
+static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
+ struct sk_buff *skb, bool last,
+ rxrpc_notify_end_tx_t notify_end_tx)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
unsigned long now;
@@ -250,7 +251,8 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
out:
rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
- _leave("");
+ _leave(" = %d", ret);
+ return ret;
}
/*
@@ -423,9 +425,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
if (ret < 0)
goto out;
- rxrpc_queue_packet(rx, call, skb,
- !msg_data_left(msg) && !more,
- notify_end_tx);
+ ret = rxrpc_queue_packet(rx, call, skb,
+ !msg_data_left(msg) && !more,
+ notify_end_tx);
+ /* Should check for failure here */
skb = NULL;
}
} while (msg_data_left(msg) > 0);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2763176e369c..0d8968803e98 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -90,6 +90,7 @@ struct cls_fl_head {
struct rhashtable ht;
spinlock_t masks_lock; /* Protect masks list */
struct list_head masks;
+ struct list_head hw_filters;
struct rcu_work rwork;
struct idr handle_idr;
};
@@ -102,6 +103,7 @@ struct cls_fl_filter {
struct tcf_result res;
struct fl_flow_key key;
struct list_head list;
+ struct list_head hw_list;
u32 handle;
u32 flags;
u32 in_hw_count;
@@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
spin_lock_init(&head->masks_lock);
INIT_LIST_HEAD_RCU(&head->masks);
+ INIT_LIST_HEAD(&head->hw_filters);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
@@ -336,8 +339,7 @@ static void fl_mask_free_work(struct work_struct *work)
fl_mask_free(mask);
}
-static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
- bool async)
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
{
if (!refcount_dec_and_test(&mask->refcnt))
return false;
@@ -348,14 +350,21 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
list_del_rcu(&mask->list);
spin_unlock(&head->masks_lock);
- if (async)
- tcf_queue_work(&mask->rwork, fl_mask_free_work);
- else
- fl_mask_free(mask);
+ tcf_queue_work(&mask->rwork, fl_mask_free_work);
return true;
}
+static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
+{
+ /* Flower classifier only changes root pointer during init and destroy.
+ * Users must obtain reference to tcf_proto instance before calling its
+ * API, so tp->root pointer is protected from concurrent call to
+ * fl_destroy() by reference counting.
+ */
+ return rcu_dereference_raw(tp->root);
+}
+
static void __fl_destroy_filter(struct cls_fl_filter *f)
{
tcf_exts_destroy(&f->exts);
@@ -386,6 +395,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
spin_lock(&tp->lock);
+ list_del_init(&f->hw_list);
tcf_block_offload_dec(block, &f->flags);
spin_unlock(&tp->lock);
@@ -397,6 +407,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
bool skip_sw = tc_skip_sw(f->flags);
@@ -448,6 +459,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
goto errout;
}
+ spin_lock(&tp->lock);
+ list_add(&f->hw_list, &head->hw_filters);
+ spin_unlock(&tp->lock);
errout:
if (!rtnl_held)
rtnl_unlock();
@@ -479,23 +493,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
rtnl_unlock();
}
-static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
-{
- /* Flower classifier only changes root pointer during init and destroy.
- * Users must obtain reference to tcf_proto instance before calling its
- * API, so tp->root pointer is protected from concurrent call to
- * fl_destroy() by reference counting.
- */
- return rcu_dereference_raw(tp->root);
-}
-
static void __fl_put(struct cls_fl_filter *f)
{
if (!refcount_dec_and_test(&f->refcnt))
return;
- WARN_ON(!f->deleted);
-
if (tcf_exts_get_net(&f->exts))
tcf_queue_work(&f->rwork, fl_destroy_filter_work);
else
@@ -538,7 +540,6 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = fl_head_dereference(tp);
- bool async = tcf_exts_get_net(&f->exts);
*last = false;
@@ -555,7 +556,7 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
list_del_rcu(&f->list);
spin_unlock(&tp->lock);
- *last = fl_mask_put(head, f->mask, async);
+ *last = fl_mask_put(head, f->mask);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f, rtnl_held, extack);
tcf_unbind_filter(tp, &f->res);
@@ -1466,9 +1467,9 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew,
struct fl_flow_mask *mask = fnew->mask;
int err;
- err = rhashtable_insert_fast(&mask->ht,
- &fnew->ht_node,
- mask->filter_ht_params);
+ err = rhashtable_lookup_insert_fast(&mask->ht,
+ &fnew->ht_node,
+ mask->filter_ht_params);
if (err) {
*in_ht = false;
/* It is okay if filter with same key exists when
@@ -1527,6 +1528,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
err = -ENOBUFS;
goto errout_tb;
}
+ INIT_LIST_HEAD(&fnew->hw_list);
refcount_set(&fnew->refcnt, 1);
err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
@@ -1574,7 +1576,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout_hw;
}
- refcount_inc(&fnew->refcnt);
if (fold) {
/* Fold filter was deleted concurrently. Retry lookup. */
if (fold->deleted) {
@@ -1596,6 +1597,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
in_ht = true;
}
+ refcount_inc(&fnew->refcnt);
rhashtable_remove_fast(&fold->mask->ht,
&fold->ht_node,
fold->mask->filter_ht_params);
@@ -1605,11 +1607,10 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
spin_unlock(&tp->lock);
- fl_mask_put(head, fold->mask, true);
+ fl_mask_put(head, fold->mask);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
tcf_unbind_filter(tp, &fold->res);
- tcf_exts_get_net(&fold->exts);
/* Caller holds reference to fold, so refcnt is always > 0
* after this.
*/
@@ -1637,6 +1638,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout_hw;
+ refcount_inc(&fnew->refcnt);
fnew->handle = handle;
list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
spin_unlock(&tp->lock);
@@ -1648,18 +1650,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
kfree(mask);
return 0;
+errout_ht:
+ spin_lock(&tp->lock);
errout_hw:
+ fnew->deleted = true;
spin_unlock(&tp->lock);
if (!tc_skip_hw(fnew->flags))
fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
-errout_ht:
if (in_ht)
rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
fnew->mask->filter_ht_params);
errout_mask:
- fl_mask_put(head, fnew->mask, true);
+ fl_mask_put(head, fnew->mask);
errout:
- tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
+ __fl_put(fnew);
errout_tb:
kfree(tb);
errout_mask_alloc:
@@ -1704,19 +1708,46 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
}
}
+static struct cls_fl_filter *
+fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ if (list_empty(&head->hw_filters)) {
+ spin_unlock(&tp->lock);
+ return NULL;
+ }
+
+ if (!f)
+ f = list_entry(&head->hw_filters, struct cls_fl_filter,
+ hw_list);
+ list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
+ if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
+ spin_unlock(&tp->lock);
+ return f;
+ }
+ }
+
+ spin_unlock(&tp->lock);
+ return NULL;
+}
+
static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
- unsigned long handle = 0;
- struct cls_fl_filter *f;
+ struct cls_fl_filter *f = NULL;
int err;
- while ((f = fl_get_next_filter(tp, &handle))) {
- if (tc_skip_hw(f->flags))
- goto next_flow;
+ /* hw_filters list can only be changed by hw offload functions after
+ * obtaining rtnl lock. Make sure it is not changed while reoffload is
+ * iterating it.
+ */
+ ASSERT_RTNL();
+ while ((f = fl_get_next_hw_filter(tp, f, add))) {
cls_flower.rule =
flow_rule_alloc(tcf_exts_num_actions(&f->exts));
if (!cls_flower.rule) {
@@ -1762,7 +1793,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
add);
spin_unlock(&tp->lock);
next_flow:
- handle++;
__fl_put(f);
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index fb8f138b9776..c126b9f78d6e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -998,6 +998,19 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
qdisc_put(old);
}
+static void qdisc_clear_nolock(struct Qdisc *sch)
+{
+ sch->flags &= ~TCQ_F_NOLOCK;
+ if (!(sch->flags & TCQ_F_CPUSTATS))
+ return;
+
+ free_percpu(sch->cpu_bstats);
+ free_percpu(sch->cpu_qstats);
+ sch->cpu_bstats = NULL;
+ sch->cpu_qstats = NULL;
+ sch->flags &= ~TCQ_F_CPUSTATS;
+}
+
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
* to device "dev".
*
@@ -1076,7 +1089,7 @@ skip:
/* Only support running class lockless if parent is lockless */
if (new && (new->flags & TCQ_F_NOLOCK) &&
parent && !(parent->flags & TCQ_F_NOLOCK))
- new->flags &= ~TCQ_F_NOLOCK;
+ qdisc_clear_nolock(new);
if (!cops || !cops->graft)
return -EOPNOTSUPP;
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index c6a502933fe7..f68fd7a0e038 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -61,16 +61,20 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <net/netevent.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
+static LIST_HEAD(cbs_list);
+static DEFINE_SPINLOCK(cbs_list_lock);
+
#define BYTES_PER_KBIT (1000LL / 8)
struct cbs_sched_data {
bool offload;
int queue;
- s64 port_rate; /* in bytes/s */
+ atomic64_t port_rate; /* in bytes/s */
s64 last; /* timestamp in ns */
s64 credits; /* in bytes */
s32 locredit; /* in bytes */
@@ -82,6 +86,7 @@ struct cbs_sched_data {
struct sk_buff **to_free);
struct sk_buff *(*dequeue)(struct Qdisc *sch);
struct Qdisc *qdisc;
+ struct list_head cbs_list;
};
static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -181,6 +186,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
+ if (atomic64_read(&q->port_rate) == -1) {
+ WARN_ONCE(1, "cbs: dequeue() called with unknown port rate.");
+ return NULL;
+ }
+
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -207,7 +217,8 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
/* As sendslope is a negative number, this will decrease the
* amount of q->credits.
*/
- credits = credits_from_len(len, q->sendslope, q->port_rate);
+ credits = credits_from_len(len, q->sendslope,
+ atomic64_read(&q->port_rate));
credits += q->credits;
q->credits = max_t(s64, credits, q->locredit);
@@ -294,6 +305,50 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
return 0;
}
+static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
+{
+ struct ethtool_link_ksettings ecmd;
+ int port_rate = -1;
+
+ if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
+ ecmd.base.speed != SPEED_UNKNOWN)
+ port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT;
+
+ atomic64_set(&q->port_rate, port_rate);
+ netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->port_rate),
+ ecmd.base.speed);
+}
+
+static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct cbs_sched_data *q;
+ struct net_device *qdev;
+ bool found = false;
+
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE;
+
+ spin_lock(&cbs_list_lock);
+ list_for_each_entry(q, &cbs_list, cbs_list) {
+ qdev = qdisc_dev(q->qdisc);
+ if (qdev == dev) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&cbs_list_lock);
+
+ if (found)
+ cbs_set_port_rate(dev, q);
+
+ return NOTIFY_DONE;
+}
+
static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -315,16 +370,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
qopt = nla_data(tb[TCA_CBS_PARMS]);
if (!qopt->offload) {
- struct ethtool_link_ksettings ecmd;
- s64 link_speed;
-
- if (!__ethtool_get_link_ksettings(dev, &ecmd))
- link_speed = ecmd.base.speed;
- else
- link_speed = SPEED_1000;
-
- q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
-
+ cbs_set_port_rate(dev, q);
cbs_disable_offload(dev, q);
} else {
err = cbs_enable_offload(dev, q, qopt, extack);
@@ -347,6 +393,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
{
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int err;
if (!opt) {
NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory");
@@ -367,7 +414,17 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
qdisc_watchdog_init(&q->watchdog, sch);
- return cbs_change(sch, opt, extack);
+ err = cbs_change(sch, opt, extack);
+ if (err)
+ return err;
+
+ if (!q->offload) {
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+ }
+
+ return 0;
}
static void cbs_destroy(struct Qdisc *sch)
@@ -375,8 +432,11 @@ static void cbs_destroy(struct Qdisc *sch)
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- qdisc_watchdog_cancel(&q->watchdog);
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+ qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
if (q->qdisc)
@@ -487,14 +547,24 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
+static struct notifier_block cbs_device_notifier = {
+ .notifier_call = cbs_dev_notifier,
+};
+
static int __init cbs_module_init(void)
{
+ int err = register_netdevice_notifier(&cbs_device_notifier);
+
+ if (err)
+ return err;
+
return register_qdisc(&cbs_qdisc_ops);
}
static void __exit cbs_module_exit(void)
{
unregister_qdisc(&cbs_qdisc_ops);
+ unregister_netdevice_notifier(&cbs_device_notifier);
}
module_init(cbs_module_init)
module_exit(cbs_module_exit)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 81356ef38d1d..848aab3693bd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -68,7 +68,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
skb = __skb_dequeue(&q->skb_bad_txq);
if (qdisc_is_percpu_stats(q)) {
qdisc_qstats_cpu_backlog_dec(q, skb);
- qdisc_qstats_atomic_qlen_dec(q);
+ qdisc_qstats_cpu_qlen_dec(q);
} else {
qdisc_qstats_backlog_dec(q, skb);
q->q.qlen--;
@@ -108,7 +108,7 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
if (qdisc_is_percpu_stats(q)) {
qdisc_qstats_cpu_backlog_inc(q, skb);
- qdisc_qstats_atomic_qlen_inc(q);
+ qdisc_qstats_cpu_qlen_inc(q);
} else {
qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++;
@@ -118,52 +118,36 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
spin_unlock(lock);
}
-static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- while (skb) {
- struct sk_buff *next = skb->next;
-
- __skb_queue_tail(&q->gso_skb, skb);
- q->qstats.requeues++;
- qdisc_qstats_backlog_inc(q, skb);
- q->q.qlen++; /* it's still part of the queue */
+ spinlock_t *lock = NULL;
- skb = next;
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
}
- __netif_schedule(q);
-
- return 0;
-}
-static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
-{
- spinlock_t *lock = qdisc_lock(q);
-
- spin_lock(lock);
while (skb) {
struct sk_buff *next = skb->next;
__skb_queue_tail(&q->gso_skb, skb);
- qdisc_qstats_cpu_requeues_inc(q);
- qdisc_qstats_cpu_backlog_inc(q, skb);
- qdisc_qstats_atomic_qlen_inc(q);
+ /* it's still part of the queue */
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_requeues_inc(q);
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_cpu_qlen_inc(q);
+ } else {
+ q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
+ q->q.qlen++;
+ }
skb = next;
}
- spin_unlock(lock);
-
+ if (lock)
+ spin_unlock(lock);
__netif_schedule(q);
-
- return 0;
-}
-
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
-{
- if (q->flags & TCQ_F_NOLOCK)
- return dev_requeue_skb_locked(skb, q);
- else
- return __dev_requeue_skb(skb, q);
}
static void try_bulk_dequeue_skb(struct Qdisc *q,
@@ -252,7 +236,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
skb = __skb_dequeue(&q->gso_skb);
if (qdisc_is_percpu_stats(q)) {
qdisc_qstats_cpu_backlog_dec(q, skb);
- qdisc_qstats_atomic_qlen_dec(q);
+ qdisc_qstats_cpu_qlen_dec(q);
} else {
qdisc_qstats_backlog_dec(q, skb);
q->q.qlen--;
@@ -645,11 +629,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
if (unlikely(err))
return qdisc_drop_cpu(skb, qdisc, to_free);
- qdisc_qstats_atomic_qlen_inc(qdisc);
- /* Note: skb can not be used after skb_array_produce(),
- * so we better not use qdisc_qstats_cpu_backlog_inc()
- */
- this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
+ qdisc_update_stats_at_enqueue(qdisc, pkt_len);
return NET_XMIT_SUCCESS;
}
@@ -668,9 +648,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
skb = __skb_array_consume(q);
}
if (likely(skb)) {
- qdisc_qstats_cpu_backlog_dec(qdisc, skb);
- qdisc_bstats_cpu_update(qdisc, skb);
- qdisc_qstats_atomic_qlen_dec(qdisc);
+ qdisc_update_stats_at_dequeue(qdisc, skb);
} else {
qdisc->empty = true;
}
@@ -716,6 +694,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
q->backlog = 0;
+ q->qlen = 0;
}
}
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index c7041999eb5d..df848a36b222 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -13,6 +13,7 @@
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <net/netlink.h>
@@ -20,6 +21,9 @@
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
+static LIST_HEAD(taprio_list);
+static DEFINE_SPINLOCK(taprio_list_lock);
+
#define TAPRIO_ALL_GATES_OPEN -1
struct sched_entry {
@@ -42,9 +46,9 @@ struct taprio_sched {
struct Qdisc *root;
s64 base_time;
int clockid;
- int picos_per_byte; /* Using picoseconds because for 10Gbps+
- * speeds it's sub-nanoseconds per byte
- */
+ atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
+ * speeds it's sub-nanoseconds per byte
+ */
size_t num_entries;
/* Protects the update side of the RCU protected current_entry */
@@ -53,6 +57,7 @@ struct taprio_sched {
struct list_head entries;
ktime_t (*get_time)(void);
struct hrtimer advance_timer;
+ struct list_head taprio_list;
};
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -85,7 +90,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
- gate_mask = entry ? entry->gate_mask : -1;
+ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
rcu_read_unlock();
if (!gate_mask)
@@ -107,7 +112,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
tc = netdev_get_prio_tc_map(dev, prio);
if (!(gate_mask & BIT(tc)))
- return NULL;
+ continue;
return skb;
}
@@ -117,7 +122,14 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
static inline int length_to_duration(struct taprio_sched *q, int len)
{
- return (len * q->picos_per_byte) / 1000;
+ return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+{
+ atomic_set(&entry->budget,
+ div64_u64((u64)entry->interval * 1000,
+ atomic64_read(&q->picos_per_byte)));
}
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
@@ -129,6 +141,11 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
u32 gate_mask;
int i;
+ if (atomic64_read(&q->picos_per_byte) == -1) {
+ WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte.");
+ return NULL;
+ }
+
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
/* if there's no entry, it means that the schedule didn't
@@ -171,12 +188,12 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
*/
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
ktime_after(guard, entry->close_time))
- return NULL;
+ continue;
/* ... and no budget. */
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
atomic_sub_return(len, &entry->budget) < 0)
- return NULL;
+ continue;
skb = child->ops->dequeue(child);
if (unlikely(!skb))
@@ -192,14 +209,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
return NULL;
}
-static bool should_restart_cycle(const struct taprio_sched *q,
- const struct sched_entry *entry)
-{
- WARN_ON(!entry);
-
- return list_is_last(&entry->list, &q->entries);
-}
-
static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
@@ -223,7 +232,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
goto first_run;
}
- if (should_restart_cycle(q, entry))
+ if (list_is_last(&entry->list, &q->entries))
next = list_first_entry(&q->entries, struct sched_entry,
list);
else
@@ -232,8 +241,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
close_time = ktime_add_ns(entry->close_time, next->interval);
next->close_time = close_time;
- atomic_set(&next->budget,
- (next->interval * 1000) / q->picos_per_byte);
+ taprio_set_budget(q, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
@@ -523,7 +531,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
return 0;
}
-static ktime_t taprio_get_start_time(struct Qdisc *sch)
+static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start)
{
struct taprio_sched *q = qdisc_priv(sch);
struct sched_entry *entry;
@@ -531,27 +539,33 @@ static ktime_t taprio_get_start_time(struct Qdisc *sch)
s64 n;
base = ns_to_ktime(q->base_time);
- cycle = 0;
+ now = q->get_time();
+
+ if (ktime_after(base, now)) {
+ *start = base;
+ return 0;
+ }
/* Calculate the cycle_time, by summing all the intervals.
*/
+ cycle = 0;
list_for_each_entry(entry, &q->entries, list)
cycle = ktime_add_ns(cycle, entry->interval);
- if (!cycle)
- return base;
-
- now = q->get_time();
-
- if (ktime_after(base, now))
- return base;
+ /* The qdisc is expected to have at least one sched_entry. Moreover,
+ * any entry must have 'interval' > 0. Thus if the cycle time is zero,
+ * something went really wrong. In that case, we should warn about this
+ * inconsistent state and return error.
+ */
+ if (WARN_ON(!cycle))
+ return -EFAULT;
/* Schedule the start time for the beginning of the next
* cycle.
*/
n = div64_s64(ktime_sub_ns(now, base), cycle);
-
- return ktime_add_ns(base, (n + 1) * cycle);
+ *start = ktime_add_ns(base, (n + 1) * cycle);
+ return 0;
}
static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
@@ -566,8 +580,7 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
list);
first->close_time = ktime_add_ns(start, first->interval);
- atomic_set(&first->budget,
- (first->interval * 1000) / q->picos_per_byte);
+ taprio_set_budget(q, first);
rcu_assign_pointer(q->current_entry, NULL);
spin_unlock_irqrestore(&q->current_entry_lock, flags);
@@ -575,6 +588,52 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
}
+static void taprio_set_picos_per_byte(struct net_device *dev,
+ struct taprio_sched *q)
+{
+ struct ethtool_link_ksettings ecmd;
+ int picos_per_byte = -1;
+
+ if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
+ ecmd.base.speed != SPEED_UNKNOWN)
+ picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
+ ecmd.base.speed * 1000 * 1000);
+
+ atomic64_set(&q->picos_per_byte, picos_per_byte);
+ netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->picos_per_byte),
+ ecmd.base.speed);
+}
+
+static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net_device *qdev;
+ struct taprio_sched *q;
+ bool found = false;
+
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE;
+
+ spin_lock(&taprio_list_lock);
+ list_for_each_entry(q, &taprio_list, taprio_list) {
+ qdev = qdisc_dev(q->root);
+ if (qdev == dev) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&taprio_list_lock);
+
+ if (found)
+ taprio_set_picos_per_byte(dev, q);
+
+ return NOTIFY_DONE;
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -582,9 +641,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
- struct ethtool_link_ksettings ecmd;
int i, err, size;
- s64 link_speed;
ktime_t start;
err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
@@ -592,7 +649,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (err < 0)
return err;
- err = -EINVAL;
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
@@ -657,17 +713,13 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
mqprio->prio_tc_map[i]);
}
- if (!__ethtool_get_link_ksettings(dev, &ecmd))
- link_speed = ecmd.base.speed;
- else
- link_speed = SPEED_1000;
-
- q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
- link_speed * 1000 * 1000);
+ taprio_set_picos_per_byte(dev, q);
- start = taprio_get_start_time(sch);
- if (!start)
- return 0;
+ err = taprio_get_start_time(sch, &start);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
+ return err;
+ }
taprio_start_sched(sch, start);
@@ -681,6 +733,10 @@ static void taprio_destroy(struct Qdisc *sch)
struct sched_entry *entry, *n;
unsigned int i;
+ spin_lock(&taprio_list_lock);
+ list_del(&q->taprio_list);
+ spin_unlock(&taprio_list_lock);
+
hrtimer_cancel(&q->advance_timer);
if (q->qdiscs) {
@@ -735,6 +791,10 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt)
return -EINVAL;
+ spin_lock(&taprio_list_lock);
+ list_add(&q->taprio_list, &taprio_list);
+ spin_unlock(&taprio_list_lock);
+
return taprio_change(sch, opt, extack);
}
@@ -947,14 +1007,24 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
+static struct notifier_block taprio_device_notifier = {
+ .notifier_call = taprio_dev_notifier,
+};
+
static int __init taprio_module_init(void)
{
+ int err = register_netdevice_notifier(&taprio_device_notifier);
+
+ if (err)
+ return err;
+
return register_qdisc(&taprio_qdisc_ops);
}
static void __exit taprio_module_exit(void)
{
unregister_qdisc(&taprio_qdisc_ops);
+ unregister_netdevice_notifier(&taprio_device_notifier);
}
module_init(taprio_module_init);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 6200cd2b4b99..188c47eb206e 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1030,6 +1030,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
.getname = sctp_getname,
.poll = sctp_poll,
.ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sctp_inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 951afdeea5e9..f0631bf486b6 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1026,6 +1026,7 @@ static const struct proto_ops inet_seqpacket_ops = {
.getname = inet_getname, /* Semantics are different. */
.poll = sctp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sctp_inet_listen,
.shutdown = inet_shutdown, /* Looks harmless. */
.setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index c9ae3404b1bb..7dfc34b28f4f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6412,13 +6412,15 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our
* memory usage too much
*/
- if (*sk->sk_prot_creator->memory_pressure) {
+ if (sk_under_memory_pressure(sk)) {
if (sctp_tsnmap_has_gap(map) &&
(sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
pr_debug("%s: under pressure, reneging for tsn:%u\n",
__func__, tsn);
deliver = SCTP_CMD_RENEGE;
- }
+ } else {
+ sk_mem_reclaim(sk);
+ }
}
/*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9874e60c9b0d..e4e892cc5644 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1913,7 +1913,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (sctp_wspace(asoc) < (int)msg_len)
sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
- if (sctp_wspace(asoc) <= 0) {
+ if (sk_under_memory_pressure(sk))
+ sk_mem_reclaim(sk);
+
+ if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
if (err)
@@ -4847,7 +4850,8 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr,
}
/* Validate addr_len before calling common connect/connectx routine. */
- af = sctp_get_af_specific(addr->sa_family);
+ af = addr_len < offsetofend(struct sockaddr, sa_family) ? NULL :
+ sctp_get_af_specific(addr->sa_family);
if (!af || addr_len < af->sockaddr_len) {
err = -EINVAL;
} else {
@@ -8930,7 +8934,10 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
goto do_error;
if (signal_pending(current))
goto do_interrupted;
- if ((int)msg_len <= sctp_wspace(asoc))
+ if (sk_under_memory_pressure(sk))
+ sk_mem_reclaim(sk);
+ if ((int)msg_len <= sctp_wspace(asoc) &&
+ sk_wmem_schedule(sk, msg_len))
break;
/* Let another process have a go. Since we are going
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 102c6fefe38c..25e0b7e5189c 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -484,14 +484,15 @@ static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
}
static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
- struct sctp_ulpevent *event)
+ struct sk_buff_head *skb_list)
{
- struct sk_buff *skb = sctp_event2skb(event);
struct sock *sk = ulpq->asoc->base.sk;
struct sctp_sock *sp = sctp_sk(sk);
- struct sk_buff_head *skb_list;
+ struct sctp_ulpevent *event;
+ struct sk_buff *skb;
- skb_list = (struct sk_buff_head *)skb->prev;
+ skb = __skb_peek(skb_list);
+ event = sctp_skb2event(skb);
if (sk->sk_shutdown & RCV_SHUTDOWN &&
(sk->sk_shutdown & SEND_SHUTDOWN ||
@@ -858,19 +859,24 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
if (!(event->msg_flags & SCTP_DATA_UNORDERED)) {
event = sctp_intl_reasm(ulpq, event);
- if (event && event->msg_flags & MSG_EOR) {
+ if (event) {
skb_queue_head_init(&temp);
__skb_queue_tail(&temp, sctp_event2skb(event));
- event = sctp_intl_order(ulpq, event);
+ if (event->msg_flags & MSG_EOR)
+ event = sctp_intl_order(ulpq, event);
}
} else {
event = sctp_intl_reasm_uo(ulpq, event);
+ if (event) {
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ }
}
if (event) {
event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
- sctp_enqueue_event(ulpq, event);
+ sctp_enqueue_event(ulpq, &temp);
}
return event_eor;
@@ -944,20 +950,27 @@ out:
static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
struct sctp_ulpevent *event;
+ struct sk_buff_head temp;
if (!skb_queue_empty(&ulpq->reasm)) {
do {
event = sctp_intl_retrieve_first(ulpq);
- if (event)
- sctp_enqueue_event(ulpq, event);
+ if (event) {
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ sctp_enqueue_event(ulpq, &temp);
+ }
} while (event);
}
if (!skb_queue_empty(&ulpq->reasm_uo)) {
do {
event = sctp_intl_retrieve_first_uo(ulpq);
- if (event)
- sctp_enqueue_event(ulpq, event);
+ if (event) {
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ sctp_enqueue_event(ulpq, &temp);
+ }
} while (event);
}
}
@@ -1059,7 +1072,7 @@ static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
if (event) {
sctp_intl_retrieve_ordered(ulpq, event);
- sctp_enqueue_event(ulpq, event);
+ sctp_enqueue_event(ulpq, &temp);
}
}
@@ -1298,6 +1311,15 @@ static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
ntohl(skip->mid), skip->flags);
}
+static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+{
+ struct sk_buff_head temp;
+
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ return sctp_ulpq_tail_event(ulpq, &temp);
+}
+
static struct sctp_stream_interleave sctp_stream_interleave_0 = {
.data_chunk_len = sizeof(struct sctp_data_chunk),
.ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk),
@@ -1306,7 +1328,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
.assign_number = sctp_chunk_assign_ssn,
.validate_data = sctp_validate_data,
.ulpevent_data = sctp_ulpq_tail_data,
- .enqueue_event = sctp_ulpq_tail_event,
+ .enqueue_event = do_ulpq_tail_event,
.renege_events = sctp_ulpq_renege,
.start_pd = sctp_ulpq_partial_delivery,
.abort_pd = sctp_ulpq_abort_pd,
@@ -1317,6 +1339,16 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
.handle_ftsn = sctp_handle_fwdtsn,
};
+static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff_head temp;
+
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ return sctp_enqueue_event(ulpq, &temp);
+}
+
static struct sctp_stream_interleave sctp_stream_interleave_1 = {
.data_chunk_len = sizeof(struct sctp_idata_chunk),
.ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk),
@@ -1325,7 +1357,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
.assign_number = sctp_chunk_assign_mid,
.validate_data = sctp_validate_idata,
.ulpevent_data = sctp_ulpevent_idata,
- .enqueue_event = sctp_enqueue_event,
+ .enqueue_event = do_sctp_enqueue_event,
.renege_events = sctp_renege_events,
.start_pd = sctp_intl_start_pd,
.abort_pd = sctp_intl_abort_pd,
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 8cb7d9858270..c2a7478587ab 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -634,8 +634,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
gfp_t gfp)
{
struct sctp_ulpevent *event = NULL;
- struct sk_buff *skb;
- size_t padding, len;
+ struct sk_buff *skb = chunk->skb;
+ struct sock *sk = asoc->base.sk;
+ size_t padding, datalen;
int rx_count;
/*
@@ -646,15 +647,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
if (asoc->ep->rcvbuf_policy)
rx_count = atomic_read(&asoc->rmem_alloc);
else
- rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+ rx_count = atomic_read(&sk->sk_rmem_alloc);
- if (rx_count >= asoc->base.sk->sk_rcvbuf) {
+ datalen = ntohs(chunk->chunk_hdr->length);
- if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
- (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
- chunk->skb->truesize)))
- goto fail;
- }
+ if (rx_count >= sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, datalen))
+ goto fail;
/* Clone the original skb, sharing the data. */
skb = skb_clone(chunk->skb, gfp);
@@ -681,8 +679,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
* The sender should never pad with more than 3 bytes. The receiver
* MUST ignore the padding bytes.
*/
- len = ntohs(chunk->chunk_hdr->length);
- padding = SCTP_PAD4(len) - len;
+ padding = SCTP_PAD4(datalen) - datalen;
/* Fixup cloned skb with just this chunks data. */
skb_trim(skb, chunk->chunk_end - padding - skb->data);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 5dde92101743..a212fe079c07 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -116,12 +116,13 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
event = sctp_ulpq_reasm(ulpq, event);
/* Do ordering if needed. */
- if ((event) && (event->msg_flags & MSG_EOR)) {
+ if (event) {
/* Create a temporary list to collect chunks on. */
skb_queue_head_init(&temp);
__skb_queue_tail(&temp, sctp_event2skb(event));
- event = sctp_ulpq_order(ulpq, event);
+ if (event->msg_flags & MSG_EOR)
+ event = sctp_ulpq_order(ulpq, event);
}
/* Send event to the ULP. 'event' is the sctp_ulpevent for
@@ -129,7 +130,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
*/
if (event) {
event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
- sctp_ulpq_tail_event(ulpq, event);
+ sctp_ulpq_tail_event(ulpq, &temp);
}
return event_eor;
@@ -193,18 +194,17 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc);
}
-/* If the SKB of 'event' is on a list, it is the first such member
- * of that list.
- */
-int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list)
{
struct sock *sk = ulpq->asoc->base.sk;
struct sctp_sock *sp = sctp_sk(sk);
- struct sk_buff_head *queue, *skb_list;
- struct sk_buff *skb = sctp_event2skb(event);
+ struct sctp_ulpevent *event;
+ struct sk_buff_head *queue;
+ struct sk_buff *skb;
int clear_pd = 0;
- skb_list = (struct sk_buff_head *) skb->prev;
+ skb = __skb_peek(skb_list);
+ event = sctp_skb2event(skb);
/* If the socket is just going to throw this away, do not
* even try to deliver it.
@@ -257,13 +257,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
}
}
- /* If we are harvesting multiple skbs they will be
- * collected on a list.
- */
- if (skb_list)
- skb_queue_splice_tail_init(skb_list, queue);
- else
- __skb_queue_tail(queue, skb);
+ skb_queue_splice_tail_init(skb_list, queue);
/* Did we just complete partial delivery and need to get
* rolling again? Move pending data to the receive
@@ -738,25 +732,25 @@ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn)
static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
{
struct sctp_ulpevent *event = NULL;
- struct sk_buff_head temp;
if (skb_queue_empty(&ulpq->reasm))
return;
while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) {
- /* Do ordering if needed. */
- if ((event) && (event->msg_flags & MSG_EOR)) {
- skb_queue_head_init(&temp);
- __skb_queue_tail(&temp, sctp_event2skb(event));
+ struct sk_buff_head temp;
+
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ /* Do ordering if needed. */
+ if (event->msg_flags & MSG_EOR)
event = sctp_ulpq_order(ulpq, event);
- }
/* Send event to the ULP. 'event' is the
* sctp_ulpevent for very first SKB on the temp' list.
*/
if (event)
- sctp_ulpq_tail_event(ulpq, event);
+ sctp_ulpq_tail_event(ulpq, &temp);
}
}
@@ -956,7 +950,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
if (event) {
/* see if we have more ordered that we can deliver */
sctp_ulpq_retrieve_ordered(ulpq, event);
- sctp_ulpq_tail_event(ulpq, event);
+ sctp_ulpq_tail_event(ulpq, &temp);
}
}
@@ -1082,7 +1076,11 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
event = sctp_ulpq_retrieve_first(ulpq);
/* Send event to the ULP. */
if (event) {
- sctp_ulpq_tail_event(ulpq, event);
+ struct sk_buff_head temp;
+
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+ sctp_ulpq_tail_event(ulpq, &temp);
sctp_ulpq_set_pd(ulpq);
return;
}
@@ -1106,7 +1104,8 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
freed += sctp_ulpq_renege_frags(ulpq, needed - freed);
}
/* If able to free enough room, accept this chunk. */
- if (freed >= needed) {
+ if (sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) &&
+ freed >= needed) {
int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp);
/*
* Enter partial delivery if chunk has not been
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 77ef53596d18..086d9913975d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -134,11 +134,9 @@ static int smc_release(struct socket *sock)
smc = smc_sk(sk);
/* cleanup for a dangling non-blocking connect */
- if (smc->connect_info && sk->sk_state == SMC_INIT)
+ if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
tcp_abort(smc->clcsock->sk, ECONNABORTED);
flush_work(&smc->connect_work);
- kfree(smc->connect_info);
- smc->connect_info = NULL;
if (sk->sk_state == SMC_LISTEN)
/* smc_close_non_accepted() is called and acquires
@@ -167,10 +165,9 @@ static int smc_release(struct socket *sock)
if (sk->sk_state == SMC_CLOSED) {
if (smc->clcsock) {
- mutex_lock(&smc->clcsock_release_lock);
- sock_release(smc->clcsock);
- smc->clcsock = NULL;
- mutex_unlock(&smc->clcsock_release_lock);
+ release_sock(sk);
+ smc_clcsock_release(smc);
+ lock_sock(sk);
}
if (!smc->use_fallback)
smc_conn_free(&smc->conn);
@@ -446,12 +443,22 @@ static void smc_link_save_peer_info(struct smc_link *link,
link->peer_mtu = clc->qp_mtu;
}
+static void smc_switch_to_fallback(struct smc_sock *smc)
+{
+ smc->use_fallback = true;
+ if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
+ smc->clcsock->file = smc->sk.sk_socket->file;
+ smc->clcsock->file->private_data = smc->clcsock;
+ }
+}
+
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
- smc->use_fallback = true;
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = reason_code;
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
@@ -491,46 +498,41 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code,
mutex_unlock(&smc_client_lgr_pending);
smc_conn_free(&smc->conn);
+ smc->connect_nonblock = 0;
return reason_code;
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
-static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
- u8 *ibport, unsigned short vlan_id, u8 gid[])
+static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
{
- int reason_code = 0;
-
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
- smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
- gid);
- if (!(*ibdev))
- reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-
- return reason_code;
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
+ if (!ini->ib_dev)
+ return SMC_CLC_DECL_NOSMCRDEV;
+ return 0;
}
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
-static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
+static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
- smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
- if (!(*ismdev))
- return SMC_CLC_DECL_CNFERR; /* configuration error */
+ smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
+ if (!ini->ism_dev)
+ return SMC_CLC_DECL_NOSMCDDEV;
return 0;
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
- struct smcd_dev *ismdev,
- unsigned short vlan_id)
+ struct smc_init_info *ini)
{
- if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
- return SMC_CLC_DECL_CNFERR;
+ if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
+ return SMC_CLC_DECL_ISMVLANERR;
return 0;
}
@@ -538,12 +540,11 @@ static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
* used, the VLAN ID will be registered again during the connection setup.
*/
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
- struct smcd_dev *ismdev,
- unsigned short vlan_id)
+ struct smc_init_info *ini)
{
if (!is_smcd)
return 0;
- if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
+ if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
@@ -551,13 +552,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport,
- u8 gid[], struct smcd_dev *ismdev)
+ struct smc_init_info *ini)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
+ rc = smc_clc_send_proposal(smc, smc_type, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
@@ -568,23 +568,19 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type,
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport)
+ struct smc_init_info *ini)
{
- int local_contact = SMC_FIRST_CONTACT;
struct smc_link *link;
int reason_code = 0;
+ ini->is_smcd = false;
+ ini->ib_lcl = &aclc->lcl;
+ ini->ib_clcqpn = ntoh24(aclc->qpn);
+ ini->srv_first_contact = aclc->hdr.flag;
+
mutex_lock(&smc_client_lgr_pending);
- local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
- ibport, ntoh24(aclc->qpn), &aclc->lcl,
- NULL, 0);
- if (local_contact < 0) {
- if (local_contact == -ENOMEM)
- reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
- else if (local_contact == -ENOLINK)
- reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
- else
- reason_code = SMC_CLC_DECL_INTERR; /* other error */
+ reason_code = smc_conn_create(smc, ini);
+ if (reason_code) {
mutex_unlock(&smc_client_lgr_pending);
return reason_code;
}
@@ -594,45 +590,48 @@ static int smc_connect_rdma(struct smc_sock *smc,
/* create send buffer and rmb */
if (smc_buf_create(smc, false))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
+ ini->cln_first_contact);
- if (local_contact == SMC_FIRST_CONTACT)
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT)
smc_link_save_peer_info(link, aclc);
if (smc_rmb_rtoken_handling(&smc->conn, aclc))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
- local_contact);
+ ini->cln_first_contact);
smc_close_init(smc);
smc_rx_init(smc);
- if (local_contact == SMC_FIRST_CONTACT) {
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
- local_contact);
+ ini->cln_first_contact);
} else {
if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
- local_contact);
+ ini->cln_first_contact);
}
smc_rmb_sync_sg_for_device(&smc->conn);
reason_code = smc_clc_send_confirm(smc);
if (reason_code)
- return smc_connect_abort(smc, reason_code, local_contact);
+ return smc_connect_abort(smc, reason_code,
+ ini->cln_first_contact);
smc_tx_init(smc);
- if (local_contact == SMC_FIRST_CONTACT) {
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
/* QP confirmation over RoCE fabric */
reason_code = smc_clnt_conf_first_link(smc);
if (reason_code)
return smc_connect_abort(smc, reason_code,
- local_contact);
+ ini->cln_first_contact);
}
mutex_unlock(&smc_client_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
@@ -642,23 +641,26 @@ static int smc_connect_rdma(struct smc_sock *smc,
/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
- struct smcd_dev *ismdev)
+ struct smc_init_info *ini)
{
- int local_contact = SMC_FIRST_CONTACT;
int rc = 0;
+ ini->is_smcd = true;
+ ini->ism_gid = aclc->gid;
+ ini->srv_first_contact = aclc->hdr.flag;
+
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
- local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
- NULL, ismdev, aclc->gid);
- if (local_contact < 0) {
+ rc = smc_conn_create(smc, ini);
+ if (rc) {
mutex_unlock(&smc_server_lgr_pending);
- return SMC_CLC_DECL_MEM;
+ return rc;
}
/* Create send and receive buffers */
if (smc_buf_create(smc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
+ ini->cln_first_contact);
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
@@ -667,10 +669,11 @@ static int smc_connect_ism(struct smc_sock *smc,
rc = smc_clc_send_confirm(smc);
if (rc)
- return smc_connect_abort(smc, rc, local_contact);
+ return smc_connect_abort(smc, rc, ini->cln_first_contact);
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
@@ -682,13 +685,9 @@ static int __smc_connect(struct smc_sock *smc)
{
bool ism_supported = false, rdma_supported = false;
struct smc_clc_msg_accept_confirm aclc;
- struct smc_ib_device *ibdev;
- struct smcd_dev *ismdev;
- u8 gid[SMC_GID_SIZE];
- unsigned short vlan;
+ struct smc_init_info ini = {0};
int smc_type;
int rc = 0;
- u8 ibport;
sock_hold(&smc->sk); /* sock put in passive closing */
@@ -703,20 +702,21 @@ static int __smc_connect(struct smc_sock *smc)
if (using_ipsec(smc))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
- /* check for VLAN ID */
- if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
+ return smc_connect_decline_fallback(smc,
+ SMC_CLC_DECL_GETVLANERR);
/* check if there is an ism device available */
- if (!smc_check_ism(smc, &ismdev) &&
- !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
+ if (!smc_find_ism_device(smc, &ini) &&
+ !smc_connect_ism_vlan_setup(smc, &ini)) {
/* ISM is supported for this connection */
ism_supported = true;
smc_type = SMC_TYPE_D;
}
/* check if there is a rdma device available */
- if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
+ if (!smc_find_rdma_device(smc, &ini)) {
/* RDMA is supported for this connection */
rdma_supported = true;
if (ism_supported)
@@ -730,25 +730,25 @@ static int __smc_connect(struct smc_sock *smc)
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
/* perform CLC handshake */
- rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
+ rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
return smc_connect_decline_fallback(smc, rc);
}
/* depending on previous steps, connect using rdma or ism */
if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
- rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
+ rc = smc_connect_rdma(smc, &aclc, &ini);
else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
- rc = smc_connect_ism(smc, &aclc, ismdev);
+ rc = smc_connect_ism(smc, &aclc, &ini);
else
rc = SMC_CLC_DECL_MODEUNSUPP;
if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
return smc_connect_decline_fallback(smc, rc);
}
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
return 0;
}
@@ -756,17 +756,30 @@ static void smc_connect_work(struct work_struct *work)
{
struct smc_sock *smc = container_of(work, struct smc_sock,
connect_work);
- int rc;
+ long timeo = smc->sk.sk_sndtimeo;
+ int rc = 0;
- lock_sock(&smc->sk);
- rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
- smc->connect_info->alen, smc->connect_info->flags);
+ if (!timeo)
+ timeo = MAX_SCHEDULE_TIMEOUT;
+ lock_sock(smc->clcsock->sk);
if (smc->clcsock->sk->sk_err) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
- goto out;
- }
- if (rc < 0) {
- smc->sk.sk_err = -rc;
+ } else if ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCP_SYN_RECV)) {
+ rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
+ if ((rc == -EPIPE) &&
+ ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
+ rc = 0;
+ }
+ release_sock(smc->clcsock->sk);
+ lock_sock(&smc->sk);
+ if (rc != 0 || smc->sk.sk_err) {
+ smc->sk.sk_state = SMC_CLOSED;
+ if (rc == -EPIPE || rc == -EAGAIN)
+ smc->sk.sk_err = EPIPE;
+ else if (signal_pending(current))
+ smc->sk.sk_err = -sock_intr_errno(timeo);
goto out;
}
@@ -775,12 +788,14 @@ static void smc_connect_work(struct work_struct *work)
smc->sk.sk_err = -rc;
out:
- if (smc->sk.sk_err)
- smc->sk.sk_state_change(&smc->sk);
- else
- smc->sk.sk_write_space(&smc->sk);
- kfree(smc->connect_info);
- smc->connect_info = NULL;
+ if (!sock_flag(&smc->sk, SOCK_DEAD)) {
+ if (smc->sk.sk_err) {
+ smc->sk.sk_state_change(&smc->sk);
+ } else { /* allow polling before and after fallback decision */
+ smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
+ smc->sk.sk_write_space(&smc->sk);
+ }
+ }
release_sock(&smc->sk);
}
@@ -813,26 +828,18 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
smc_copy_sock_settings_to_clc(smc);
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ if (smc->connect_nonblock) {
+ rc = -EALREADY;
+ goto out;
+ }
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc && rc != -EINPROGRESS)
+ goto out;
if (flags & O_NONBLOCK) {
- if (smc->connect_info) {
- rc = -EALREADY;
- goto out;
- }
- smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
- if (!smc->connect_info) {
- rc = -ENOMEM;
- goto out;
- }
- smc->connect_info->alen = alen;
- smc->connect_info->flags = flags ^ O_NONBLOCK;
- memcpy(&smc->connect_info->addr, addr, alen);
- schedule_work(&smc->connect_work);
+ if (schedule_work(&smc->connect_work))
+ smc->connect_nonblock = 1;
rc = -EINPROGRESS;
} else {
- rc = kernel_connect(smc->clcsock, addr, alen, flags);
- if (rc)
- goto out;
-
rc = __smc_connect(smc);
if (rc < 0)
goto out;
@@ -872,11 +879,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
if (rc < 0)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
+ new_sk->sk_prot->unhash(new_sk);
if (new_clcsock)
sock_release(new_clcsock);
new_sk->sk_state = SMC_CLOSED;
sock_set_flag(new_sk, SOCK_DEAD);
- new_sk->sk_prot->unhash(new_sk);
sock_put(new_sk); /* final */
*new_smc = NULL;
goto out;
@@ -927,16 +934,21 @@ struct sock *smc_accept_dequeue(struct sock *parent,
smc_accept_unlink(new_sk);
if (new_sk->sk_state == SMC_CLOSED) {
+ new_sk->sk_prot->unhash(new_sk);
if (isk->clcsock) {
sock_release(isk->clcsock);
isk->clcsock = NULL;
}
- new_sk->sk_prot->unhash(new_sk);
sock_put(new_sk); /* final */
continue;
}
- if (new_sock)
+ if (new_sock) {
sock_graft(new_sk, new_sock);
+ if (isk->use_fallback) {
+ smc_sk(new_sk)->clcsock->file = new_sock->file;
+ isk->clcsock->file->private_data = isk->clcsock;
+ }
+ }
return new_sk;
}
return NULL;
@@ -956,6 +968,7 @@ void smc_close_non_accepted(struct sock *sk)
sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
}
+ sk->sk_prot->unhash(sk);
if (smc->clcsock) {
struct socket *tcp;
@@ -971,7 +984,6 @@ void smc_close_non_accepted(struct sock *sk)
smc_conn_free(&smc->conn);
}
release_sock(sk);
- sk->sk_prot->unhash(sk);
sock_put(sk); /* final sock_put */
}
@@ -1037,13 +1049,13 @@ static void smc_listen_out(struct smc_sock *new_smc)
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
- lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
if (lsmc->sk.sk_state == SMC_LISTEN) {
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ release_sock(&lsmc->sk);
} else { /* no longer listening */
smc_close_non_accepted(newsmcsk);
}
- release_sock(&lsmc->sk);
/* Wake up accept */
lsmc->sk.sk_data_ready(&lsmc->sk);
@@ -1087,7 +1099,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
return;
}
smc_conn_free(&new_smc->conn);
- new_smc->use_fallback = true;
+ smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
if (smc_clc_send_decline(new_smc, reason_code) < 0) {
@@ -1099,7 +1111,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
}
/* listen worker: check prefixes */
-static int smc_listen_rdma_check(struct smc_sock *new_smc,
+static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc)
{
struct smc_clc_msg_proposal_prefix *pclc_prfx;
@@ -1107,25 +1119,21 @@ static int smc_listen_rdma_check(struct smc_sock *new_smc,
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
- return SMC_CLC_DECL_CNFERR;
+ return SMC_CLC_DECL_DIFFPREFIX;
return 0;
}
/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
- struct smc_clc_msg_proposal *pclc,
- struct smc_ib_device *ibdev, u8 ibport,
- int *local_contact)
+ struct smc_init_info *ini)
{
+ int rc;
+
/* allocate connection / link group */
- *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
- &pclc->lcl, NULL, 0);
- if (*local_contact < 0) {
- if (*local_contact == -ENOMEM)
- return SMC_CLC_DECL_MEM;/* insufficient memory*/
- return SMC_CLC_DECL_INTERR; /* other error */
- }
+ rc = smc_conn_create(new_smc, ini);
+ if (rc)
+ return rc;
/* create send buffer and rmb */
if (smc_buf_create(new_smc, false))
@@ -1137,33 +1145,30 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
- struct smcd_dev *ismdev,
- int *local_contact)
+ struct smc_init_info *ini)
{
struct smc_clc_msg_smcd *pclc_smcd;
+ int rc;
pclc_smcd = smc_get_clc_msg_smcd(pclc);
- *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
- ismdev, pclc_smcd->gid);
- if (*local_contact < 0) {
- if (*local_contact == -ENOMEM)
- return SMC_CLC_DECL_MEM;/* insufficient memory*/
- return SMC_CLC_DECL_INTERR; /* other error */
- }
+ ini->ism_gid = pclc_smcd->gid;
+ rc = smc_conn_create(new_smc, ini);
+ if (rc)
+ return rc;
/* Check if peer can be reached via ISM device */
if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
new_smc->conn.lgr->vlan_id,
new_smc->conn.lgr->smcd)) {
- if (*local_contact == SMC_FIRST_CONTACT)
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT)
smc_lgr_forget(new_smc->conn.lgr);
smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_CNFERR;
+ return SMC_CLC_DECL_SMCDNOTALK;
}
/* Create send and receive buffers */
if (smc_buf_create(new_smc, true)) {
- if (*local_contact == SMC_FIRST_CONTACT)
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT)
smc_lgr_forget(new_smc->conn.lgr);
smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_MEM;
@@ -1227,15 +1232,13 @@ static void smc_listen_work(struct work_struct *work)
struct socket *newclcsock = new_smc->clcsock;
struct smc_clc_msg_accept_confirm cclc;
struct smc_clc_msg_proposal *pclc;
- struct smc_ib_device *ibdev;
+ struct smc_init_info ini = {0};
bool ism_supported = false;
- struct smcd_dev *ismdev;
u8 buf[SMC_CLC_MAX_LEN];
- int local_contact = 0;
- unsigned short vlan;
- int reason_code = 0;
int rc = 0;
- u8 ibport;
+
+ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
+ return smc_listen_out_err(new_smc);
if (new_smc->use_fallback) {
smc_listen_out_connected(new_smc);
@@ -1244,7 +1247,7 @@ static void smc_listen_work(struct work_struct *work)
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
- new_smc->use_fallback = true;
+ smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
smc_listen_out_connected(new_smc);
return;
@@ -1254,17 +1257,26 @@ static void smc_listen_work(struct work_struct *work)
* wait for and receive SMC Proposal CLC message
*/
pclc = (struct smc_clc_msg_proposal *)&buf;
- reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
- SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
- if (reason_code) {
- smc_listen_decline(new_smc, reason_code, 0);
- return;
- }
+ rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+ SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
+ if (rc)
+ goto out_decl;
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(new_smc)) {
- smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
- return;
+ rc = SMC_CLC_DECL_IPSEC;
+ goto out_decl;
+ }
+
+ /* check for matching IP prefix and subnet length */
+ rc = smc_listen_prfx_check(new_smc, pclc);
+ if (rc)
+ goto out_decl;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto out_decl;
}
mutex_lock(&smc_server_lgr_pending);
@@ -1273,59 +1285,73 @@ static void smc_listen_work(struct work_struct *work)
smc_tx_init(new_smc);
/* check if ISM is available */
- if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
- !smc_check_ism(new_smc, &ismdev) &&
- !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
- ism_supported = true;
+ if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
+ ini.is_smcd = true; /* prepare ISM check */
+ rc = smc_find_ism_device(new_smc, &ini);
+ if (!rc)
+ rc = smc_listen_ism_init(new_smc, pclc, &ini);
+ if (!rc)
+ ism_supported = true;
+ else if (pclc->hdr.path == SMC_TYPE_D)
+ goto out_unlock; /* skip RDMA and decline */
}
/* check if RDMA is available */
- if (!ism_supported &&
- ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
- smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
- smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
- smc_listen_rdma_check(new_smc, pclc) ||
- smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
- &local_contact) ||
- smc_listen_rdma_reg(new_smc, local_contact))) {
- /* SMC not supported, decline */
- mutex_unlock(&smc_server_lgr_pending);
- smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
- local_contact);
- return;
+ if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
+ /* prepare RDMA check */
+ memset(&ini, 0, sizeof(ini));
+ ini.is_smcd = false;
+ ini.ib_lcl = &pclc->lcl;
+ rc = smc_find_rdma_device(new_smc, &ini);
+ if (rc) {
+ /* no RDMA device found */
+ if (pclc->hdr.path == SMC_TYPE_B)
+ /* neither ISM nor RDMA device found */
+ rc = SMC_CLC_DECL_NOSMCDEV;
+ goto out_unlock;
+ }
+ rc = smc_listen_rdma_init(new_smc, &ini);
+ if (rc)
+ goto out_unlock;
+ rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
+ if (rc)
+ goto out_unlock;
}
/* send SMC Accept CLC message */
- rc = smc_clc_send_accept(new_smc, local_contact);
- if (rc) {
- mutex_unlock(&smc_server_lgr_pending);
- smc_listen_decline(new_smc, rc, local_contact);
- return;
- }
+ rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
+ if (rc)
+ goto out_unlock;
/* SMC-D does not need this lock any more */
if (ism_supported)
mutex_unlock(&smc_server_lgr_pending);
/* receive SMC Confirm CLC message */
- reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
- SMC_CLC_CONFIRM, CLC_WAIT_TIME);
- if (reason_code) {
+ rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ SMC_CLC_CONFIRM, CLC_WAIT_TIME);
+ if (rc) {
if (!ism_supported)
- mutex_unlock(&smc_server_lgr_pending);
- smc_listen_decline(new_smc, reason_code, local_contact);
- return;
+ goto out_unlock;
+ goto out_decl;
}
/* finish worker */
if (!ism_supported) {
- rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+ rc = smc_listen_rdma_finish(new_smc, &cclc,
+ ini.cln_first_contact);
mutex_unlock(&smc_server_lgr_pending);
if (rc)
return;
}
smc_conn_save_peer_info(new_smc, &cclc);
smc_listen_out_connected(new_smc);
+ return;
+
+out_unlock:
+ mutex_unlock(&smc_server_lgr_pending);
+out_decl:
+ smc_listen_decline(new_smc, rc, ini.cln_first_contact);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -1501,7 +1527,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_FASTOPEN) {
if (sk->sk_state == SMC_INIT) {
- smc->use_fallback = true;
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
rc = -EINVAL;
@@ -1571,8 +1597,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
- __poll_t mask = 0;
struct smc_sock *smc;
+ __poll_t mask = 0;
if (!sk)
return EPOLLNVAL;
@@ -1582,8 +1608,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
/* delegate to CLC child sock */
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
sk->sk_err = smc->clcsock->sk->sk_err;
- if (sk->sk_err)
- mask |= EPOLLERR;
} else {
if (sk->sk_state != SMC_CLOSED)
sock_poll_wait(file, sock, wait);
@@ -1594,9 +1618,14 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
mask |= EPOLLHUP;
if (sk->sk_state == SMC_LISTEN) {
/* woken up by sk_data_ready in smc_listen_work() */
- mask = smc_accept_poll(sk);
+ mask |= smc_accept_poll(sk);
+ } else if (smc->use_fallback) { /* as result of connect_work()*/
+ mask |= smc->clcsock->ops->poll(file, smc->clcsock,
+ wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
} else {
- if (atomic_read(&smc->conn.sndbuf_space) ||
+ if ((sk->sk_state != SMC_INIT &&
+ atomic_read(&smc->conn.sndbuf_space)) ||
sk->sk_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else {
@@ -1703,7 +1732,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT) {
- smc->use_fallback = true;
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
if (!smc->use_fallback)
diff --git a/net/smc/smc.h b/net/smc/smc.h
index adbdf195eb08..878313f8d6c1 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -190,18 +190,11 @@ struct smc_connection {
u64 peer_token; /* SMC-D token of peer */
};
-struct smc_connect_info {
- int flags;
- int alen;
- struct sockaddr addr;
-};
-
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
struct smc_connection conn; /* smc connection */
struct smc_sock *listen_smc; /* listen parent */
- struct smc_connect_info *connect_info; /* connect address & flags */
struct work_struct connect_work; /* handle non-blocking connect*/
struct work_struct tcp_listen_work;/* handle tcp socket accepts */
struct work_struct smc_listen_work;/* prepare new accept socket */
@@ -219,6 +212,10 @@ struct smc_sock { /* smc sock container */
* started, waiting for unsent
* data to be sent
*/
+ u8 connect_nonblock : 1;
+ /* non-blocking connect in
+ * flight
+ */
struct mutex clcsock_release_lock;
/* protects clcsock of a listen
* socket
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index d53fd588d1f5..745afd82f281 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -385,8 +385,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
/* send CLC PROPOSAL message across internal TCP socket */
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_ib_device *ibdev, u8 ibport, u8 gid[],
- struct smcd_dev *ismdev)
+ struct smc_init_info *ini)
{
struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
struct smc_clc_msg_proposal_prefix pclc_prfx;
@@ -416,8 +415,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
/* add SMC-R specifics */
memcpy(pclc.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN);
+ memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ ETH_ALEN);
pclc.iparea_offset = htons(0);
}
if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
@@ -425,7 +425,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
memset(&pclc_smcd, 0, sizeof(pclc_smcd));
plen += sizeof(pclc_smcd);
pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
- pclc_smcd.gid = ismdev->local_gid;
+ pclc_smcd.gid = ini->ism_dev->local_gid;
}
pclc.hdr.length = htons(plen);
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 24658e8c0de4..ca209272e5fa 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -34,16 +34,22 @@
#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
-#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */
+#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
+#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
+#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
+#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */
#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
+#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
+#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
+#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
-#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
-#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */
-#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */
-#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */
+#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
+#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
@@ -179,6 +185,7 @@ smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
}
struct smcd_dev;
+struct smc_init_info;
int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
@@ -186,8 +193,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type, unsigned long timeout);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_ib_device *smcibdev, u8 ibport, u8 gid[],
- struct smcd_dev *ismdev);
+ struct smc_init_info *ini);
int smc_clc_send_confirm(struct smc_sock *smc);
int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 2ad37e998509..fc06720b53c1 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -21,6 +21,22 @@
#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ)
+/* release the clcsock that is assigned to the smc_sock */
+void smc_clcsock_release(struct smc_sock *smc)
+{
+ struct socket *tcp;
+
+ if (smc->listen_smc && current_work() != &smc->smc_listen_work)
+ cancel_work_sync(&smc->smc_listen_work);
+ mutex_lock(&smc->clcsock_release_lock);
+ if (smc->clcsock) {
+ tcp = smc->clcsock;
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+ mutex_unlock(&smc->clcsock_release_lock);
+}
+
static void smc_close_cleanup_listen(struct sock *parent)
{
struct sock *sk;
@@ -321,6 +337,7 @@ static void smc_close_passive_work(struct work_struct *work)
close_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
struct smc_cdc_conn_state_flags *rxflags;
+ bool release_clcsock = false;
struct sock *sk = &smc->sk;
int old_state;
@@ -400,13 +417,13 @@ wakeup:
if ((sk->sk_state == SMC_CLOSED) &&
(sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
smc_conn_free(conn);
- if (smc->clcsock) {
- sock_release(smc->clcsock);
- smc->clcsock = NULL;
- }
+ if (smc->clcsock)
+ release_clcsock = true;
}
}
release_sock(sk);
+ if (release_clcsock)
+ smc_clcsock_release(smc);
sock_put(sk); /* sock_hold done by schedulers of close_work */
}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index 19eb6a211c23..e0e3b5df25d2 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -23,5 +23,6 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc);
int smc_close_active(struct smc_sock *smc);
int smc_close_shutdown_write(struct smc_sock *smc);
void smc_close_init(struct smc_sock *smc);
+void smc_clcsock_release(struct smc_sock *smc);
#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 53a17cfa61af..2d2850adc2a3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -195,10 +195,7 @@ static void smc_lgr_free_work(struct work_struct *work)
}
/* create a new SMC link group */
-static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
- struct smc_ib_device *smcibdev, u8 ibport,
- char *peer_systemid, unsigned short vlan_id,
- struct smcd_dev *smcismdev, u64 peer_gid)
+static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_link_group *lgr;
struct smc_link *lnk;
@@ -206,20 +203,21 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
int rc = 0;
int i;
- if (is_smcd && vlan_id) {
- rc = smc_ism_get_vlan(smcismdev, vlan_id);
- if (rc)
+ if (ini->is_smcd && ini->vlan_id) {
+ if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
+ rc = SMC_CLC_DECL_ISMVLANERR;
goto out;
+ }
}
lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
if (!lgr) {
- rc = -ENOMEM;
+ rc = SMC_CLC_DECL_MEM;
goto out;
}
- lgr->is_smcd = is_smcd;
+ lgr->is_smcd = ini->is_smcd;
lgr->sync_err = 0;
- lgr->vlan_id = vlan_id;
+ lgr->vlan_id = ini->vlan_id;
rwlock_init(&lgr->sndbufs_lock);
rwlock_init(&lgr->rmbs_lock);
rwlock_init(&lgr->conns_lock);
@@ -231,29 +229,32 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
lgr->conns_all = RB_ROOT;
- if (is_smcd) {
+ if (ini->is_smcd) {
/* SMC-D specific settings */
- lgr->peer_gid = peer_gid;
- lgr->smcd = smcismdev;
+ lgr->peer_gid = ini->ism_gid;
+ lgr->smcd = ini->ism_dev;
} else {
/* SMC-R specific settings */
lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+ memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
+ SMC_SYSTEMID_LEN);
lnk = &lgr->lnk[SMC_SINGLE_LINK];
/* initialize link */
lnk->state = SMC_LNK_ACTIVATING;
lnk->link_id = SMC_SINGLE_LINK;
- lnk->smcibdev = smcibdev;
- lnk->ibport = ibport;
- lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
- if (!smcibdev->initialized)
- smc_ib_setup_per_ibdev(smcibdev);
+ lnk->smcibdev = ini->ib_dev;
+ lnk->ibport = ini->ib_port;
+ lnk->path_mtu =
+ ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
+ if (!ini->ib_dev->initialized)
+ smc_ib_setup_per_ibdev(ini->ib_dev);
get_random_bytes(rndvec, sizeof(rndvec));
lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
(rndvec[2] << 16);
rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
- vlan_id, lnk->gid, &lnk->sgid_index);
+ ini->vlan_id, lnk->gid,
+ &lnk->sgid_index);
if (rc)
goto free_lgr;
rc = smc_llc_link_init(lnk);
@@ -289,6 +290,12 @@ clear_llc_lnk:
free_lgr:
kfree(lgr);
out:
+ if (rc < 0) {
+ if (rc == -ENOMEM)
+ rc = SMC_CLC_DECL_MEM;
+ else
+ rc = SMC_CLC_DECL_INTERR;
+ }
return rc;
}
@@ -528,13 +535,13 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
/* Determine vlan of internal TCP socket.
* @vlan_id: address to store the determined vlan id into
*/
-int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct net_device *ndev;
int i, nest_lvl, rc = 0;
- *vlan_id = 0;
+ ini->vlan_id = 0;
if (!dst) {
rc = -ENOTCONN;
goto out;
@@ -546,7 +553,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
ndev = dst->dev;
if (is_vlan_dev(ndev)) {
- *vlan_id = vlan_dev_vlan_id(ndev);
+ ini->vlan_id = vlan_dev_vlan_id(ndev);
goto out_rel;
}
@@ -560,7 +567,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
lower = lower->next;
ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
if (is_vlan_dev(ndev)) {
- *vlan_id = vlan_dev_vlan_id(ndev);
+ ini->vlan_id = vlan_dev_vlan_id(ndev);
break;
}
}
@@ -594,24 +601,16 @@ static bool smcd_lgr_match(struct smc_link_group *lgr,
}
/* create a new SMC connection (and a new link group if necessary) */
-int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
- struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn,
- struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
- u64 peer_gid)
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_connection *conn = &smc->conn;
- int local_contact = SMC_FIRST_CONTACT;
struct smc_link_group *lgr;
- unsigned short vlan_id;
enum smc_lgr_role role;
int rc = 0;
+ ini->cln_first_contact = SMC_FIRST_CONTACT;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
- if (rc)
- return rc;
-
- if ((role == SMC_CLNT) && srv_first_contact)
+ if (role == SMC_CLNT && ini->srv_first_contact)
/* create new link group as well */
goto create;
@@ -619,14 +618,15 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry(lgr, &smc_lgr_list.list, list) {
write_lock_bh(&lgr->conns_lock);
- if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
- smcr_lgr_match(lgr, lcl, role, clcqpn)) &&
+ if ((ini->is_smcd ?
+ smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
+ smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
!lgr->sync_err &&
- lgr->vlan_id == vlan_id &&
+ lgr->vlan_id == ini->vlan_id &&
(role == SMC_CLNT ||
lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
/* link group found */
- local_contact = SMC_REUSE_CONTACT;
+ ini->cln_first_contact = SMC_REUSE_CONTACT;
conn->lgr = lgr;
smc_lgr_register_conn(conn); /* add smc conn to lgr */
if (delayed_work_pending(&lgr->free_work))
@@ -638,19 +638,18 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
}
spin_unlock_bh(&smc_lgr_list.lock);
- if (role == SMC_CLNT && !srv_first_contact &&
- (local_contact == SMC_FIRST_CONTACT)) {
+ if (role == SMC_CLNT && !ini->srv_first_contact &&
+ ini->cln_first_contact == SMC_FIRST_CONTACT) {
/* Server reuses a link group, but Client wants to start
* a new one
* send out_of_sync decline, reason synchr. error
*/
- return -ENOLINK;
+ return SMC_CLC_DECL_SYNCERR;
}
create:
- if (local_contact == SMC_FIRST_CONTACT) {
- rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
- lcl->id_for_peer, vlan_id, smcd, peer_gid);
+ if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ rc = smc_lgr_create(smc, ini);
if (rc)
goto out;
smc_lgr_register_conn(conn); /* add smc conn to lgr */
@@ -658,7 +657,7 @@ create:
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
conn->urg_state = SMC_URG_READ;
- if (is_smcd) {
+ if (ini->is_smcd) {
conn->rx_off = sizeof(struct smcd_cdc_msg);
smcd_cdc_rx_init(conn); /* init tasklet for this conn */
}
@@ -667,7 +666,7 @@ create:
#endif
out:
- return rc ? rc : local_contact;
+ return rc;
}
/* convert the RMB size into the compressed notation - minimum 16K.
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 8806d2afa6ed..c00ac61dc129 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -229,6 +229,24 @@ struct smc_link_group {
};
};
+struct smc_clc_msg_local;
+
+struct smc_init_info {
+ u8 is_smcd;
+ unsigned short vlan_id;
+ int srv_first_contact;
+ int cln_first_contact;
+ /* SMC-R */
+ struct smc_clc_msg_local *ib_lcl;
+ struct smc_ib_device *ib_dev;
+ u8 ib_gid[SMC_GID_SIZE];
+ u8 ib_port;
+ u32 ib_clcqpn;
+ /* SMC-D */
+ u64 ism_gid;
+ struct smcd_dev *ism_dev;
+};
+
/* Find the connection associated with the given alert token in the link group.
* To use rbtrees we have to implement our own search core.
* Requires @conns_lock
@@ -281,13 +299,10 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
-int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id);
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
- struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn,
- struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
- u64 peer_gid);
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
void smcd_conn_free(struct smc_connection *conn);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
void smc_core_exit(void);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 2fff79db1a59..e89e918b88e0 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -289,6 +289,11 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
INIT_LIST_HEAD(&smcd->vlan);
smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
WQ_MEM_RECLAIM, name);
+ if (!smcd->event_wq) {
+ kfree(smcd->conn);
+ kfree(smcd);
+ return NULL;
+ }
return smcd;
}
EXPORT_SYMBOL_GPL(smcd_alloc_dev);
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 3cdf81cf97a3..9f5d8f36f2d7 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -26,6 +26,7 @@
#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_ism.h"
+#include "smc_core.h"
#define SMC_ASCII_BLANK 32
@@ -603,7 +604,8 @@ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
- return smc_pnet_remove_by_pnetid(net, NULL);
+ smc_pnet_remove_by_pnetid(net, NULL);
+ return 0;
}
/* SMC_PNETID generic netlink operation definition */
@@ -755,8 +757,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
* IB device and port
*/
static void smc_pnet_find_rdma_dev(struct net_device *netdev,
- struct smc_ib_device **smcibdev,
- u8 *ibport, unsigned short vlan_id, u8 gid[])
+ struct smc_init_info *ini)
{
struct smc_ib_device *ibdev;
@@ -776,10 +777,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
dev_put(ndev);
if (netdev == ndev &&
smc_ib_port_active(ibdev, i) &&
- !smc_ib_determine_gid(ibdev, i, vlan_id, gid,
- NULL)) {
- *smcibdev = ibdev;
- *ibport = i;
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
+ ini->ib_gid, NULL)) {
+ ini->ib_dev = ibdev;
+ ini->ib_port = i;
break;
}
}
@@ -794,9 +795,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
* If nothing found, try to use handshake device
*/
static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
- struct smc_ib_device **smcibdev,
- u8 *ibport, unsigned short vlan_id,
- u8 gid[])
+ struct smc_init_info *ini)
{
u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
struct smc_ib_device *ibdev;
@@ -806,7 +805,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
ndev_pnetid) &&
smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
- smc_pnet_find_rdma_dev(ndev, smcibdev, ibport, vlan_id, gid);
+ smc_pnet_find_rdma_dev(ndev, ini);
return; /* pnetid could not be determined */
}
@@ -817,10 +816,10 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
continue;
if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) &&
smc_ib_port_active(ibdev, i) &&
- !smc_ib_determine_gid(ibdev, i, vlan_id, gid,
- NULL)) {
- *smcibdev = ibdev;
- *ibport = i;
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
+ ini->ib_gid, NULL)) {
+ ini->ib_dev = ibdev;
+ ini->ib_port = i;
goto out;
}
}
@@ -830,7 +829,7 @@ out:
}
static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
- struct smcd_dev **smcismdev)
+ struct smc_init_info *ini)
{
u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
struct smcd_dev *ismdev;
@@ -844,7 +843,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
spin_lock(&smcd_dev_list.lock);
list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) {
- *smcismdev = ismdev;
+ ini->ism_dev = ismdev;
break;
}
}
@@ -855,21 +854,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
* determine ib_device and port belonging to used internal TCP socket
* ethernet interface.
*/
-void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport,
- unsigned short vlan_id, u8 gid[])
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(sk);
- *smcibdev = NULL;
- *ibport = 0;
-
+ ini->ib_dev = NULL;
+ ini->ib_port = 0;
if (!dst)
goto out;
if (!dst->dev)
goto out_rel;
- smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid);
+ smc_pnet_find_roce_by_pnetid(dst->dev, ini);
out_rel:
dst_release(dst);
@@ -877,17 +873,17 @@ out:
return;
}
-void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev)
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(sk);
- *smcismdev = NULL;
+ ini->ism_dev = NULL;
if (!dst)
goto out;
if (!dst->dev)
goto out_rel;
- smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev);
+ smc_pnet_find_ism_by_pnetid(dst->dev, ini);
out_rel:
dst_release(dst);
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 5eac42fb45d0..4564e4d69c2e 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -18,6 +18,7 @@
struct smc_ib_device;
struct smcd_dev;
+struct smc_init_info;
/**
* struct smc_pnettable - SMC PNET table anchor
@@ -43,9 +44,7 @@ int smc_pnet_init(void) __init;
int smc_pnet_net_init(struct net *net);
void smc_pnet_exit(void);
void smc_pnet_net_exit(struct net *net);
-void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport,
- unsigned short vlan_id, u8 gid[]);
-void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev);
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini);
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini);
#endif
diff --git a/net/socket.c b/net/socket.c
index 8255f5bda0aa..a180e1a9ff23 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1164,6 +1164,26 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
err = open_related_ns(&net->ns, get_net_ns);
break;
+ case SIOCGSTAMP_OLD:
+ case SIOCGSTAMPNS_OLD:
+ if (!sock->ops->gettstamp) {
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ err = sock->ops->gettstamp(sock, argp,
+ cmd == SIOCGSTAMP_OLD,
+ !IS_ENABLED(CONFIG_64BIT));
+ break;
+ case SIOCGSTAMP_NEW:
+ case SIOCGSTAMPNS_NEW:
+ if (!sock->ops->gettstamp) {
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ err = sock->ops->gettstamp(sock, argp,
+ cmd == SIOCGSTAMP_NEW,
+ false);
+ break;
default:
err = sock_do_ioctl(net, sock, cmd, arg);
break;
@@ -2916,38 +2936,6 @@ void socket_seq_show(struct seq_file *seq)
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_COMPAT
-static int do_siocgstamp(struct net *net, struct socket *sock,
- unsigned int cmd, void __user *up)
-{
- mm_segment_t old_fs = get_fs();
- struct timeval ktv;
- int err;
-
- set_fs(KERNEL_DS);
- err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
- set_fs(old_fs);
- if (!err)
- err = compat_put_timeval(&ktv, up);
-
- return err;
-}
-
-static int do_siocgstampns(struct net *net, struct socket *sock,
- unsigned int cmd, void __user *up)
-{
- mm_segment_t old_fs = get_fs();
- struct timespec kts;
- int err;
-
- set_fs(KERNEL_DS);
- err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
- set_fs(old_fs);
- if (!err)
- err = compat_put_timespec(&kts, up);
-
- return err;
-}
-
static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
{
struct compat_ifconf ifc32;
@@ -3347,10 +3335,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCADDRT:
case SIOCDELRT:
return routing_ioctl(net, sock, cmd, argp);
- case SIOCGSTAMP:
- return do_siocgstamp(net, sock, cmd, argp);
- case SIOCGSTAMPNS:
- return do_siocgstampns(net, sock, cmd, argp);
+ case SIOCGSTAMP_OLD:
+ case SIOCGSTAMPNS_OLD:
+ if (!sock->ops->gettstamp)
+ return -ENOIOCTLCMD;
+ return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
+ !COMPAT_USE_64BIT_TIME);
+
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
case SIOCSHWTSTAMP:
@@ -3368,6 +3359,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCADDDLCI:
case SIOCDELDLCI:
case SIOCGSKNS:
+ case SIOCGSTAMP_NEW:
+ case SIOCGSTAMPNS_NEW:
return sock_ioctl(file, cmd, arg);
case SIOCGIFFLAGS:
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 860dcfb95ee4..e137698e8aef 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -14,7 +14,8 @@
#include <linux/file.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/poll.h>
@@ -140,13 +141,11 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* We are going to append to the frags_list of head.
* Need to unshare the frag_list.
*/
- if (skb_has_frag_list(head)) {
- err = skb_unclone(head, GFP_ATOMIC);
- if (err) {
- STRP_STATS_INCR(strp->stats.mem_fail);
- desc->error = err;
- return 0;
- }
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.mem_fail);
+ desc->error = err;
+ return 0;
}
if (unlikely(skb_shinfo(head)->frag_list)) {
@@ -299,7 +298,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
break;
}
- /* Positive extra indicates ore bytes than needed for the
+ /* Positive extra indicates more bytes than needed for the
* message
*/
@@ -547,7 +546,7 @@ void strp_check_rcv(struct strparser *strp)
}
EXPORT_SYMBOL_GPL(strp_check_rcv);
-static int __init strp_mod_init(void)
+static int __init strp_dev_init(void)
{
strp_wq = create_singlethread_workqueue("kstrp");
if (unlikely(!strp_wq))
@@ -555,11 +554,4 @@ static int __init strp_mod_init(void)
return 0;
}
-
-static void __exit strp_mod_exit(void)
-{
- destroy_workqueue(strp_wq);
-}
-module_init(strp_mod_init);
-module_exit(strp_mod_exit);
-MODULE_LICENSE("GPL");
+device_initcall(strp_dev_init);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 12bb23b8e0c5..261131dfa1f1 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,6 +54,7 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
h->last_refresh = now;
}
+static inline int cache_is_valid(struct cache_head *h);
static void cache_fresh_locked(struct cache_head *head, time_t expiry,
struct cache_detail *detail);
static void cache_fresh_unlocked(struct cache_head *head,
@@ -105,6 +106,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
if (cache_is_expired(detail, tmp)) {
hlist_del_init_rcu(&tmp->cache_list);
detail->entries --;
+ if (cache_is_valid(tmp) == -EAGAIN)
+ set_bit(CACHE_NEGATIVE, &tmp->flags);
cache_fresh_locked(tmp, 0, detail);
freeme = tmp;
break;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 187d10443a15..8ff11dc98d7f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1540,7 +1540,6 @@ call_start(struct rpc_task *task)
clnt->cl_stats->rpccnt++;
task->tk_action = call_reserve;
rpc_task_set_transport(task, clnt);
- call_reserve(task);
}
/*
@@ -1554,9 +1553,6 @@ call_reserve(struct rpc_task *task)
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_reserve(task);
- if (rpc_task_need_resched(task))
- return;
- call_reserveresult(task);
}
static void call_retry_reserve(struct rpc_task *task);
@@ -1579,7 +1575,6 @@ call_reserveresult(struct rpc_task *task)
if (status >= 0) {
if (task->tk_rqstp) {
task->tk_action = call_refresh;
- call_refresh(task);
return;
}
@@ -1605,7 +1600,6 @@ call_reserveresult(struct rpc_task *task)
/* fall through */
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
- call_retry_reserve(task);
return;
case -EIO: /* probably a shutdown */
break;
@@ -1628,9 +1622,6 @@ call_retry_reserve(struct rpc_task *task)
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_retry_reserve(task);
- if (rpc_task_need_resched(task))
- return;
- call_reserveresult(task);
}
/*
@@ -1645,9 +1636,6 @@ call_refresh(struct rpc_task *task)
task->tk_status = 0;
task->tk_client->cl_stats->rpcauthrefresh++;
rpcauth_refreshcred(task);
- if (rpc_task_need_resched(task))
- return;
- call_refreshresult(task);
}
/*
@@ -1666,7 +1654,6 @@ call_refreshresult(struct rpc_task *task)
case 0:
if (rpcauth_uptodatecred(task)) {
task->tk_action = call_allocate;
- call_allocate(task);
return;
}
/* Use rate-limiting and a max number of retries if refresh
@@ -1685,7 +1672,6 @@ call_refreshresult(struct rpc_task *task)
task->tk_cred_retry--;
dprintk("RPC: %5u %s: retry refresh creds\n",
task->tk_pid, __func__);
- call_refresh(task);
return;
}
dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
@@ -1711,10 +1697,8 @@ call_allocate(struct rpc_task *task)
task->tk_status = 0;
task->tk_action = call_encode;
- if (req->rq_buffer) {
- call_encode(task);
+ if (req->rq_buffer)
return;
- }
if (proc->p_proc != 0) {
BUG_ON(proc->p_arglen == 0);
@@ -1740,12 +1724,8 @@ call_allocate(struct rpc_task *task)
status = xprt->ops->buf_alloc(task);
xprt_inject_disconnect(xprt);
- if (status == 0) {
- if (rpc_task_need_resched(task))
- return;
- call_encode(task);
+ if (status == 0)
return;
- }
if (status != -ENOMEM) {
rpc_exit(task, status);
return;
@@ -1828,8 +1808,12 @@ call_encode(struct rpc_task *task)
xprt_request_enqueue_receive(task);
xprt_request_enqueue_transmit(task);
out:
- task->tk_action = call_bind;
- call_bind(task);
+ task->tk_action = call_transmit;
+ /* Check that the connection is OK */
+ if (!xprt_bound(task->tk_xprt))
+ task->tk_action = call_bind;
+ else if (!xprt_connected(task->tk_xprt))
+ task->tk_action = call_connect;
}
/*
@@ -1847,7 +1831,6 @@ rpc_task_handle_transmitted(struct rpc_task *task)
{
xprt_end_transmit(task);
task->tk_action = call_transmit_status;
- call_transmit_status(task);
}
/*
@@ -1865,7 +1848,6 @@ call_bind(struct rpc_task *task)
if (xprt_bound(xprt)) {
task->tk_action = call_connect;
- call_connect(task);
return;
}
@@ -1896,7 +1878,6 @@ call_bind_status(struct rpc_task *task)
dprint_status(task);
task->tk_status = 0;
task->tk_action = call_connect;
- call_connect(task);
return;
}
@@ -1981,7 +1962,6 @@ call_connect(struct rpc_task *task)
if (xprt_connected(xprt)) {
task->tk_action = call_transmit;
- call_transmit(task);
return;
}
@@ -2051,7 +2031,6 @@ call_connect_status(struct rpc_task *task)
case 0:
clnt->cl_stats->netreconn++;
task->tk_action = call_transmit;
- call_transmit(task);
return;
}
rpc_exit(task, status);
@@ -2087,9 +2066,6 @@ call_transmit(struct rpc_task *task)
xprt_transmit(task);
}
xprt_end_transmit(task);
- if (rpc_task_need_resched(task))
- return;
- call_transmit_status(task);
}
/*
@@ -2105,11 +2081,8 @@ call_transmit_status(struct rpc_task *task)
* test first.
*/
if (rpc_task_transmitted(task)) {
- if (task->tk_status == 0)
- xprt_request_wait_receive(task);
- if (rpc_task_need_resched(task))
- return;
- call_status(task);
+ task->tk_status = 0;
+ xprt_request_wait_receive(task);
return;
}
@@ -2170,7 +2143,6 @@ call_bc_encode(struct rpc_task *task)
{
xprt_request_enqueue_transmit(task);
task->tk_action = call_bc_transmit;
- call_bc_transmit(task);
}
/*
@@ -2195,6 +2167,9 @@ call_bc_transmit_status(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
+ if (rpc_task_transmitted(task))
+ task->tk_status = 0;
+
dprint_status(task);
switch (task->tk_status) {
@@ -2261,7 +2236,6 @@ call_status(struct rpc_task *task)
status = task->tk_status;
if (status >= 0) {
task->tk_action = call_decode;
- call_decode(task);
return;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 89a63391d4d4..30cfc0efe699 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -90,7 +90,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_qp(ia->ri_id->qp);
+ ib_drain_rq(ia->ri_id->qp);
drain_workqueue(buf->rb_completion_wq);
/* Deferred Reply processing might have scheduled
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 3cb9f326ee6f..6053489c8063 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -876,6 +876,8 @@ void tipc_link_reset(struct tipc_link *l)
__skb_queue_head_init(&list);
l->in_session = false;
+ /* Force re-synch of peer session number before establishing */
+ l->peer_session--;
l->session++;
l->mtu = l->advertised_mtu;
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index bff241f03525..89993afe0fbd 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -909,7 +909,8 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg,
for (; i < TIPC_NAMETBL_SIZE; i++) {
head = &tn->nametbl->services[i];
- if (*last_type) {
+ if (*last_type ||
+ (!i && *last_key && (*last_lower == *last_key))) {
service = tipc_service_find(net, *last_type);
if (!service)
return -EPIPE;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 3469b5d4ed32..7478e2d4ec02 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -375,14 +375,15 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
if (n->capabilities == capabilities)
goto exit;
/* Same node may come back with new capabilities */
- write_lock_bh(&n->lock);
+ tipc_node_write_lock(n);
n->capabilities = capabilities;
for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
l = n->links[bearer_id].link;
if (l)
tipc_link_update_caps(l, capabilities);
}
- write_unlock_bh(&n->lock);
+ tipc_node_write_unlock_fast(n);
+
/* Calculate cluster capabilities */
tn->capabilities = TIPC_NODE_CAPABILITIES;
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 8ac8ddf1e324..1385207a301f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3070,6 +3070,9 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
case TIPC_SOCK_RECVQ_DEPTH:
value = skb_queue_len(&sk->sk_receive_queue);
break;
+ case TIPC_SOCK_RECVQ_USED:
+ value = sk_rmem_alloc_get(sk);
+ break;
case TIPC_GROUP_JOIN:
seq.type = 0;
if (tsk->group)
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 3481e4906bd6..9df82a573aa7 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -38,6 +38,8 @@
#include <linux/sysctl.h>
+static int zero;
+static int one = 1;
static struct ctl_table_header *tipc_ctl_hdr;
static struct ctl_table tipc_table[] = {
@@ -46,14 +48,16 @@ static struct ctl_table tipc_table[] = {
.data = &sysctl_tipc_rmem,
.maxlen = sizeof(sysctl_tipc_rmem),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "named_timeout",
.data = &sysctl_tipc_named_timeout,
.maxlen = sizeof(sysctl_tipc_named_timeout),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "sk_filter",
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 6f166fbbfff1..0884a1b8ad12 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -354,25 +354,21 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
skb_pull(skb, sizeof(struct udphdr));
hdr = buf_msg(skb);
- rcu_read_lock();
- b = rcu_dereference_rtnl(ub->bearer);
+ b = rcu_dereference(ub->bearer);
if (!b)
- goto rcu_out;
+ goto out;
if (b && test_bit(0, &b->up)) {
tipc_rcv(sock_net(sk), skb, b);
- rcu_read_unlock();
return 0;
}
if (unlikely(msg_user(hdr) == LINK_CONFIG)) {
err = tipc_udp_rcast_disc(b, skb);
if (err)
- goto rcu_out;
+ goto out;
}
-rcu_out:
- rcu_read_unlock();
out:
kfree_skb(skb);
return 0;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 135a7ee9db03..cc0256939eb6 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -52,8 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock);
static void tls_device_free_ctx(struct tls_context *ctx)
{
- if (ctx->tx_conf == TLS_HW)
+ if (ctx->tx_conf == TLS_HW) {
kfree(tls_offload_ctx_tx(ctx));
+ kfree(ctx->tx.rec_seq);
+ kfree(ctx->tx.iv);
+ }
if (ctx->rx_conf == TLS_HW)
kfree(tls_offload_ctx_rx(ctx));
@@ -216,6 +219,13 @@ void tls_device_sk_destruct(struct sock *sk)
}
EXPORT_SYMBOL(tls_device_sk_destruct);
+void tls_device_free_resources_tx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+
+ tls_free_partial_record(sk, tls_ctx);
+}
+
static void tls_append_frag(struct tls_record_info *record,
struct page_frag *pfrag,
int size)
@@ -894,7 +904,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
goto release_netdev;
free_sw_resources:
+ up_read(&device_offload_lock);
tls_sw_free_resources_rx(sk);
+ down_read(&device_offload_lock);
release_ctx:
ctx->priv_ctx_rx = NULL;
release_netdev:
@@ -929,8 +941,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
}
out:
up_read(&device_offload_lock);
- kfree(tls_ctx->rx.rec_seq);
- kfree(tls_ctx->rx.iv);
tls_sw_release_resources_rx(sk);
}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 54c3a758f2a7..a3ebd4b02714 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -194,6 +194,9 @@ static void update_chksum(struct sk_buff *skb, int headln)
static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
{
+ struct sock *sk = skb->sk;
+ int delta;
+
skb_copy_header(nskb, skb);
skb_put(nskb, skb->len);
@@ -201,11 +204,15 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
update_chksum(nskb, headln);
nskb->destructor = skb->destructor;
- nskb->sk = skb->sk;
+ nskb->sk = sk;
skb->destructor = NULL;
skb->sk = NULL;
- refcount_add(nskb->truesize - skb->truesize,
- &nskb->sk->sk_wmem_alloc);
+
+ delta = nskb->truesize - skb->truesize;
+ if (likely(delta < 0))
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+ else if (delta)
+ refcount_add(delta, &sk->sk_wmem_alloc);
}
/* This function may be called after the user socket is already
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0e24edab2535..fc81ae18cc44 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -208,6 +208,26 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
return tls_push_sg(sk, ctx, sg, offset, flags);
}
+bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
+{
+ struct scatterlist *sg;
+
+ sg = ctx->partially_sent_record;
+ if (!sg)
+ return false;
+
+ while (1) {
+ put_page(sg_page(sg));
+ sk_mem_uncharge(sk, sg->length);
+
+ if (sg_is_last(sg))
+ break;
+ sg++;
+ }
+ ctx->partially_sent_record = NULL;
+ return true;
+}
+
static void tls_write_space(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
@@ -267,13 +287,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
tls_sw_free_resources_tx(sk);
+#ifdef CONFIG_TLS_DEVICE
+ } else if (ctx->tx_conf == TLS_HW) {
+ tls_device_free_resources_tx(sk);
+#endif
}
- if (ctx->rx_conf == TLS_SW) {
- kfree(ctx->rx.rec_seq);
- kfree(ctx->rx.iv);
+ if (ctx->rx_conf == TLS_SW)
tls_sw_free_resources_rx(sk);
- }
#ifdef CONFIG_TLS_DEVICE
if (ctx->rx_conf == TLS_HW)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4741edf4bb1e..c02293fb10e6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2065,20 +2065,7 @@ void tls_sw_free_resources_tx(struct sock *sk)
/* Free up un-sent records in tx_list. First, free
* the partially sent record if any at head of tx_list.
*/
- if (tls_ctx->partially_sent_record) {
- struct scatterlist *sg = tls_ctx->partially_sent_record;
-
- while (1) {
- put_page(sg_page(sg));
- sk_mem_uncharge(sk, sg->length);
-
- if (sg_is_last(sg))
- break;
- sg++;
- }
-
- tls_ctx->partially_sent_record = NULL;
-
+ if (tls_free_partial_record(sk, tls_ctx)) {
rec = list_first_entry(&ctx->tx_list,
struct tls_rec, list);
list_del(&rec->list);
@@ -2104,6 +2091,9 @@ void tls_sw_release_resources_rx(struct sock *sk)
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ kfree(tls_ctx->rx.rec_seq);
+ kfree(tls_ctx->rx.iv);
+
if (ctx->aead_recv) {
kfree_skb(ctx->recv_pkt);
ctx->recv_pkt = NULL;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ddb838a1b74c..e68d7454f2e3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2040,8 +2040,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb, *last;
long timeo;
+ int skip;
int err;
- int peeked, skip;
err = -EOPNOTSUPP;
if (flags&MSG_OOB)
@@ -2053,8 +2053,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
- skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
- &err, &last);
+ skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err,
+ &last);
if (skb)
break;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3aecdd3d5b07..e74d21f4108a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13800,7 +13800,8 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_associate,
.flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_DEAUTHENTICATE,
@@ -13845,14 +13846,16 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_connect,
.flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS,
.doit = nl80211_update_connect_params,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_DISCONNECT,
@@ -13877,7 +13880,8 @@ static const struct genl_ops nl80211_ops[] = {
.doit = nl80211_setdel_pmksa,
.flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_DEL_PMKSA,
@@ -14185,7 +14189,8 @@ static const struct genl_ops nl80211_ops[] = {
.dumpit = nl80211_vendor_cmd_dump,
.flags = GENL_UNS_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_SET_QOS_MAP,
@@ -14233,7 +14238,8 @@ static const struct genl_ops nl80211_ops[] = {
.cmd = NL80211_CMD_SET_PMK,
.doit = nl80211_set_pmk,
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
- NL80211_FLAG_NEED_RTNL,
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
},
{
.cmd = NL80211_CMD_DEL_PMK,
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 0d5b11d7c6ed..816425ffe05a 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1303,6 +1303,16 @@ reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1,
return dfs_region1;
}
+static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1,
+ const struct ieee80211_wmm_ac *wmm_ac2,
+ struct ieee80211_wmm_ac *intersect)
+{
+ intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min);
+ intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max);
+ intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot);
+ intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn);
+}
+
/*
* Helper for regdom_intersect(), this does the real
* mathematical intersection fun
@@ -1317,6 +1327,8 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1,
struct ieee80211_freq_range *freq_range;
const struct ieee80211_power_rule *power_rule1, *power_rule2;
struct ieee80211_power_rule *power_rule;
+ const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2;
+ struct ieee80211_wmm_rule *wmm_rule;
u32 freq_diff, max_bandwidth1, max_bandwidth2;
freq_range1 = &rule1->freq_range;
@@ -1327,6 +1339,10 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1,
power_rule2 = &rule2->power_rule;
power_rule = &intersected_rule->power_rule;
+ wmm_rule1 = &rule1->wmm_rule;
+ wmm_rule2 = &rule2->wmm_rule;
+ wmm_rule = &intersected_rule->wmm_rule;
+
freq_range->start_freq_khz = max(freq_range1->start_freq_khz,
freq_range2->start_freq_khz);
freq_range->end_freq_khz = min(freq_range1->end_freq_khz,
@@ -1370,6 +1386,29 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1,
intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms,
rule2->dfs_cac_ms);
+ if (rule1->has_wmm && rule2->has_wmm) {
+ u8 ac;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ reg_wmm_rules_intersect(&wmm_rule1->client[ac],
+ &wmm_rule2->client[ac],
+ &wmm_rule->client[ac]);
+ reg_wmm_rules_intersect(&wmm_rule1->ap[ac],
+ &wmm_rule2->ap[ac],
+ &wmm_rule->ap[ac]);
+ }
+
+ intersected_rule->has_wmm = true;
+ } else if (rule1->has_wmm) {
+ *wmm_rule = *wmm_rule1;
+ intersected_rule->has_wmm = true;
+ } else if (rule2->has_wmm) {
+ *wmm_rule = *wmm_rule2;
+ intersected_rule->has_wmm = true;
+ } else {
+ intersected_rule->has_wmm = false;
+ }
+
if (!is_valid_reg_rule(intersected_rule))
return -EINVAL;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 85dd3342d2c4..c04f5451f89b 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -241,10 +241,9 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
/* copy subelement as we need to change its content to
* mark an ie after it is processed.
*/
- sub_copy = kmalloc(subie_len, gfp);
+ sub_copy = kmemdup(subelement, subie_len, gfp);
if (!sub_copy)
return 0;
- memcpy(sub_copy, subelement, subie_len);
pos = &new_ie[0];
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 6c02c9cf7aa9..cf63b635afc0 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1229,9 +1229,11 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
else if (rate->bw == RATE_INFO_BW_HE_RU &&
rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
result = rates_26[rate->he_gi];
- else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
- rate->bw, rate->he_ru_alloc))
+ else {
+ WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
+ rate->bw, rate->he_ru_alloc);
return 0;
+ }
/* now scale to the appropriate MCS */
tmp = result;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 20a511398389..0ea48a52ce79 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1398,18 +1398,6 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
- case SIOCGSTAMP:
- rc = -EINVAL;
- if (sk)
- rc = sock_get_timestamp(sk,
- (struct timeval __user *)argp);
- break;
- case SIOCGSTAMPNS:
- rc = -EINVAL;
- if (sk)
- rc = sock_get_timestampns(sk,
- (struct timespec __user *)argp);
- break;
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFDSTADDR:
@@ -1681,8 +1669,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
void __user *argp = compat_ptr(arg);
- struct sock *sk = sock->sk;
-
int rc = -ENOIOCTLCMD;
switch(cmd) {
@@ -1690,18 +1676,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
case TIOCINQ:
rc = x25_ioctl(sock, cmd, (unsigned long)argp);
break;
- case SIOCGSTAMP:
- rc = -EINVAL;
- if (sk)
- rc = compat_sock_get_timestamp(sk,
- (struct timeval __user*)argp);
- break;
- case SIOCGSTAMPNS:
- rc = -EINVAL;
- if (sk)
- rc = compat_sock_get_timestampns(sk,
- (struct timespec __user*)argp);
- break;
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFDSTADDR:
@@ -1765,6 +1739,7 @@ static const struct proto_ops x25_proto_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_x25_ioctl,
#endif
+ .gettstamp = sock_gettstamp,
.listen = x25_listen,
.shutdown = sock_no_shutdown,
.setsockopt = x25_setsockopt,
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 610c0bdc0c2b..88b9ae24658d 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -43,6 +43,48 @@ struct xsk_queue {
u64 invalid_descs;
};
+/* The structure of the shared state of the rings are the same as the
+ * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion
+ * ring, the kernel is the producer and user space is the consumer. For
+ * the Tx and fill rings, the kernel is the consumer and user space is
+ * the producer.
+ *
+ * producer consumer
+ *
+ * if (LOAD ->consumer) { LOAD ->producer
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->producer STORE ->consumer
+ * }
+ *
+ * (A) pairs with (D), and (B) pairs with (C).
+ *
+ * Starting with (B), it protects the data from being written after
+ * the producer pointer. If this barrier was missing, the consumer
+ * could observe the producer pointer being set and thus load the data
+ * before the producer has written the new data. The consumer would in
+ * this case load the old data.
+ *
+ * (C) protects the consumer from speculatively loading the data before
+ * the producer pointer actually has been read. If we do not have this
+ * barrier, some architectures could load old data as speculative loads
+ * are not discarded as the CPU does not know there is a dependency
+ * between ->producer and data.
+ *
+ * (A) is a control dependency that separates the load of ->consumer
+ * from the stores of $data. In case ->consumer indicates there is no
+ * room in the buffer to store $data we do not. So no barrier is needed.
+ *
+ * (D) protects the load of the data to be observed to happen after the
+ * store of the consumer pointer. If we did not have this memory
+ * barrier, the producer could observe the consumer pointer being set
+ * and overwrite the data with a new value before the consumer got the
+ * chance to read the old value. The consumer would thus miss reading
+ * the old entry and very likely read the new entry twice, once right
+ * now and again after circling through the ring.
+ */
+
/* Common functions operating for both RXTX and umem queues */
static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
@@ -106,6 +148,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
{
if (q->cons_tail == q->cons_head) {
+ smp_mb(); /* D, matches A */
WRITE_ONCE(q->ring->consumer, q->cons_tail);
q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
@@ -128,10 +171,11 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
if (xskq_nb_free(q, q->prod_tail, 1) == 0)
return -ENOSPC;
+ /* A, matches D */
ring->desc[q->prod_tail++ & q->ring_mask] = addr;
/* Order producer and data */
- smp_wmb();
+ smp_wmb(); /* B, matches C */
WRITE_ONCE(q->ring->producer, q->prod_tail);
return 0;
@@ -144,6 +188,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
return -ENOSPC;
+ /* A, matches D */
ring->desc[q->prod_head++ & q->ring_mask] = addr;
return 0;
}
@@ -152,7 +197,7 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
u32 nb_entries)
{
/* Order producer and data */
- smp_wmb();
+ smp_wmb(); /* B, matches C */
q->prod_tail += nb_entries;
WRITE_ONCE(q->ring->producer, q->prod_tail);
@@ -163,6 +208,7 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
if (xskq_nb_free(q, q->prod_head, 1) == 0)
return -ENOSPC;
+ /* A, matches D */
q->prod_head++;
return 0;
}
@@ -204,11 +250,12 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
struct xdp_desc *desc)
{
if (q->cons_tail == q->cons_head) {
+ smp_mb(); /* D, matches A */
WRITE_ONCE(q->ring->consumer, q->cons_tail);
q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
/* Order consumer and data */
- smp_rmb();
+ smp_rmb(); /* C, matches B */
}
return xskq_validate_desc(q, desc);
@@ -228,6 +275,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
if (xskq_nb_free(q, q->prod_head, 1) == 0)
return -ENOSPC;
+ /* A, matches D */
idx = (q->prod_head++) & q->ring_mask;
ring->desc[idx].addr = addr;
ring->desc[idx].len = len;
@@ -238,7 +286,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
static inline void xskq_produce_flush_desc(struct xsk_queue *q)
{
/* Order producer and data */
- smp_wmb();
+ smp_wmb(); /* B, matches C */
q->prod_tail = q->prod_head,
WRITE_ONCE(q->ring->producer, q->prod_tail);