Diffstat (limited to 'net'): 225 files changed, 6250 insertions, 3148 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 1b7a375c6616..dc4411165e43 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -358,6 +358,7 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; @@ -460,7 +461,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs | IFF_UP); + dev_change_flags(vlandev, flgs | IFF_UP, + extack); netif_stacked_transfer_operstate(dev, vlandev); } break; @@ -648,93 +650,6 @@ out: return err; } -static struct sk_buff *vlan_gro_receive(struct list_head *head, - struct sk_buff *skb) -{ - const struct packet_offload *ptype; - unsigned int hlen, off_vlan; - struct sk_buff *pp = NULL; - struct vlan_hdr *vhdr; - struct sk_buff *p; - __be16 type; - int flush = 1; - - off_vlan = skb_gro_offset(skb); - hlen = off_vlan + sizeof(*vhdr); - vhdr = skb_gro_header_fast(skb, off_vlan); - if (skb_gro_header_hard(skb, hlen)) { - vhdr = skb_gro_header_slow(skb, hlen, off_vlan); - if (unlikely(!vhdr)) - goto out; - } - - type = vhdr->h_vlan_encapsulated_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) - goto out_unlock; - - flush = 0; - - list_for_each_entry(p, head, list) { - struct vlan_hdr *vhdr2; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); - if (compare_vlan_header(vhdr, vhdr2)) - NAPI_GRO_CB(p)->same_flow = 0; - } - - skb_gro_pull(skb, sizeof(*vhdr)); - skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); - -out_unlock: - rcu_read_unlock(); -out: - skb_gro_flush_final(skb, pp, flush); - - return pp; -} - -static int vlan_gro_complete(struct sk_buff *skb, int nhoff) -{ - struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); - __be16 type = vhdr->h_vlan_encapsulated_proto; - struct packet_offload *ptype; - int err = -ENOENT; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); - - rcu_read_unlock(); - return err; -} - -static struct packet_offload vlan_packet_offloads[] __read_mostly = { - { - .type = cpu_to_be16(ETH_P_8021Q), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, - { - .type = cpu_to_be16(ETH_P_8021AD), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, -}; - static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -762,7 +677,6 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; - unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -786,9 +700,6 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; - for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) - dev_add_offload(&vlan_packet_offloads[i]); - vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -806,13 +717,8 @@ err0: static void __exit vlan_cleanup_module(void) { - unsigned int i; - vlan_ioctl_set(NULL); - for (i = 0; i < 
ARRAY_SIZE(vlan_packet_offloads); i++) - dev_remove_offload(&vlan_packet_offloads[i]); - vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4f60e86f4b8d..a313165e7a67 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -57,7 +57,7 @@ bool vlan_do_receive(struct sk_buff **skbp) } skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats); @@ -223,6 +223,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi return -ENODEV; } +int vlan_for_each(struct net_device *dev, + int (*action)(struct net_device *dev, int vid, void *arg), + void *arg) +{ + struct vlan_vid_info *vid_info; + struct vlan_info *vlan_info; + struct net_device *vdev; + int ret; + + ASSERT_RTNL(); + + vlan_info = rtnl_dereference(dev->vlan_info); + if (!vlan_info) + return 0; + + list_for_each_entry(vid_info, &vlan_info->vid_list, list) { + vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto, + vid_info->vid); + ret = action(vdev, vid_info->vid, arg); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL(vlan_for_each); + int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) { struct net_device *real_dev = vlan_info->real_dev; @@ -426,3 +453,102 @@ bool vlan_uses_dev(const struct net_device *dev) return vlan_info->grp.nr_vlan_devs ? true : false; } EXPORT_SYMBOL(vlan_uses_dev); + +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) +{ + const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + list_for_each_entry(p, head, list) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + +out_unlock: + rcu_read_unlock(); +out: + skb_gro_flush_final(skb, pp, flush); + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + +static int 
__init vlan_offload_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + + return 0; +} + +fs_initcall(vlan_offload_init); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index f75816f58107..c386e6981416 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -22,7 +22,6 @@ config BATMAN_ADV tristate "B.A.T.M.A.N. Advanced Meshing Protocol" depends on NET - select CRC16 select LIBCRC32C help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is @@ -48,6 +47,7 @@ config BATMAN_ADV_BATMAN_V config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET + select CRC16 default y help This option enables BLA (Bridge Loop Avoidance), a mechanism @@ -82,6 +82,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) + default y help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -100,12 +101,13 @@ config BATMAN_ADV_DEBUGFS config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" - depends on BATMAN_ADV_DEBUGFS + depends on BATMAN_ADV help This is an option for use by developers; most people should say N here. This enables compilation of support for - outputting debugging information to the kernel log. The - output is controlled via the module parameter debug. + outputting debugging information to the debugfs log or tracing + buffer. The output is controlled via the batadv netdev specific + log_level setting. config BATMAN_ADV_TRACING bool "B.A.T.M.A.N. tracing support" diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index d2227091029f..f97e566f0402 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -34,7 +34,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> @@ -2585,13 +2584,14 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv, * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -2611,13 +2611,16 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) @@ -2668,13 +2671,15 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = 
bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -2682,7 +2687,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 6baec4e68898..90e33f84d37a 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -27,11 +27,13 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -915,13 +917,14 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv, * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -941,13 +944,16 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) { @@ -1018,13 +1024,15 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -1032,7 +1040,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 9f481cfdf77d..e8090f099eb8 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -352,19 +352,21 @@ out: */ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) { + static const size_t tvlv_padding = sizeof(__be32); struct batadv_elp_packet *elp_packet; unsigned char *elp_buff; u32 random_seqno; size_t size; int res = -ENOMEM; - size = 
ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN; + size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN + tvlv_padding; hard_iface->bat_v.elp_skb = dev_alloc_skb(size); if (!hard_iface->bat_v.elp_skb) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, + BATADV_ELP_HLEN + tvlv_padding); elp_packet = (struct batadv_elp_packet *)elp_buff; elp_packet->packet_type = BATADV_ELP; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5f1aeeded0e3..5fdde2947802 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2094,14 +2094,15 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { @@ -2111,13 +2112,16 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); @@ -2153,28 +2157,33 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. 
*/ static int -batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(claim, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_claim_dump_entry(msg, portid, seq, + ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; @@ -2184,7 +2193,7 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2204,7 +2213,6 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2230,11 +2238,8 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_claim_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } @@ -2325,14 +2330,15 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { @@ -2343,13 +2349,16 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); @@ -2386,28 +2395,33 @@ out: * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. 
*/ static int -batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, + unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_backbone_dump_entry(msg, portid, seq, + ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; @@ -2417,7 +2431,7 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2437,7 +2451,6 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2463,11 +2476,8 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_backbone_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 8b608a2e2653..d4a7702e48d8 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "debugfs.h" #include "main.h" +#include <asm/current.h> #include <linux/dcache.h> #include <linux/debugfs.h> #include <linux/err.h> @@ -27,6 +28,7 @@ #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/printk.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/stddef.h> diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a60bacf7120b..b9ffe1826527 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -863,23 +863,27 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @dat_entry: entry to dump * * Return: 0 or error code. 
*/ static int -batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_dat_entry *dat_entry) { int msecs; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_DAT_CACHE); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, @@ -901,27 +905,31 @@ batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, int *idx_skip) +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_dat_entry *dat_entry; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) { if (idx < *idx_skip) goto skip; - if (batadv_dat_cache_dump_entry(msg, portid, seq, - dat_entry)) { - rcu_read_unlock(); + if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -930,7 +938,7 @@ batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -951,7 +959,6 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -977,10 +984,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_dat_cache_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, head, + if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket, &idx)) break; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0fddc17106bd..5b71a289d04f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -275,7 +275,7 @@ batadv_frag_merge_packets(struct hlist_head *chain) kfree(entry); packet = (struct batadv_frag_packet *)skb_out->data; - size = ntohs(packet->total_size); + size = ntohs(packet->total_size) + hdr_size; /* Make room for the rest of the fragments. 
*/ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 140c61a3f1ec..9d8e5eda2314 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -377,6 +377,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); + bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -472,6 +473,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); @@ -518,6 +520,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 781c5b6e6e8e..508f4416dfc9 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -951,6 +951,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); + batadv_hardif_generation++; return hard_iface; @@ -993,6 +994,7 @@ void batadv_hardif_remove_interfaces(void) list_for_each_entry_safe(hard_iface, hard_iface_tmp, &batadv_hardif_list, list) { list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); } rtnl_unlock(); @@ -1054,6 +1056,7 @@ static int batadv_hard_if_event(struct notifier_block *this, case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7b49e4001778..9194f4d891b1 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -32,6 +32,8 @@ static void batadv_hash_init(struct batadv_hashtable *hash) INIT_HLIST_HEAD(&hash->table[i]); spin_lock_init(&hash->list_locks[i]); } + + atomic_set(&hash->generation, 0); } /** diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 9490a7ca2ba6..0e36fa1c7c3e 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -21,6 +21,7 @@ #include "main.h" +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/rculist.h> @@ -58,6 +59,9 @@ struct batadv_hashtable { /** @size: size of hashtable */ u32 size; + + /** @generation: current (generation) sequence number */ + atomic_t generation; }; /* allocates and clears the hash */ @@ -112,6 +116,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash, /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); + atomic_inc(&hash->generation); ret = 0; @@ -154,6 +159,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, data_save = node; hlist_del_rcu(node); + atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 6beb5f067810..02e55b78132f 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -43,6 +43,8 @@ #include "debugfs.h" #include "trace.h" 
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS + #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; @@ -92,33 +94,6 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, return 0; } -/** - * batadv_debug_log() - Add debug log entry - * @bat_priv: the bat priv with all the soft interface information - * @fmt: format string - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", - jiffies_to_msecs(jiffies), &vaf); - - trace_batadv_dbg(bat_priv, &vaf); - - va_end(args); - - return 0; -} - static int batadv_log_open(struct inode *inode, struct file *file) { if (!try_module_get(THIS_MODULE)) @@ -259,3 +234,34 @@ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) kfree(bat_priv->debug_log); bat_priv->debug_log = NULL; } + +#endif /* CONFIG_BATMAN_ADV_DEBUGFS */ + +/** + * batadv_debug_log() - Add debug log entry + * @bat_priv: the bat priv with all the soft interface information + * @fmt: format string + * + * Return: 0 on success or negative error number in case of failure + */ +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", + jiffies_to_msecs(jiffies), &vaf); +#endif + + trace_batadv_dbg(bat_priv, &vaf); + + va_end(args); + + return 0; +} diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 69c0d85bceb3..d1ed839fd32b 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -74,6 +74,7 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; +unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -186,6 +187,8 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); + bat_priv->gw.generation = 0; + ret = batadv_v_mesh_init(bat_priv); if (ret < 0) goto err; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2002b70e18db..b572066325e4 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.4" +#define BATADV_SOURCE_VERSION "2019.0" #endif /* B.A.T.M.A.N. parameters */ @@ -247,6 +247,7 @@ static inline int batadv_print_vid(unsigned short vid) } extern struct list_head batadv_hardif_list; +extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 86725d792e15..69244e4598f5 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1365,22 +1365,26 @@ int batadv_mcast_mesh_info_put(struct sk_buff *msg, * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. 
*/ static int -batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); @@ -1405,21 +1409,26 @@ batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, long *idx_skip) +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, + unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; @@ -1427,9 +1436,8 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, if (idx < *idx_skip) goto skip; - if (batadv_mcast_flags_dump_entry(msg, portid, seq, - orig_node)) { - rcu_read_unlock(); + if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -1438,7 +1446,7 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -1447,7 +1455,7 @@ skip: * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: the bat priv with all the soft interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump @@ -1455,19 +1463,17 @@ skip: * Return: 0 or error code. 
*/ static int -__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; - struct hlist_head *head; long idx_tmp = *idx; while (bucket_tmp < hash->size) { - head = &hash->table[bucket_tmp]; - - if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, - &idx_tmp)) + if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, + *bucket, &idx_tmp)) break; bucket_tmp++; @@ -1550,8 +1556,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; bat_priv = netdev_priv(primary_if->soft_iface); - ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, bucket, idx); + ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0d9459b69bdb..2dc3304cee54 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -29,11 +29,11 @@ #include <linux/if_ether.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> @@ -445,23 +445,27 @@ out: * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @hard_iface: Hard interface to dump * * Return: error code, or 0 on success */ static int -batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *hard_iface) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_HARDIFS); if (!hdr) return -EMSGSIZE; + genl_dump_check_consistent(cb, hdr); + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -498,7 +502,6 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hard_iface *hard_iface; int ifindex; int portid = NETLINK_CB(cb->skb).portid; - int seq = cb->nlh->nlmsg_seq; int skip = cb->args[0]; int i = 0; @@ -516,23 +519,24 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) return -ENODEV; } - rcu_read_lock(); + rtnl_lock(); + cb->seq = batadv_hardif_generation << 1 | 1; - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (i++ < skip) continue; - if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + if (batadv_netlink_dump_hardif_entry(msg, portid, cb, hard_iface)) { i--; break; } } - rcu_read_unlock(); + rtnl_unlock(); dev_put(soft_iface); diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3d57f9981f25..8e1024217cff 100644 --- a/net/batman-adv/trace.c +++ 
b/net/batman-adv/trace.c @@ -16,7 +16,5 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include <linux/module.h> - #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 3acda26a30ca..104784be94d7 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -21,7 +21,13 @@ #include "main.h" +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/percpu.h> +#include <linux/printk.h> #include <linux/tracepoint.h> +#include <linux/types.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index d21624c44665..8dcd4968cde7 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1145,14 +1145,15 @@ out: * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { @@ -1173,12 +1174,14 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, batadv_softif_vlan_put(vlan); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || @@ -1201,34 +1204,39 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @head: Pointer to the list containing the local tt entries + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, - struct hlist_head *head, int *idx_s) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(common, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; - if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } - 
rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; @@ -1248,7 +1256,6 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; - struct hlist_head *head; int ret; int ifindex; int bucket = cb->args[0]; @@ -1276,10 +1283,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) hash = bat_priv->tt.local_hash; while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, head, &idx)) + if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, + hash, bucket, &idx)) break; bucket++; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 45b5592de816..cbe17da36fcb 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1096,12 +1096,15 @@ struct batadv_priv_gw { /** @gateway_list: list of available gateway nodes */ struct hlist_head gateway_list; - /** @list_lock: lock protecting gateway_list & curr_gw */ + /** @list_lock: lock protecting gateway_list, curr_gw, generation */ spinlock_t list_lock; /** @curr_gw: pointer to currently selected gateway node */ struct batadv_gw_node __rcu *curr_gw; + /** @generation: current (generation) sequence number */ + unsigned int generation; + /** * @mode: gateway operation: off, client or server (see batadv_gw_modes) */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 828e87fe8027..9d79c7de234a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -607,7 +607,7 @@ static void ifup(struct net_device *netdev) int err; rtnl_lock(); - err = dev_open(netdev); + err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c89c22c49015..fa2644d276ef 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, return ret; } -static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, + u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; - u32 ret = 0, i; + u32 i; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx, storage); + *ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return ret; + return 0; } static int bpf_test_finish(const union bpf_attr *kattr, @@ -74,8 +75,18 @@ static int bpf_test_finish(const union bpf_attr *kattr, { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; + u32 copy_size = size; + + /* Clamp copy if the user has provided a size hint, but copy the full + * buffer if not to retain old behaviour. 
+ */ + if (kattr->test.data_size_out && + copy_size > kattr->test.data_size_out) { + copy_size = kattr->test.data_size_out; + err = -ENOSPC; + } - if (data_out && copy_to_user(data_out, data, size)) + if (data_out && copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; @@ -83,7 +94,8 @@ static int bpf_test_finish(const union bpf_attr *kattr, goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; - err = 0; + if (err != -ENOSPC) + err = 0; out: return err; } @@ -165,7 +177,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); - retval = bpf_test_run(prog, skb, repeat, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + if (ret) { + kfree_skb(skb); + kfree(sk); + return ret; + } if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -212,11 +229,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - retval = bpf_test_run(prog, &xdp, repeat, &duration); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + if (ret) + goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: kfree(data); return ret; } diff --git a/net/bridge/br.c b/net/bridge/br.c index 360ad66c21e9..a5174e5001d8 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -31,6 +31,8 @@ */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); + struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; @@ -56,6 +58,17 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_mtu_auto_adjust(br); break; + case NETDEV_PRE_CHANGEADDR: + if (br->dev->addr_assign_type == NET_ADDR_SET) + break; + prechaddr_info = ptr; + err = dev_pre_changeaddr_notify(br->dev, + prechaddr_info->dev_addr, + extack); + if (err) + return notifier_from_errno(err); + break; + case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); @@ -175,6 +188,82 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* br_boolopt_toggle - change user-controlled boolean option + * + * @br: bridge device + * @opt: id of the option to change + * @on: new option value + * @extack: extack for error messages + * + * Changes the value of the respective boolean option to @on taking care of + * any internal option value mapping and configuration. 
+ */ +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + br_opt_toggle(br, BROPT_NO_LL_LEARN, on); + break; + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + return br_opt_get(br, BROPT_NO_LL_LEARN); + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack) +{ + unsigned long bitmap = bm->optmask; + int err = 0; + int opt_id; + + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { + bool on = !!(bm->optval & BIT(opt_id)); + + err = br_boolopt_toggle(br, opt_id, on, extack); + if (err) { + br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", + opt_id, br_boolopt_get(br, opt_id), on, err); + break; + } + } + + return err; +} + +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm) +{ + u32 optval = 0; + int opt_id; + + for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) + optval |= (br_boolopt_get(br, opt_id) << opt_id); + + bm->optval = optval; + bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); +} + +/* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c6abf927f0c9..013323b6dbe4 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev) return err; } + err = br_mdb_hash_init(br); + if (err) { + free_percpu(br->stats); + br_fdb_hash_fini(br); + return err; + } + err = br_vlan_init(br); if (err) { free_percpu(br->stats); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; } @@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev) if (err) { free_percpu(br->stats); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); } br_set_lockdep_class(dev); @@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev) br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); free_percpu(br->stats); } @@ -393,6 +403,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, + .ndo_fdb_get = br_fdb_get, .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e56ba3912a90..fe3c758791ca 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -773,6 +773,32 @@ skip: return err; } +int br_fdb_get(struct sk_buff *skb, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 vid, u32 portid, u32 seq, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_fdb_entry *f; + int err = 0; + + rcu_read_lock(); + f = br_fdb_find_rcu(br, addr, vid); + if (!f) { + NL_SET_ERR_MSG(extack, "Fdb entry not found"); + err = -ENOENT; + goto errout; + } + + err = fdb_fill_info(skb, br, f, portid, seq, + RTM_NEWNEIGH, 0); +errout: + rcu_read_unlock(); + return err; +} + /* Update (create or 
replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, @@ -1164,3 +1190,23 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_unlock_bh(&br->hash_lock); } + +void br_fdb_clear_offload(const struct net_device *dev, u16 vid) +{ + struct net_bridge_fdb_entry *f; + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(dev); + if (!p) + return; + + spin_lock_bh(&p->br->hash_lock); + hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { + if (f->dst == p && f->key.vlan_id == vid) + f->offloaded = 0; + } + spin_unlock_bh(&p->br->hash_lock); +} +EXPORT_SYMBOL_GPL(br_fdb_clear_offload); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 9b46d2dc4c22..41f0a696a65f 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -650,7 +650,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); - err = nbp_vlan_init(p); + if (br->dev->addr_assign_type != NET_ADDR_SET) { + /* Ask for permission to use this MAC address now, even if we + * don't end up choosing it below. + */ + err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); + if (err) + goto err7; + } + + err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err7; @@ -741,3 +750,15 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_NEIGH_SUPPRESS) br_recalculate_neigh_suppress_enabled(br); } + +bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) +{ + struct net_bridge_port *p; + + p = br_port_get_rtnl_rcu(dev); + if (!p) + return false; + + return p->flags & flag; +} +EXPORT_SYMBOL_GPL(br_port_flag_is_set); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 3ddca11f44c2..5ea7e56119c1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -188,7 +188,9 @@ static void __br_handle_local_finish(struct sk_buff *skb) u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ - if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) + if ((p->flags & BR_LEARNING) && + !br_opt_get(p->br, BROPT_NO_LL_LEARN) && + br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a7ea2d431714..f69c8d91dc81 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { + int idx = 0, s_idx = cb->args[1], err = 0; struct net_bridge *br = netdev_priv(dev); - struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; - int i, err = 0; - int idx = 0, s_idx = cb->args[1]; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; - mdb = rcu_dereference(br->mdb); - if (!mdb) - return 0; - nest = nla_nest_start(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; - for (i = 0; i < mdb->max; i++) { - struct net_bridge_mdb_entry *mp; + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct net_bridge_port *port; - hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) { - if (idx < s_idx) - goto skip; + if 
(idx < s_idx) + goto skip; - nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); - if (nest2 == NULL) { - err = -EMSGSIZE; - goto out; - } + nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); + if (!nest2) { + err = -EMSGSIZE; + break; + } - for (pp = &mp->ports; - (p = rcu_dereference(*pp)) != NULL; - pp = &p->next) { - struct nlattr *nest_ent; - struct br_mdb_entry e; - - port = p->port; - if (!port) - continue; - - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.vid = p->addr.vid; - __mdb_entry_fill_flags(&e, p->flags); - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + + port = p->port; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - nest_ent = nla_nest_start(skb, - MDBA_MDB_ENTRY_INFO); - if (!nest_ent) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - if (nla_put_nohdr(skb, sizeof(e), &e) || - nla_put_u32(skb, - MDBA_MDB_EATTR_TIMER, - br_timer_value(&p->timer))) { - nla_nest_cancel(skb, nest_ent); - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - nla_nest_end(skb, nest_ent); + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } - nla_nest_end(skb, nest2); - skip: - idx++; + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + nla_nest_end(skb, nest_ent); } + nla_nest_end(skb, nest2); +skip: + idx++; } out: @@ -203,8 +193,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); - /* In theory this could be wrapped to 0... 
*/ - cb->seq = net->dev_base_seq + br_mdb_rehash_seq; + cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_EBRIDGE) { @@ -297,7 +286,6 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) struct br_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; @@ -306,8 +294,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) goto err; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, &data->ip); + mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; @@ -344,7 +331,7 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, mdb.obj.orig_dev = dev; switch (type) { case RTM_NEWMDB: - switchdev_port_obj_add(lower_dev, &mdb.obj); + switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); @@ -394,7 +381,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, __mdb_entry_to_br_ip(entry, &complete_info->ip); mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; - if (switchdev_port_obj_add(port_dev, &mdb.obj)) + if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL)) kfree(complete_info); } } else if (p && port_dev && type == RTM_DELMDB) { @@ -588,14 +575,12 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct net_bridge_mdb_htable *mdb; unsigned long now = jiffies; int err; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) { - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -696,7 +681,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; @@ -709,9 +693,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &ip); + mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; @@ -728,7 +710,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); err = 0; if (!mp->ports && !mp->host_joined && diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6bac0d6b7b94..879cd2315769 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,6 +37,14 @@ #include "br_private.h" +static const struct rhashtable_params br_mdb_rht_params = { + .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), + .key_offset = offsetof(struct net_bridge_mdb_entry, addr), + .key_len = sizeof(struct br_ip), + .automatic_shrinking = true, + .locks_mul = 1, +}; + static void br_multicast_start_querier(struct net_bridge *br, struct 
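/* Illustrative note, not part of this patch: the MDB moves from the old
 * resizable, hand-rolled hash table to a per-bridge rhashtable keyed on the
 * whole struct br_ip (key_len above).  The default rhashtable key compare is
 * a byte-wise memcmp over key_len, which is why the helpers below now
 * memset() every stack-built struct br_ip before filling it, since stale
 * padding bytes would otherwise defeat lookups.  A lookup reduces to roughly:
 *
 *	struct br_ip ip;
 *	struct net_bridge_mdb_entry *mp;
 *
 *	memset(&ip, 0, sizeof(ip));
 *	ip.proto = htons(ETH_P_IP);
 *	ip.vid = vid;
 *	ip.u.ip4 = group;
 *	mp = rhashtable_lookup(&br->mdb_hash_tbl, &ip, br_mdb_rht_params);
 *
 * (sketch; assumes the RCU read side or br->multicast_lock is held, as in
 * br_mdb_ip_get()/br_mdb_ip_get_rcu() below).
 */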
bridge_mcast_own_query *query); static void br_multicast_add_router(struct net_bridge *br, @@ -54,7 +62,6 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif -unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) { @@ -73,89 +80,58 @@ static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) return 0; } -static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip, - __u16 vid) -{ - return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb, - const struct in6_addr *ip, - __u16 vid) -{ - return jhash_2words(ipv6_addr_hash(ip), vid, - mdb->secret) & (mdb->max - 1); -} -#endif - -static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, - struct br_ip *ip) +static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, + struct br_ip *dst) { - switch (ip->proto) { - case htons(ETH_P_IP): - return __br_ip4_hash(mdb, ip->u.ip4, ip->vid); -#if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid); -#endif - } - return 0; + return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); } -static struct net_bridge_mdb_entry *__br_mdb_ip_get( - struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash) +struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br, + struct br_ip *dst) { - struct net_bridge_mdb_entry *mp; + struct net_bridge_mdb_entry *ent; - hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - if (br_ip_equal(&mp->addr, dst)) - return mp; - } - - return NULL; -} + lockdep_assert_held_once(&br->multicast_lock); -struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, - struct br_ip *dst) -{ - if (!mdb) - return NULL; + rcu_read_lock(); + ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); + rcu_read_unlock(); - return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst)); + return ent; } -static struct net_bridge_mdb_entry *br_mdb_ip4_get( - struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, + __be32 dst, __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) -static struct net_bridge_mdb_entry *br_mdb_ip6_get( - struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst, - __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, + const struct in6_addr *dst, + __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) @@ -164,6 +140,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; + memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; @@ -180,70 +157,13 @@ struct net_bridge_mdb_entry *br_mdb_get(struct 
net_bridge *br, return NULL; } - return br_mdb_ip_get(mdb, &ip); -} - -static void br_mdb_free(struct rcu_head *head) -{ - struct net_bridge_mdb_htable *mdb = - container_of(head, struct net_bridge_mdb_htable, rcu); - struct net_bridge_mdb_htable *old = mdb->old; - - mdb->old = NULL; - kfree(old->mhash); - kfree(old); -} - -static int br_mdb_copy(struct net_bridge_mdb_htable *new, - struct net_bridge_mdb_htable *old, - int elasticity) -{ - struct net_bridge_mdb_entry *mp; - int maxlen; - int len; - int i; - - for (i = 0; i < old->max; i++) - hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver]) - hlist_add_head(&mp->hlist[new->ver], - &new->mhash[br_ip_hash(new, &mp->addr)]); - - if (!elasticity) - return 0; - - maxlen = 0; - for (i = 0; i < new->max; i++) { - len = 0; - hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver]) - len++; - if (len > maxlen) - maxlen = len; - } - - return maxlen > elasticity ? -EINVAL : 0; -} - -void br_multicast_free_pg(struct rcu_head *head) -{ - struct net_bridge_port_group *p = - container_of(head, struct net_bridge_port_group, rcu); - - kfree(p); -} - -static void br_multicast_free_group(struct rcu_head *head) -{ - struct net_bridge_mdb_entry *mp = - container_of(head, struct net_bridge_mdb_entry, rcu); - - kfree(mp); + return br_mdb_ip_get_rcu(br, &ip); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; - struct net_bridge_mdb_htable *mdb; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&mp->timer)) @@ -255,12 +175,11 @@ static void br_multicast_group_expired(struct timer_list *t) if (mp->ports) goto out; - mdb = mlock_dereference(br->mdb, br); + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); - hlist_del_rcu(&mp->hlist[mdb->ver]); - mdb->size--; - - call_rcu_bh(&mp->rcu, br_multicast_free_group); + kfree_rcu(mp, rcu); out: spin_unlock(&br->multicast_lock); @@ -269,14 +188,11 @@ out: static void br_multicast_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &pg->addr); + mp = br_mdb_ip_get(br, &pg->addr); if (WARN_ON(!mp)) return; @@ -291,7 +207,7 @@ static void br_multicast_del_pg(struct net_bridge *br, del_timer(&p->timer); br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, p->flags); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) @@ -319,53 +235,6 @@ out: spin_unlock(&br->multicast_lock); } -static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, - int elasticity) -{ - struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1); - struct net_bridge_mdb_htable *mdb; - int err; - - mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC); - if (!mdb) - return -ENOMEM; - - mdb->max = max; - mdb->old = old; - - mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC); - if (!mdb->mhash) { - kfree(mdb); - return -ENOMEM; - } - - mdb->size = old ? old->size : 0; - mdb->ver = old ? 
old->ver ^ 1 : 0; - - if (!old || elasticity) - get_random_bytes(&mdb->secret, sizeof(mdb->secret)); - else - mdb->secret = old->secret; - - if (!old) - goto out; - - err = br_mdb_copy(mdb, old, elasticity); - if (err) { - kfree(mdb->mhash); - kfree(mdb); - return err; - } - - br_mdb_rehash_seq++; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: - rcu_assign_pointer(*mdbp, mdb); - - return 0; -} - static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, __be32 group, u8 *igmp_type) @@ -589,111 +458,19 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, return NULL; } -static struct net_bridge_mdb_entry *br_multicast_get_group( - struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, int hash) -{ - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - unsigned int count = 0; - unsigned int max; - int elasticity; - int err; - - mdb = rcu_dereference_protected(br->mdb, 1); - hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - count++; - if (unlikely(br_ip_equal(group, &mp->addr))) - return mp; - } - - elasticity = 0; - max = mdb->max; - - if (unlikely(count > br->hash_elasticity && count)) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "chain limit reached: %s\n", - port ? port->dev->name : br->dev->name); - - elasticity = br->hash_elasticity; - } - - if (mdb->size >= max) { - max *= 2; - if (unlikely(max > br->hash_max)) { - br_warn(br, "Multicast hash table maximum of %d " - "reached, disabling snooping: %s\n", - br->hash_max, - port ? port->dev->name : br->dev->name); - err = -E2BIG; -disable: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto err; - } - } - - if (max > mdb->max || elasticity) { - if (mdb->old) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "on fire: %s\n", - port ? port->dev->name : br->dev->name); - err = -EEXIST; - goto err; - } - - err = br_mdb_rehash(&br->mdb, max, elasticity); - if (err) { - br_warn(br, "Cannot rehash multicast " - "hash table, disabling snooping: %s, %d, %d\n", - port ? 
port->dev->name : br->dev->name, - mdb->size, err); - goto disable; - } - - err = -EAGAIN; - goto err; - } - - return NULL; - -err: - mp = ERR_PTR(err); - return mp; -} - struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, - struct net_bridge_port *p, struct br_ip *group) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - int hash; int err; - mdb = rcu_dereference_protected(br->mdb, 1); - if (!mdb) { - err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0); - if (err) - return ERR_PTR(err); - goto rehash; - } - - hash = br_ip_hash(mdb, group); - mp = br_multicast_get_group(br, p, group, hash); - switch (PTR_ERR(mp)) { - case 0: - break; - - case -EAGAIN: -rehash: - mdb = rcu_dereference_protected(br->mdb, 1); - hash = br_ip_hash(mdb, group); - break; + mp = br_mdb_ip_get(br, group); + if (mp) + return mp; - default: - goto out; + if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); + return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); @@ -703,11 +480,15 @@ rehash: mp->br = br; mp->addr = *group; timer_setup(&mp->timer, br_multicast_group_expired, 0); + err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + if (err) { + kfree(mp); + mp = ERR_PTR(err); + } else { + hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); + } - hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); - mdb->size++; - -out: return mp; } @@ -768,7 +549,7 @@ static int br_multicast_add_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR(mp); if (IS_ERR(mp)) goto err; @@ -837,6 +618,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return 0; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; @@ -1483,7 +1265,7 @@ static void br_ip4_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip4_get(br, group, vid); if (!mp) goto out; @@ -1567,7 +1349,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip6_get(br, group, vid); if (!mp) goto out; @@ -1601,7 +1383,6 @@ br_multicast_leave_group(struct net_bridge *br, struct bridge_mcast_own_query *own_query, const unsigned char *src) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; @@ -1612,8 +1393,7 @@ br_multicast_leave_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) goto out; @@ -1629,7 +1409,7 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); @@ -1961,8 +1741,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) void br_multicast_init(struct net_bridge *br) { - br->hash_elasticity = 4; - br->hash_max = 512; + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_last_member_count = 2; @@ -1999,6 
+1778,7 @@ void br_multicast_init(struct net_bridge *br) timer_setup(&br->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif + INIT_HLIST_HEAD(&br->mdb_list); } static void __br_multicast_open(struct net_bridge *br, @@ -2033,40 +1813,20 @@ void br_multicast_stop(struct net_bridge *br) void br_multicast_dev_del(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - int i; + struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - if (!mdb) - goto out; - - br->mdb = NULL; - - ver = mdb->ver; - for (i = 0; i < mdb->max; i++) { - hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], - hlist[ver]) { - del_timer(&mp->timer); - call_rcu_bh(&mp->rcu, br_multicast_free_group); - } + hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) { + del_timer(&mp->timer); + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); + kfree_rcu(mp, rcu); } - - if (mdb->old) { - spin_unlock_bh(&br->multicast_lock); - rcu_barrier_bh(); - spin_lock_bh(&br->multicast_lock); - WARN_ON(mdb->old); - } - - mdb->old = mdb; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: spin_unlock_bh(&br->multicast_lock); + + rcu_barrier(); } int br_multicast_set_router(struct net_bridge *br, unsigned long val) @@ -2176,7 +1936,6 @@ static void br_multicast_start_querier(struct net_bridge *br, int br_multicast_toggle(struct net_bridge *br, unsigned long val) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_port *port; int err = 0; @@ -2192,21 +1951,6 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) if (!netif_running(br->dev)) goto unlock; - mdb = mlock_dereference(br->mdb, br); - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, mdb->max, - br->hash_elasticity); - if (err) - goto rollback; - } - br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port(port); @@ -2271,45 +2015,6 @@ unlock: return 0; } -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) -{ - int err = -EINVAL; - u32 old; - struct net_bridge_mdb_htable *mdb; - - spin_lock_bh(&br->multicast_lock); - if (!is_power_of_2(val)) - goto unlock; - - mdb = mlock_dereference(br->mdb, br); - if (mdb && val < mdb->size) - goto unlock; - - err = 0; - - old = br->hash_max; - br->hash_max = val; - - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br->hash_max = old; - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, br->hash_max, - br->hash_elasticity); - if (err) - goto rollback; - } - -unlock: - spin_unlock_bh(&br->multicast_lock); - - return err; -} - int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) { /* Currently we support only version 2 and 3 */ @@ -2646,3 +2351,13 @@ void br_multicast_get_stats(const struct net_bridge *br, } memcpy(dest, &tdst, sizeof(*dest)); } + +int br_mdb_hash_init(struct net_bridge *br) +{ + return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); +} + +void br_mdb_hash_fini(struct net_bridge *br) +{ + rhashtable_destroy(&br->mdb_hash_tbl); +} diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b1b5e8516724..c9383c470a83 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -671,10 +671,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct 
sk_buff return 0; } - if (data->vlan_tci) { - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; - } + if (data->vlan_proto) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); @@ -740,8 +738,13 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff data = this_cpu_ptr(&brnf_frag_data_storage); - data->vlan_tci = skb->vlan_tci; - data->vlan_proto = skb->vlan_proto; + if (skb_vlan_tag_present(skb)) { + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_proto = 0; + } + data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3345f1984542..935495b93a99 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -525,7 +525,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, - int cmd, struct bridge_vlan_info *vinfo, bool *changed) + int cmd, struct bridge_vlan_info *vinfo, bool *changed, + struct netlink_ext_ack *extack) { bool curr_change; int err = 0; @@ -537,11 +538,11 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, - &curr_change); + &curr_change, extack); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags, - &curr_change); + &curr_change, extack); } if (curr_change) *changed = true; @@ -568,7 +569,8 @@ static int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, - bool *changed) + bool *changed, + struct netlink_ext_ack *extack) { if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) return -EINVAL; @@ -598,7 +600,8 @@ static int br_process_vlan_info(struct net_bridge *br, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed); + err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed, + extack); if (err) break; } @@ -607,13 +610,14 @@ static int br_process_vlan_info(struct net_bridge *br, return err; } - return br_vlan_info(br, p, cmd, vinfo_curr, changed); + return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, - int cmd, bool *changed) + int cmd, bool *changed, + struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; @@ -643,7 +647,8 @@ static int br_afspec(struct net_bridge *br, return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, - &vinfo_last, changed); + &vinfo_last, changed, + extack); if (err) return err; break; @@ -850,7 +855,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) } /* Change state and parameters on port. 
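 *
 * Illustrative note, not part of this patch: br_setlink() below now receives
 * the netlink extended ack and threads it through br_afspec() ->
 * br_process_vlan_info() -> br_vlan_info() -> br_vlan_add()/nbp_vlan_add(),
 * so a rejected VLAN can hand a textual reason back to the RTM_SETLINK caller
 * instead of a bare errno; a failing path can fill it with, for example,
 * NL_SET_ERR_MSG_MOD(extack, "vlan add failed") before returning an error
 * (the message string here is purely illustrative).
 *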
*/ -int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) +int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, + struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; @@ -897,7 +903,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) } if (afspec) - err = br_afspec(br, p, afspec, RTM_SETLINK, &changed); + err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack); if (changed) br_ifinfo_notify(RTM_NEWLINK, br, p); @@ -923,7 +929,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (!p && !(dev->priv_flags & IFF_EBRIDGE)) return -EINVAL; - err = br_afspec(br, p, afspec, RTM_DELLINK, &changed); + err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels @@ -1035,6 +1041,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, + [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_boolopt_multi) }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1103,7 +1111,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); - err = __br_vlan_set_default_pvid(br, defpvid); + err = __br_vlan_set_default_pvid(br, defpvid, extack); if (err) return err; } @@ -1187,19 +1195,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } - if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { - u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); - br->hash_elasticity = val; - } - - if (data[IFLA_BR_MCAST_HASH_MAX]) { - u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); - - err = br_multicast_set_hash_max(br, hash_max); - if (err) - return err; - } + if (data[IFLA_BR_MCAST_HASH_MAX]) + br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); @@ -1296,6 +1297,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } #endif + if (data[IFLA_BR_MULTI_BOOLOPT]) { + struct br_boolopt_multi *bm; + + bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); + err = br_boolopt_multi_toggle(br, bm, extack); + if (err) + return err; + } + return 0; } @@ -1374,6 +1384,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif + nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } @@ -1387,6 +1398,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); + struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); @@ -1403,6 +1415,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return 
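/* Illustrative note, not part of this patch: IFLA_BR_MULTI_BOOLOPT carries a
 * struct br_boolopt_multi which, as I recall the uapi definition (not shown
 * in this excerpt), is an optval/optmask pair of u32 bitmasks indexed by
 * enum br_boolopt_id.  Enabling the new BR_BOOLOPT_NO_LL_LEARN option from
 * userspace would look roughly like:
 *
 *	struct br_boolopt_multi bm = {
 *		.optmask = 1 << BR_BOOLOPT_NO_LL_LEARN,
 *		.optval  = 1 << BR_BOOLOPT_NO_LL_LEARN,
 *	};
 *
 * with optmask selecting which options to change and optval giving their
 * desired state, attached as the IFLA_BR_MULTI_BOOLOPT attribute of an
 * RTM_NEWLINK request for the bridge.  br_boolopt_multi_toggle(), implemented
 * outside this excerpt, presumably walks the bits set in optmask and applies
 * each option with the extack available for per-option error reporting.
 */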
-EMSGSIZE; + br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || @@ -1420,7 +1433,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || - nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || + nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING @@ -1442,8 +1456,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) br_opt_get(br, BROPT_MULTICAST_QUERIER)) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || - nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, - br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_last_member_count) || diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2920e06a5403..d240b3e7919f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_MULTICAST_DEFAULT_HASH_MAX 4096 + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -102,12 +104,18 @@ struct br_tunnel_info { struct metadata_dst *tunnel_dst; }; +/* private vlan flags */ +enum { + BR_VLFLAG_PER_PORT_STATS = BIT(0), +}; + /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags + * @priv_flags: private (in-kernel) bridge vlan flags * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -127,6 +135,7 @@ struct net_bridge_vlan { struct rhash_head tnode; u16 vid; u16 flags; + u16 priv_flags; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; @@ -206,23 +215,14 @@ struct net_bridge_port_group { }; struct net_bridge_mdb_entry { - struct hlist_node hlist[2]; + struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct rcu_head rcu; struct timer_list timer; struct br_ip addr; bool host_joined; -}; - -struct net_bridge_mdb_htable { - struct hlist_head *mhash; - struct rcu_head rcu; - struct net_bridge_mdb_htable *old; - u32 size; - u32 max; - u32 secret; - u32 ver; + struct hlist_node mdb_node; }; struct net_bridge_port { @@ -321,6 +321,7 @@ enum net_bridge_opts { BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, + BROPT_NO_LL_LEARN, }; struct net_bridge { @@ -373,7 +374,6 @@ struct net_bridge { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - u32 hash_elasticity; u32 hash_max; u32 multicast_last_member_count; @@ -392,7 +392,9 @@ struct net_bridge { unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - struct net_bridge_mdb_htable __rcu *mdb; + struct rhashtable mdb_hash_tbl; + + struct hlist_head mdb_list; struct hlist_head router_list; struct timer_list multicast_router_timer; @@ -500,6 +502,14 @@ static inline int br_opt_get(const struct net_bridge *br, return test_bit(opt, &br->options); } +int br_boolopt_toggle(struct net_bridge 
*br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack); +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack); +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); /* br_device.c */ @@ -565,6 +575,9 @@ int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); +int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, + const unsigned char *addr, u16 vid, u32 portid, u32 seq, + struct netlink_ext_ack *extack); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, @@ -643,7 +656,6 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -extern unsigned int br_mdb_rehash_seq; int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, @@ -668,17 +680,15 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); #endif struct net_bridge_mdb_entry * -br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); +br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); struct net_bridge_mdb_entry * -br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group); -void br_multicast_free_pg(struct rcu_head *head); +br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src); -void br_mdb_init(void); -void br_mdb_uninit(void); +int br_mdb_hash_init(struct net_bridge *br); +void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, @@ -690,6 +700,8 @@ void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); +void br_mdb_init(void); +void br_mdb_uninit(void); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -815,6 +827,15 @@ static inline void br_mdb_uninit(void) { } +static inline int br_mdb_hash_init(struct net_bridge *br) +{ + return 0; +} + +static inline void br_mdb_hash_fini(struct net_bridge *br) +{ +} + static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, @@ -850,7 +871,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, - bool *changed); + bool *changed, struct netlink_ext_ack *extack); int 
br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); @@ -863,12 +884,13 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); -int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, + struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed); + bool *changed, struct netlink_ext_ack *extack); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); -int nbp_vlan_init(struct net_bridge_port *port); +int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats); @@ -905,7 +927,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) int err = 0; if (skb_vlan_tag_present(skb)) { - *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; + *vid = skb_vlan_tag_get_id(skb); } else { *vid = 0; err = -EINVAL; @@ -953,7 +975,7 @@ static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, } static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; @@ -978,7 +1000,7 @@ static inline int br_vlan_init(struct net_bridge *br) } static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; @@ -999,7 +1021,8 @@ static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group return NULL; } -static inline int nbp_vlan_init(struct net_bridge_port *port) +static inline int nbp_vlan_init(struct net_bridge_port *port, + struct netlink_ext_ack *extack) { return 0; } @@ -1120,7 +1143,8 @@ int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); -int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); +int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, + struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); @@ -1155,7 +1179,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long mask); void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type); -int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags); +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, + struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) @@ -1187,7 +1212,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, } static inline int br_switchdev_port_vlan_add(struct net_device *dev, - u16 vid, u16 flags) + u16 vid, u16 flags, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git 
a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index b993df770675..035ff59d9cbd 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -140,7 +140,8 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) } } -int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags) +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, + struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, @@ -150,7 +151,7 @@ int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags) .vid_end = vid, }; - return switchdev_port_obj_add(dev, &v.obj); + return switchdev_port_obj_add(dev, &v.obj, extack); } int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 60182bef6341..b05b94e9c595 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -328,6 +328,27 @@ static ssize_t flush_store(struct device *d, } static DEVICE_ATTR_WO(flush); +static ssize_t no_linklocal_learn_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); +} + +static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val) +{ + return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL); +} + +static ssize_t no_linklocal_learn_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_no_linklocal_learn); +} +static DEVICE_ATTR_RW(no_linklocal_learn); + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) @@ -403,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { - struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->hash_elasticity); + return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val) { - br->hash_elasticity = val; + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); return 0; } @@ -428,10 +449,16 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%u\n", br->hash_max); } +static int set_hash_max(struct net_bridge *br, unsigned long val) +{ + br->hash_max = val; + return 0; +} + static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_hash_max); + return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); @@ -841,6 +868,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, + &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 7c87a2fe5248..88715edb119a 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -320,9 +320,6 @@ static ssize_t brport_store(struct kobject *kobj, if (!rtnl_trylock()) return restart_syscall(); - if (!p->dev || !p->br) - goto out_unlock; - if (brport_attr->store_raw) { char *buf_copy; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c 
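/* Illustrative note, not part of this patch: the br_sysfs_br.c hunks above
 * give the new link-local learning control two front ends.  Writing to
 * /sys/class/net/<bridge>/bridge/no_linklocal_learn ends up in
 *
 *	br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL);
 *
 * which is the same option IFLA_BR_MULTI_BOOLOPT toggles over netlink, and
 * which (via br_opt_toggle(), implemented outside this excerpt) presumably
 * sets BROPT_NO_LL_LEARN, the bit the br_input.c change earlier in this patch
 * checks in order to skip FDB learning from link-local frames.
 */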
index 8c9297a01947..4a2f31157ef5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -80,14 +80,14 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, - u16 vid, u16 flags) + u16 vid, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, vid, flags); + err = br_switchdev_port_vlan_add(dev, vid, flags, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, vid); return err; @@ -139,7 +139,9 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, /* Returns a master vlan, if it didn't exist it gets created. In all cases a * a reference is taken to the master vlan before returning. */ -static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) +static struct net_bridge_vlan * +br_vlan_get_master(struct net_bridge *br, u16 vid, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; @@ -150,7 +152,7 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid bool changed; /* missing global ctx, create it now */ - if (br_vlan_add(br, vid, 0, &changed)) + if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) @@ -197,7 +199,7 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ - if (v->brvlan->stats != v->stats) + if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); @@ -214,7 +216,8 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ -static int __vlan_add(struct net_bridge_vlan *v, u16 flags) +static int __vlan_add(struct net_bridge_vlan *v, u16 flags, + struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; @@ -239,7 +242,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). 
*/ - err = __vlan_vid_add(dev, br, v->vid, flags); + err = __vlan_vid_add(dev, br, v->vid, flags, extack); if (err) goto out; @@ -249,12 +252,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, - &changed); + &changed, extack); if (err) goto out_filt; } - masterv = br_vlan_get_master(br, v->vid); + masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) goto out_filt; v->brvlan = masterv; @@ -264,11 +267,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) err = -ENOMEM; goto out_filt; } + v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } } else { - err = br_switchdev_port_vlan_add(dev, v->vid, flags); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err && err != -EOPNOTSUPP) goto out; } @@ -420,7 +424,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, } if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { @@ -493,8 +497,8 @@ static bool __allowed_ingress(const struct net_bridge *br, __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. - * At this point, We know that skb->vlan_tci had - * VLAN_TAG_PRESENT bit and its VID field was 0x000. + * At this point, we know that skb->vlan_tci VID + * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; @@ -590,11 +594,12 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, - u16 flags, bool *changed) + u16 flags, bool *changed, + struct netlink_ext_ack *extack) { int err; - err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags); + err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack); if (err && err != -EOPNOTSUPP) return err; @@ -633,7 +638,8 @@ err_flags: * Must be called with vid in range from 1 to 4094 inclusive. 
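 *
 * Illustrative note, not part of this patch: the __vlan_hwaccel_clear_tag()
 * conversion just above, and the reworded priority-tag comment, reflect the
 * companion VLAN work that stops encoding "tag present" as a bit inside
 * vlan_tci.  As I read it, code should now go through skb_vlan_tag_present(),
 * __vlan_hwaccel_put_tag() and __vlan_hwaccel_clear_tag() rather than writing
 * skb->vlan_tci directly, and use skb_vlan_tag_get_id() when only the VID is
 * wanted, as the br_private.h hunk earlier in this patch does.
 *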
* changed must be true only if the vlan was created or updated */ -int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) +int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; @@ -645,7 +651,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) - return br_vlan_add_existing(br, vg, vlan, flags, changed); + return br_vlan_add_existing(br, vg, vlan, flags, changed, + extack); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) @@ -662,7 +669,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); - ret = __vlan_add(vlan, flags); + ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); @@ -913,7 +920,8 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) br->default_pvid = 0; } -int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, + struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; @@ -945,7 +953,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, - &vlchange); + &vlchange, extack); if (err) goto out; br_vlan_delete(br, old_pvid); @@ -965,7 +973,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &vlchange); + &vlchange, extack); if (err) goto err_port; nbp_vlan_delete(p, old_pvid); @@ -987,7 +995,7 @@ err_port: nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &vlchange); + &vlchange, NULL); nbp_vlan_delete(p, pvid); } @@ -997,7 +1005,7 @@ err_port: BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, - &vlchange); + &vlchange, NULL); br_vlan_delete(br, pvid); } goto out; @@ -1020,7 +1028,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto out; } - err = __br_vlan_set_default_pvid(br, pvid); + err = __br_vlan_set_default_pvid(br, pvid, NULL); out: return err; } @@ -1046,7 +1054,7 @@ int br_vlan_init(struct net_bridge *br) rcu_assign_pointer(br->vlgrp, vg); ret = br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_BRENTRY, &changed); + BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); if (ret) goto err_vlan_add; @@ -1063,7 +1071,7 @@ err_rhtbl: goto out; } -int nbp_vlan_init(struct net_bridge_port *p) +int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, @@ -1096,7 +1104,7 @@ int nbp_vlan_init(struct net_bridge_port *p) ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &changed); + &changed, extack); if (ret) goto err_vlan_add; } @@ -1121,7 +1129,7 @@ err_vlan_enabled: * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; @@ -1132,7 +1140,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, vlan = br_vlan_find(nbp_vlan_group(port), vid); if 
(vlan) { /* Pass the flags to the hardware bridge */ - ret = br_switchdev_port_vlan_add(port->dev, vid, flags); + ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack); if (ret && ret != -EOPNOTSUPP) return ret; *changed = __vlan_add_flags(vlan, flags); @@ -1146,7 +1154,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, vlan->vid = vid; vlan->port = port; - ret = __vlan_add(vlan, flags); + ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else @@ -1216,9 +1224,13 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; ASSERT_RTNL(); - if (netif_is_bridge_master(dev)) + p = br_port_get_check_rtnl(dev); + if (p) + vg = nbp_vlan_group(p); + else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; diff --git a/net/can/raw.c b/net/can/raw.c index 1051eee82581..3aab7664933f 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -745,18 +745,19 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } else ifindex = ro->ifindex; - if (ro->fd_frames) { + dev = dev_get_by_index(sock_net(sk), ifindex); + if (!dev) + return -ENXIO; + + err = -EINVAL; + if (ro->fd_frames && dev->mtu == CANFD_MTU) { if (unlikely(size != CANFD_MTU && size != CAN_MTU)) - return -EINVAL; + goto put_dev; } else { if (unlikely(size != CAN_MTU)) - return -EINVAL; + goto put_dev; } - dev = dev_get_by_index(sock_net(sk), ifindex); - if (!dev) - return -ENXIO; - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57fcc6b4bf6e..2f126eff275d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -580,9 +580,15 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, struct bio_vec bvec; int ret; - /* sendpage cannot properly handle pages with page_count == 0, - * we need to fallback to sendmsg if that's the case */ - if (page_count(page) >= 1) + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. 
+ */ + if (page_count(page) >= 1 && !PageSlab(page)) return __ceph_tcp_sendpage(sock, page, offset, size, more); bvec.bv_page = page; diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..4bf62b1afa3b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -728,49 +728,6 @@ fault: return -EFAULT; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete); - /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff @@ -810,7 +767,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index 0ffcbdd55fa9..1b5a4410be0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "net-sysfs.h" @@ -162,6 +163,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1361,7 +1365,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1377,7 +1381,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1406,7 +1410,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,14 +1421,14 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. 
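 *
 * Illustrative note, not part of this patch: callers with no netlink request
 * context simply pass NULL for the new argument, e.g. dev_open(dev, NULL)
 * (the actual caller conversions are outside this excerpt), while paths that
 * do have one forward it, so that NETDEV_PRE_UP notifiers, which now receive
 * the extack through the notifier info (see call_netdevice_notifiers_extack()
 * below), can attach a human-readable reason when they veto the open.
 *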
*/ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -1585,6 +1590,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1733,6 +1739,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1744,11 +1762,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -2175,6 +2189,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + bool is_rxqs_map) +{ + if (is_rxqs_map) { + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + } + static_key_slow_dec_cpuslocked(&xps_needed); + kfree_rcu(dev_maps, rcu); +} + static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) @@ -2186,18 +2214,15 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); - if (!active) { - if (is_rxqs_map) { - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (!is_rxqs_map) { + for (i = offset + (count - 1); count--; i--) { + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); } - kfree_rcu(dev_maps, rcu); } } @@ -2234,10 +2259,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: - if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - - static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2355,9 +2376,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) - static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } for (j = -1; j = netif_attrmask_next(j, 
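/* Illustrative note, not part of this patch: with the hunk just above, the
 * xps_needed and xps_rxqs_needed static keys are incremented only when a map
 * of the given type is installed for the first time (the !dev_maps check),
 * and the matching decrement now lives in reset_xps_maps() where the map is
 * actually freed, so repeated __netif_set_xps_queue() and
 * netif_reset_xps_queues() calls keep the static-key counts balanced.
 */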
possible_mask, nr_ids), j < nr_ids;) { @@ -2455,13 +2479,8 @@ out_no_new_maps: } /* free map if not active */ - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -3091,10 +3110,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -4520,9 +4546,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4537,6 +4568,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -4889,7 +4921,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5009,7 +5041,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; - list_del(&skb->list); + skb_list_del_init(skb); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; @@ -5165,7 +5197,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - list_del(&skb->list); + skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } @@ -5176,7 +5208,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - list_del(&skb->list); + skb_list_del_init(skb); if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) list_add_tail(&skb->list, &sublist); } @@ -5195,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) if (cpu >= 0) { /* Will be handled, remove from list */ - list_del(&skb->list); + skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } @@ -5222,9 +5254,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5244,9 +5281,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) 
return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5299,6 +5339,8 @@ static void flush_all_backlogs(void) put_online_cpus(); } +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; @@ -5318,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb, 0); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); break; } rcu_read_unlock(); @@ -5357,11 +5401,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); @@ -5386,7 +5432,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5461,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head) napi_gro_complete(oldest); } +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5510,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(gro_head, skb); + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + gro_head, skb); break; } rcu_read_unlock(); @@ -5634,12 +5688,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5652,9 +5711,13 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = 
napi->dev; skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); @@ -5753,6 +5816,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5760,7 +5824,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); @@ -5776,10 +5843,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; @@ -5966,11 +6034,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) timeout = n->dev->gro_flush_timeout; + /* When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer + */ + napi_gro_flush(n, !!timeout); if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - else - napi_gro_flush(n, false); } if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6197,8 +6268,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -7455,7 +7526,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7492,7 +7564,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { @@ -7552,16 +7624,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. 
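
A minimal usage sketch, assuming a caller that already holds RTNL and has a netlink request in hand (both hypothetical here): such callers forward their extended ack so a failed PRE_UP can be explained to userspace, while the converted ioctl and sysfs paths simply pass NULL.

	err = dev_change_flags(dev, dev->flags | IFF_UP, extack);
	if (err < 0)
		return err;	/* extack may carry the notifier's veto reason */
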
*/ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; @@ -7694,13 +7768,36 @@ void dev_set_group(struct net_device *dev, int new_group) EXPORT_SYMBOL(dev_set_group); /** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address + * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -7711,6 +7808,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..a6723b306717 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. 
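
A hedged sketch of the intended caller; the driver, its helpers and its filter-table bookkeeping are hypothetical, only the __hw_addr_ref_sync_dev() calling convention comes from the patch. The third callback argument is the number of references being added or dropped for the address.

	static int example_mc_sync(struct net_device *dev,
				   const unsigned char *addr, int ref_cnt)
	{
		/* add ref_cnt references on addr to the device's filter table */
		return example_hw_add_mc_addr(netdev_priv(dev), addr, ref_cnt);
	}

	static int example_mc_unsync(struct net_device *dev,
				     const unsigned char *addr, int ref_cnt)
	{
		/* drop ref_cnt references; remove the entry when none remain */
		return example_hw_del_mc_addr(netdev_priv(dev), addr, ref_cnt);
	}

	static void example_set_rx_mode(struct net_device *dev)
	{
		/* a failed sync leaves the address unsynced for the next pass;
		 * ndo_set_rx_mode is void, so the error is dropped here
		 */
		__hw_addr_ref_sync_dev(&dev->mc, dev,
				       example_mc_sync, example_mc_unsync);
	}
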
+ **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. 
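
The teardown counterpart, sketched for the same hypothetical driver as above: ndo_stop hands every synced address, together with all references taken on it, back to the unsync callback in one pass.

	static int example_stop(struct net_device *dev)
	{
		__hw_addr_ref_unsync_dev(&dev->mc, dev, example_mc_unsync);
		return 0;
	}
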
+ **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync @@ -401,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..31380fd5a4e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ @@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..f9348806e843 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -467,7 +463,8 @@ static bool 
convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { @@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. 
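	/* A worked example of the split above (illustrative numbers, not from
	 * the patch): the element covers msg offsets [100, 180), i.e. length
	 * 80, and the caller pops len = 30 bytes starting at msg offset 120.
	 * Then a = 120 - 100 = 20 bytes are kept at the front and
	 * b = 80 - 20 - 30 = 30 bytes are kept at the back. With a free ring
	 * slot the element is split in place; otherwise a and b are copied
	 * into a single newly allocated page.
	 */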
+ */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -4825,47 +5010,40 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { 
__be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4882,31 +5060,34 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); - if (netns_id) { + sdif = inet6_sdif(skb); + + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); } if (sk) @@ -4915,6 +5096,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const 
struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5240,6 +5538,8 @@ 
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5436,8 +5738,12 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) return false; break; default: @@ -5465,8 +5771,10 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5490,7 +5798,8 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5505,6 +5814,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5531,7 +5844,9 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5741,6 +6056,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5757,7 +6073,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5959,7 +6275,9 @@ static bool sk_skb_is_valid_access(int off, int 
size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6040,12 +6358,14 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6140,19 +6460,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } break; case offsetof(struct __sk_buff, cb[0]) ... 
@@ -6355,6 +6675,33 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 56d1e9b73142..2e8d91e54179 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1165,8 +1165,8 @@ ip_proto_again: break; } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && + !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..fb4372cb1de1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,21 +118,77 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_update_gc_list(struct neighbour *n) +{ + bool on_gc_list, exempt_from_gc; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list + */ + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; + on_gc_list = !list_empty(&n->gc_list); + + if (exempt_from_gc && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!exempt_from_gc && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } + + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} + +static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + bool rc = false; + u8 ndm_flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return rc; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? 
NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + rc = true; + *notify = 1; + } + + return rc; +} -static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, - struct neighbour __rcu **np, struct neigh_table *tbl) +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && - !(n->flags & flags)) { + if (refcount_read(&n->refcnt) == 1) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -158,7 +214,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, 0, np, tbl); + return neigh_del(n, np, tbl); np = &n->next; } return false; @@ -166,32 +222,29 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + struct neighbour *n, *tmp; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if ((n->nud_state == NUD_FAILED) || + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +313,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. 
We must destroy neighbour entry, @@ -321,13 +373,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (exempt_from_gc) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +397,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +416,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +567,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool exempt_from_gc, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -574,6 +638,9 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } n->dead = 0; + if (!exempt_from_gc) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -591,6 +658,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -854,7 +927,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1137,9 +1210,11 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. 
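
The lifecycle rule introduced above can be read as a single predicate; the helper below is a paraphrase for illustration, not something the patch adds. Entries matching it are kept off tbl->gc_list and are therefore never considered by the rewritten neigh_forced_gc().

	static bool neigh_is_gc_exempt(const struct neighbour *n)
	{
		/* administratively pinned entries are not garbage collected */
		return (n->nud_state & NUD_PERMANENT) ||
		       (n->flags & NTF_EXT_LEARNED);
	}
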
*/ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { + bool ext_learn_change = false; u8 old; int err; int notify = 0; @@ -1155,10 +1230,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } - neigh_update_ext_learned(neigh, flags, ¬ify); + ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1193,8 +1270,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1302,11 +1381,20 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + neigh_update_gc_list(neigh); + if (notify) neigh_update_notify(neigh, nlmsg_pid); return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1571,6 +1659,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1678,8 +1767,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1785,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1804,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1736,6 +1828,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; + u8 protocol = 0; int err; ASSERT_RTNL(); @@ -1744,8 +1837,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,19 +1850,32 @@ static int 
neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (tb[NDA_PROTOCOL]) { + if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) { + NL_SET_ERR_MSG(extack, "Invalid protocol attribute"); + goto out; + } + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + } + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -1775,22 +1883,30 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; + if (protocol) + pn->protocol = protocol; err = 0; } goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { + bool exempt_from_gc; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || + ndm->ndm_flags & NTF_EXT_LEARNED; + neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1817,8 +1933,12 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + + if (protocol) + neigh->protocol = protocol; + neigh_release(neigh); out: @@ -2312,6 +2432,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2343,6 +2466,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2631,7 +2757,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); @@ -2998,7 +3124,8 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ } static void __neigh_notify(struct neighbour *n, int type, int flags, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + 
return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..05b23b285058 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -735,23 +736,38 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; @@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: 
nlmsg_free(msg); out: + if (fillargs.add_ref) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { - struct net *net; + struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -842,33 +886,96 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, + .tgt_net = sock_net(skb->sk), .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? 
err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..36a2b63ffd6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 86f2d9cbdae3..baf2685b4da2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa, extack); kfree(sa); if (err) goto errout; @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } @@ -2971,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3040,195 +3046,200 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + 
return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = 
rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ops->setup) + return -EOPNOTSUPP; - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. 
+ */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3449,6 +3460,18 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, new_nsid, new_ifindex); } +static const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, +}; + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -3808,6 +3831,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) @@ -4007,6 +4033,160 @@ out: return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + 
nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { @@ -4318,7 +4498,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -4330,7 +4511,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, - flags); + flags, + extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; @@ -5065,7 +5247,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); - 
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b4ee5c8b928f..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } @@ -1925,8 +1932,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -2366,19 +2371,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer @@ -2645,6 +2637,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. 
The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( @@ -2962,28 +3013,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -4854,6 +4883,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset(skb); nf_reset_trace(skb); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; +#endif + if (!xnet) return; @@ -5123,7 +5157,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 6d7e189e3cd9..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -700,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -1018,7 +1019,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } 
+EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 658cd32bb7b3..be0b223aa862 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1141,6 +1141,9 @@ static int __init dccp_init(void) goto out_fail; rc = -ENOBUFS; inet_hashinfo_init(&dccp_hashinfo); + inet_hashinfo2_init(&dccp_hashinfo, "dccp_listen_portaddr_hash", + INET_LHTABLE_SIZE, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2c..7aab5d088c72 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -192,7 +192,7 @@ static int check_port(__le16 port) static unsigned short port_alloc(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); -static unsigned short port = 0x2000; + static unsigned short port = 0x2000; unsigned short i_port = port; while(check_port(cpu_to_le16(++port)) != 0) { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 48c41918fb35..91e52973ee13 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -44,6 +44,10 @@ config NET_DSA_TAG_GSWIP config NET_DSA_TAG_KSZ bool +config NET_DSA_TAG_KSZ9477 + bool + select NET_DSA_TAG_KSZ + config NET_DSA_TAG_LAN9303 bool diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a69c1790bbfc..aee909bcddc4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -55,8 +55,8 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_GSWIP [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_KSZ - [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_KSZ9477 + [DSA_TAG_PROTO_KSZ9477] = &ksz9477_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_LAN9303 [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, @@ -91,8 +91,8 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) #ifdef CONFIG_NET_DSA_TAG_GSWIP [DSA_TAG_PROTO_GSWIP] = "gswip", #endif -#ifdef CONFIG_NET_DSA_TAG_KSZ - [DSA_TAG_PROTO_KSZ] = "ksz", +#ifdef CONFIG_NET_DSA_TAG_KSZ9477 + [DSA_TAG_PROTO_KSZ9477] = "ksz9477", #endif #ifdef CONFIG_NET_DSA_TAG_LAN9303 [DSA_TAG_PROTO_LAN9303] = "lan9303", diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9e4fd04ab53c..026a05774bf7 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -210,7 +210,7 @@ extern const struct dsa_device_ops edsa_netdev_ops; extern const struct dsa_device_ops gswip_netdev_ops; /* tag_ksz.c */ -extern const struct dsa_device_ops ksz_netdev_ops; +extern const struct dsa_device_ops ksz9477_netdev_ops; /* tag_lan9303.c */ extern const struct dsa_device_ops lan9303_netdev_ops; diff --git a/net/dsa/master.c b/net/dsa/master.c index c90ee3227dea..71bb15f491c8 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -158,8 +158,59 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + + return 
sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + +static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp) +{ + unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + int err; + + rtnl_lock(); + if (mtu <= dev->max_mtu) { + err = dev_set_mtu(dev, mtu); + if (err) + netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n"); + } + rtnl_unlock(); +} + +static void dsa_master_reset_mtu(struct net_device *dev) +{ + int err; + + rtnl_lock(); + err = dev_set_mtu(dev, ETH_DATA_LEN); + if (err) + netdev_dbg(dev, + "Unable to reset MTU to exclude DSA overheads\n"); + rtnl_unlock(); +} + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + int ret; + + dsa_master_set_mtu(dev, cpu_dp); + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -168,12 +219,22 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) dev->dsa_ptr = cpu_dp; - return dsa_master_ethtool_setup(dev); + ret = dsa_master_ethtool_setup(dev); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); + if (ret) + dsa_master_ethtool_teardown(dev); + + return ret; } void dsa_master_teardown(struct net_device *dev) { + sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_master_ethtool_teardown(dev); + dsa_master_reset_mtu(dev); dev->dsa_ptr = NULL; diff --git a/net/dsa/port.c b/net/dsa/port.c index ed0595459df1..2d7e01b23572 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -252,9 +252,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - if (netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - if (br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7d0c19e7edcf..a3fcc1d01615 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1050,35 +1050,12 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, - .switchdev_port_obj_add = dsa_slave_port_obj_add, - .switchdev_port_obj_del = dsa_slave_port_obj_del, }; static struct device_type dsa_type = { .name = "dsa", }; -static ssize_t tagging_show(struct device *d, struct device_attribute *attr, - char *buf) -{ - struct net_device *dev = to_net_dev(d); - struct dsa_port *dp = dsa_slave_to_port(dev); - - return sprintf(buf, "%s\n", - dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); -} -static DEVICE_ATTR_RO(tagging); - -static struct attribute *dsa_slave_attrs[] = { - &dev_attr_tagging.attr, - NULL -}; - -static const struct attribute_group dsa_group = { - .name = "dsa", - .attrs = dsa_slave_attrs, -}; - static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1374,14 +1351,8 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } - ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); - if (ret) - goto out_unreg; - return 0; -out_unreg: - unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1405,7 +1376,6 @@ void 
dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); - sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); @@ -1557,6 +1527,44 @@ err_fdb_work_init: return NOTIFY_BAD; } +static int +dsa_slave_switchdev_port_obj_event(unsigned long event, + struct net_device *netdev, + struct switchdev_notifier_port_obj_info *port_obj_info) +{ + int err = -EOPNOTSUPP; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: + err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, + port_obj_info->trans); + break; + case SWITCHDEV_PORT_OBJ_DEL: + err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); + break; + } + + port_obj_info->handled = true; + return notifier_from_errno(err); +} + +static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + + if (!dsa_slave_dev_check(dev)) + return NOTIFY_DONE; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_DEL: + return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + } + + return NOTIFY_DONE; +} + static struct notifier_block dsa_slave_nb __read_mostly = { .notifier_call = dsa_slave_netdevice_event, }; @@ -1565,8 +1573,13 @@ static struct notifier_block dsa_slave_switchdev_notifier = { .notifier_call = dsa_slave_switchdev_event, }; +static struct notifier_block dsa_slave_switchdev_blocking_notifier = { + .notifier_call = dsa_slave_switchdev_blocking_event, +}; + int dsa_slave_register_notifier(void) { + struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_slave_nb); @@ -1577,8 +1590,15 @@ int dsa_slave_register_notifier(void) if (err) goto err_switchdev_nb; + nb = &dsa_slave_switchdev_blocking_notifier; + err = register_switchdev_blocking_notifier(nb); + if (err) + goto err_switchdev_blocking_nb; + return 0; +err_switchdev_blocking_nb: + unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_slave_nb); return err; @@ -1586,8 +1606,14 @@ err_switchdev_nb: void dsa_slave_unregister_notifier(void) { + struct notifier_block *nb; int err; + nb = &dsa_slave_switchdev_blocking_notifier; + err = unregister_switchdev_blocking_notifier(nb); + if (err) + pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); + err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2b06bb91318b..4aa1d368a5ae 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -174,6 +174,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, + .overhead = BRCM_TAG_LEN, }; #endif @@ -196,5 +197,6 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, const struct dsa_device_ops brcm_prepend_netdev_ops = { .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, + .overhead = BRCM_TAG_LEN, }; #endif diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cd13cfc542ce..8b2f92e3f3a2 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -149,4 +149,5 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops dsa_netdev_ops = { .xmit = dsa_xmit, .rcv = dsa_rcv, 
+ .overhead = DSA_HLEN, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 4083326b806e..f5b87ee5c94e 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -168,4 +168,5 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops edsa_netdev_ops = { .xmit = edsa_xmit, .rcv = edsa_rcv, + .overhead = EDSA_HLEN, }; diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 49e9b73f1be3..cb6f82ffe5eb 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -106,4 +106,5 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, const struct dsa_device_ops gswip_netdev_ops = { .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, + .overhead = GSWIP_RX_HEADER_LEN, }; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0f62effad88f..da71b9e2af52 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -14,34 +14,18 @@ #include <net/dsa.h> #include "dsa_priv.h" -/* For Ingress (Host -> KSZ), 2 bytes are added before FCS. - * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) - * --------------------------------------------------------------------------- - * tag0 : Prioritization (not used now) - * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) - * - * For Egress (KSZ -> Host), 1 byte is added before FCS. - * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) - * --------------------------------------------------------------------------- - * tag0 : zero-based value represents port - * (eg, 0x00=port1, 0x02=port3, 0x06=port7) - */ - -#define KSZ_INGRESS_TAG_LEN 2 -#define KSZ_EGRESS_TAG_LEN 1 +/* Typically only one byte is used for tail tag. */ +#define KSZ_EGRESS_TAG_LEN 1 -static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *ksz_common_xmit(struct sk_buff *skb, + struct net_device *dev, int len) { - struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; - u8 *tag; padlen = (skb->len >= ETH_ZLEN) ? 
0 : ETH_ZLEN - skb->len; - if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + if (skb_tailroom(skb) >= padlen + len) { /* Let dsa_slave_xmit() free skb */ if (__skb_put_padto(skb, skb->len + padlen, false)) return NULL; @@ -49,7 +33,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) nskb = skb; } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + - padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC); + padlen + len, GFP_ATOMIC); if (!nskb) return NULL; skb_reserve(nskb, NET_IP_ALIGN); @@ -70,33 +54,88 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); } - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - tag[0] = 0; - tag[1] = 1 << dp->index; /* destination port */ - return nskb; } -static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, + struct net_device *dev, + unsigned int port, unsigned int len) { - u8 *tag; - int source_port; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; - tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + pskb_trim_rcsum(skb, skb->len - len); - source_port = tag[0] & 7; + return skb; +} - skb->dev = dsa_master_find_slave(dev, 0, source_port); - if (!skb->dev) +/* + * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : Prioritization (not used now) + * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) + * + * For Egress (KSZ9477 -> Host), 1 byte is added before FCS. 
+ * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + */ + +#define KSZ9477_INGRESS_TAG_LEN 2 +#define KSZ9477_PTP_TAG_LEN 4 +#define KSZ9477_PTP_TAG_INDICATION 0x80 + +#define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) +#define KSZ9477_TAIL_TAG_LOOKUP BIT(10) + +static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct sk_buff *nskb; + u16 *tag; + u8 *addr; + + nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); + if (!nskb) return NULL; - pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); + /* Tag encoding */ + tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); + addr = skb_mac_header(nskb); - return skb; + *tag = BIT(dp->index); + + if (is_link_local_ether_addr(addr)) + *tag |= KSZ9477_TAIL_TAG_OVERRIDE; + + *tag = cpu_to_be16(*tag); + + return nskb; +} + +static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + /* Tag decoding */ + u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + unsigned int port = tag[0] & 7; + unsigned int len = KSZ_EGRESS_TAG_LEN; + + /* Extra 4-bytes PTP timestamp */ + if (tag[0] & KSZ9477_PTP_TAG_INDICATION) + len += KSZ9477_PTP_TAG_LEN; + + return ksz_common_rcv(skb, dev, port, len); } -const struct dsa_device_ops ksz_netdev_ops = { - .xmit = ksz_xmit, - .rcv = ksz_rcv, +const struct dsa_device_ops ksz9477_netdev_ops = { + .xmit = ksz9477_xmit, + .rcv = ksz9477_rcv, + .overhead = KSZ9477_INGRESS_TAG_LEN, }; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 548c00254c07..f48889e46ff7 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -140,4 +140,5 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops lan9303_netdev_ops = { .xmit = lan9303_xmit, .rcv = lan9303_rcv, + .overhead = LAN9303_TAG_LEN, }; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 11535bc70743..f39f4dfeda34 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -109,4 +109,5 @@ const struct dsa_device_ops mtk_netdev_ops = { .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, .flow_dissect = mtk_tag_flow_dissect, + .overhead = MTK_HDR_LEN, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 613f4ee97771..ed4f6dc26365 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -101,4 +101,5 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops qca_netdev_ops = { .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, + .overhead = QCA_HDR_LEN, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 56197f0d9608..b40756ed6e57 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -84,4 +84,5 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops trailer_netdev_ops = { .xmit = trailer_xmit, .rcv = trailer_rcv, + .overhead = 4, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index fd8faa0dfa61..4c520110b04f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -47,6 +47,7 @@ #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> +#include <linux/nvmem-consumer.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> @@ -165,15 
+166,17 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; + if (unlikely(!ether_addr_equal_64bits(eth->h_dest, + dev->dev_addr))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { + if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + skb->pkt_type = PACKET_OTHERHOST; + } } - else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) - skb->pkt_type = PACKET_OTHERHOST; /* * Some variants of DSA tagging don't have an ethertype field @@ -548,3 +551,40 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); + +/** + * Obtain the MAC address from an nvmem cell named 'mac-address' associated + * with given device. + * + * @dev: Device with which the mac-address cell is associated. + * @addrbuf: Buffer to which the MAC address will be copied on success. + * + * Returns 0 on success or a negative error number on failure. + */ +int nvmem_get_mac_address(struct device *dev, void *addrbuf) +{ + struct nvmem_cell *cell; + const void *mac; + size_t len; + + cell = nvmem_cell_get(dev, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + ether_addr_copy(addrbuf, mac); + kfree(mac); + + return 0; +} +EXPORT_SYMBOL(nvmem_get_mac_address); diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index b231e40f006a..0c25c0bcc4da 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -242,7 +242,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); - rc = dev_set_mac_address(dev, &addr); + rc = dev_set_mac_address(dev, &addr, NULL); rtnl_unlock(); if (rc) goto dev_unregister; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 326c422c22f8..0dfb72c46671 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1385,6 +1385,10 @@ out: } EXPORT_SYMBOL(inet_gso_segment); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, + struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, + ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } +INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); 
@@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) * because any hdr with option will have been flushed in * inet_gro_receive(). */ - err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); + err = INDIRECT_CALL_2(ops->callbacks.gro_complete, + tcp4_gro_complete, udp4_gro_complete, + skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a34602ae27de..5b9b6d497f71 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1100,7 +1100,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr->ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 0d0ad19ecb87..0c9f171fb085 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1061,6 +1061,13 @@ static int gue_err(struct sk_buff *skb, u32 info) if (validate_gue_flags(guehdr, optlen)) return -EINVAL; + /* Handling exceptions for direct UDP encapsulation in GUE would lead to + * recursion. Besides, this kind of encapsulation can't even be + * configured currently. Discard this. + */ + if (guehdr->proto_ctype == IPPROTO_UDP) + return -EOPNOTSUPP; + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bcb11f3a27c0..760a9e52e02b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -178,21 +178,22 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - void *arg) + void *arg, + struct inet_frag_queue **prev) { struct inet_frags *f = nf->f; struct inet_frag_queue *q; - int err; q = inet_frag_alloc(nf, f, arg); - if (!q) + if (!q) { + *prev = ERR_PTR(-ENOMEM); return NULL; - + } mod_timer(&q->timer, jiffies + nf->timeout); - err = rhashtable_insert_fast(&nf->rhashtable, &q->node, - f->rhash_params); - if (err < 0) { + *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); @@ -204,22 +205,22 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_queue *fq; + struct inet_frag_queue *fq = NULL, *prev; if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; rcu_read_lock(); - fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); - if (fq) { + prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (!prev) + fq = inet_frag_create(nf, key, &prev); + if (prev && !IS_ERR(prev)) { + fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; - rcu_read_unlock(); - return fq; } rcu_read_unlock(); - - return inet_frag_create(nf, key); + return fq; } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 13890d5bfc34..2445614de6a7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -234,24 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; - struct inet_sock *inet = 
inet_sk(sk); - bool dev_match; - if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && + if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { - __be32 rcv_saddr = inet->inet_rcv_saddr; - score = sk->sk_family == PF_INET ? 2 : 1; - if (rcv_saddr) { - if (rcv_saddr != daddr) - return -1; - score += 4; - } - dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, - dif, sdif); - if (!dev_match) + if (sk->sk_rcv_saddr != daddr) + return -1; + + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score += 4; + score = sk->sk_family == PF_INET ? 2 : 1; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -307,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - unsigned int hash = inet_lhashfn(net, hnum); - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; - struct sock *sk, *result = NULL; - int score, hiscore = 0; + struct sock *result = NULL; unsigned int hash2; - u32 phash = 0; - - if (ilb->count <= 10 || !hashinfo->lhash2) - goto port_lookup; - - /* Too many sk in the ilb bucket (which is hashed by port alone). - * Try lhash2 (which is hashed by port and addr) instead. - */ hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, @@ -335,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net, goto done; /* Lookup lhash2 with INADDR_ANY */ - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, + saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); - goto done; - -port_lookup: - sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); - if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - goto done; - } - result = sk; - hiscore = score; - } - } done: if (unlikely(IS_ERR(result))) return NULL; @@ -829,6 +785,7 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, h->lhash2[i].count = 0; } } +EXPORT_SYMBOL_GPL(inet_hashinfo2_init); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 32662e9e5d21..06ee4696703c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + if (unlikely(opt->optlen)) ip_forward_options(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d6ee343fdb86..aa0b22697998 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -515,6 +515,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct rb_node *rbn; int len; int ihlen; + int delta; int err; u8 ecn; @@ -556,10 +557,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff 
*skb, if (len > 65535) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; + delta += head->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 76a9a5f7a40e..c7a7bd58a23c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1341,12 +1341,6 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } -bool is_gretap_dev(const struct net_device *dev) -{ - return dev->netdev_ops == &gre_tap_netdev_ops; -} -EXPORT_SYMBOL_GPL(is_gretap_dev); - static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 72250b4e466d..26921f6b3b92 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -546,7 +546,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -593,7 +593,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c09219e7f230..ab6618036afe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + cork->length += length; /* So, what's going on in the loop below? 
@@ -939,7 +954,7 @@ static int __ip_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; @@ -956,6 +971,7 @@ alloc_new_skb: if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) @@ -1000,12 +1016,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes. */ @@ -1038,6 +1048,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1067,7 +1084,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1097,6 +1114,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1109,6 +1130,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index dde671e97829..c857ec6b9784 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -80,7 +80,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 
0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; @@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, } skb_clear_hash_if_not_l4(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4c..208a5b4419c6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) continue; - if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); } @@ -238,7 +238,7 @@ static int __init ic_open_devs(void) if (ic_proto_enabled && !able) continue; oflags = dev->flags; - if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) { pr_err("IP-Config: Failed to open %s\n", dev->name); continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void) dev = d->dev; if (d != ic_dev && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); - dev_change_flags(dev, d->flags); + dev_change_flags(dev, d->flags, NULL); } kfree(d); } @@ -1361,18 +1361,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v) } return 0; } - -static int ntp_servers_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ntp_servers_seq_show, NULL); -} - -static const struct file_operations ntp_servers_seq_fops = { - .open = ntp_servers_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq); #endif /* CONFIG_PROC_FS */ /* diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..75c654924532 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -506,7 +506,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); } @@ -589,7 +589,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; @@ -1820,8 +1820,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - int in_vifi, struct sk_buff *skb, - struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -2027,7 +2026,7 @@ forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, - skb2, c, psend); + skb2, psend); } psend = ct; } @@ -2039,9 +2038,9 @@ last_forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, - c, psend); + psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); return; } } 
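
The ipconfig.c hunk above drops the hand-rolled ntp_servers_seq_open()/ntp_servers_seq_fops boilerplate in favour of DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq). As a rough sketch (not a verbatim copy of <linux/seq_file.h>), the macro expands to essentially the same single_open()-based pattern that was deleted, generating ntp_servers_seq_open() and ntp_servers_seq_fops from the existing ntp_servers_seq_show():

/* Approximate expansion of DEFINE_SHOW_ATTRIBUTE(); the real macro also
 * sets .owner = THIS_MODULE and passes inode->i_private to single_open(),
 * where the removed open helper passed NULL, so treat this as illustrative.
 */
#define DEFINE_SHOW_ATTRIBUTE(__name)                                     \
static int __name ## _open(struct inode *inode, struct file *file)       \
{                                                                         \
        return single_open(file, __name ## _show, inode->i_private);     \
}                                                                         \
                                                                          \
static const struct file_operations __name ## _fops = {                  \
        .owner   = THIS_MODULE,                                           \
        .open    = __name ## _open,                                       \
        .read    = seq_read,                                              \
        .llseek  = seq_lseek,                                             \
        .release = single_release,                                        \
}

Callers keep referencing ntp_servers_seq_fops exactly as before; only the open/fops boilerplate moves into the macro.
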
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index ce1512b02cb2..fd3f9e8a74da 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -81,9 +81,12 @@ static int __init masquerade_tg_init(void) int ret; ret = xt_register_target(&masquerade_tg_reg); + if (ret) + return ret; - if (ret == 0) - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + xt_unregister_target(&masquerade_tg_reg); return ret; } diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index a9d5e013e555..41327bb99093 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -147,28 +147,50 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv4_register_notifier(void) +int nf_nat_masquerade_ipv4_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier was already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); + ret = register_inetaddr_notifier(&masq_inet_notifier); + if (ret) + goto err_unregister; + + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); void nf_nat_masquerade_ipv4_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f1193e1e928a..6847de1d1db8 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -69,7 +69,9 @@ static int __init nft_masq_ipv4_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv4_type); return ret; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a670..c3610b37bb4c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c 
index fb1f02015a15..076f51646d26 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1132,6 +1132,7 @@ void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } +#endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { @@ -1156,4 +1157,3 @@ void __init raw_init(void) if (register_pernet_subsys(&raw_sysctl_ops)) panic("RAW: failed to init sysctl parameters.\n"); } -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c0a9d26c06ce..c4ddbc5f01fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev, print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), - dev->hard_header_len, true); + dev->hard_header_len, false); } } #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e6bc4d6daa7..27e2f6837062 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } continue; - found_ok_skb: +found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -2147,7 +2147,7 @@ skip_copy: sk_eat_skb(sk, skb); continue; - found_fin_ok: +found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) @@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state) * socket sitting in hash tables. */ inet_sk_state_store(sk, state); - -#ifdef STATE_TRACE - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif } EXPORT_SYMBOL_GPL(tcp_set_state); @@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ 0; } @@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); return stats; } @@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) + if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); + if (tcp_md5sig_pool_populated) + static_key_slow_inc(&tcp_md5_needed); + } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 9277abdd822a..0f497fc49c3f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; -/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. 
+ */ static const int bbr_pacing_margin_percent = 1; /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain @@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); } -/* Pace using current bw estimate and a gain factor. In order to help drive the - * network toward lower queues while maintaining high utilization and low - * latency, the average pacing rate aims to be slightly (~1%) lower than the - * estimated bandwidth. This is an important aspect of the design. In this - * implementation this slightly lower pacing rate is achieved implicitly by not - * including link-layer headers in the packet size used for the pacing rate. - */ +/* Pace using current bw estimate and a gain factor. */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..a47c1cdf90fc 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. + */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +361,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2868ef28ce52..76858b14ebe9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + tcp_rcv_rtt_update(tp, delta_us, 0); + } } } @@ -1863,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack) { - struct tcp_sock *tp = tcp_sk(sk); - u32 prior_sacked = tp->sacked_out; + if (num_dupack) { + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + s32 delivered; - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - if (tp->sacked_out > prior_sacked) - tp->delivered++; /* Some out-of-order packet is delivered */ - tcp_verify_left_out(tp); + tp->sacked_out += num_dupack; + tcp_check_reno_reordering(sk, 0); + delivered = tp->sacked_out - prior_sacked; + if (delivered > 0) + tp->delivered += delivered; + tcp_verify_left_out(tp); + } } /* Account for ACK, ACKing some data in Reno Recovery phase. 
*/ @@ -2457,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && - !(flag & FLAG_LOST_RETRANS)) { + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == + FLAG_RETRANS_DATA_ACKED) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -2634,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); @@ -2653,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, return; if (after(tp->snd_nxt, tp->high_seq)) { - if (flag & FLAG_DATA_SACKED || is_dupack) + if (flag & FLAG_DATA_SACKED || num_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; @@ -2679,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. */ - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) - tcp_add_reno_sack(sk); + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) + tcp_add_reno_sack(sk, num_dupack); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2757,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk) * tcp_xmit_retransmit_queue(). 
*/ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, - bool is_dupack, int *ack_flag, int *rexmit) + int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -2810,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (tcp_is_reno(tp) && is_dupack) - tcp_add_reno_sack(sk); + if (tcp_is_reno(tp)) + tcp_add_reno_sack(sk, num_dupack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2826,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack, rexmit); + tcp_process_loss(sk, flag, num_dupack, rexmit); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -2837,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - if (is_dupack) - tcp_add_reno_sack(sk); + tcp_add_reno_sack(sk, num_dupack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2910,9 +2915,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - seq_rtt_us = ca_rtt_us = delta_us; + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + ca_rtt_us = seq_rtt_us; + } } rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) @@ -3558,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; - bool is_dupack = false; + int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; @@ -3610,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == + FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. 
@@ -3668,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + num_dupack = 1; + /* Consider if pure acks were aggregated in tcp_add_backlog() */ + if (!(flag & FLAG_DATA)) + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + } + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } @@ -3687,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); } @@ -3712,7 +3725,7 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); @@ -4268,7 +4281,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; @@ -4363,6 +4376,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; to->tstamp = from->tstamp; + skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; } return true; @@ -4601,13 +4615,12 @@ end: } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); - __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; @@ -4658,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { + if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4718,7 +4731,7 @@ queue_and_out: goto drop; } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5188,7 +5201,17 @@ send_now: if (!tcp_is_sack(tp) || tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) goto send_now; - tp->compressed_ack++; + + if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { + tp->compressed_ack_rcv_nxt = tp->rcv_nxt; + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = 0; + } + + if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + goto send_now; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; @@ -5584,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); + __skb_pull(skb, tcp_header_len); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a336787d75e5..efc6fef692ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -542,7 +542,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_rtx_queue_head(sk); - BUG_ON(!skb); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -971,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ +struct static_key tcp_md5_needed __read_mostly; +EXPORT_SYMBOL(tcp_md5_needed); + /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, + const union tcp_md5_addr *addr, + int family) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1012,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, } return best_match; } -EXPORT_SYMBOL(tcp_md5_do_lookup); +EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1620,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - - /* Only socket owner can try to collapse/prune rx queues - * to reduce memory overhead, so add a little headroom here. - * Few sockets backlog are possibly concurrently non empty. - */ - limit += 64*1024; + struct skb_shared_info *shinfo; + const struct tcphdr *th; + struct tcphdr *thtail; + struct sk_buff *tail; + unsigned int hdrlen; + bool fragstolen; + u32 gso_segs; + int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. 
@@ -1635,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ skb_condense(skb); + skb_dst_drop(skb); + + if (unlikely(tcp_checksum_complete(skb))) { + bh_unlock_sock(sk); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + return true; + } + + /* Attempt coalescing to last skb in backlog, even if we are + * above the limits. + * This is okay because skb capacity is limited to MAX_SKB_FRAGS. + */ + th = (const struct tcphdr *)skb->data; + hdrlen = th->doff * 4; + shinfo = skb_shinfo(skb); + + if (!shinfo->gso_size) + shinfo->gso_size = skb->len - hdrlen; + + if (!shinfo->gso_segs) + shinfo->gso_segs = 1; + + tail = sk->sk_backlog.tail; + if (!tail) + goto no_coalesce; + thtail = (struct tcphdr *)tail->data; + + if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || + TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || + ((TCP_SKB_CB(tail)->tcp_flags | + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + ((TCP_SKB_CB(tail)->tcp_flags ^ + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || +#ifdef CONFIG_TLS_DEVICE + tail->decrypted != skb->decrypted || +#endif + thtail->doff != th->doff || + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + goto no_coalesce; + + __skb_pull(skb, hdrlen); + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + thtail->window = th->window; + + TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) + TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + + TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + TCP_SKB_CB(tail)->has_rxtstamp = true; + tail->tstamp = skb->tstamp; + skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + + /* Not as strict as GRO. We only need to carry mss max value */ + skb_shinfo(tail)->gso_size = max(shinfo->gso_size, + skb_shinfo(tail)->gso_size); + + gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; + skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + + sk->sk_backlog.len += delta; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPBACKLOGCOALESCE); + kfree_skb_partial(skb, fragstolen); + return false; + } + __skb_push(skb, hdrlen); + +no_coalesce: + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); @@ -2575,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. 
*/ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of four TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* Default TSQ limit of 16 TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 870b0a335061..0fbf7d4df9da 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -10,6 +10,7 @@ * TCPv4 GSO/GRO support */ +#include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/tcp.h> #include <net/protocol.h> @@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && @@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff * return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb, int thoff) +INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9c34b97d365d..730bc44dbad9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -180,10 +180,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack)) { + if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack); - tp->compressed_ack = 0; + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = TCP_FASTRETRANS_THRESH; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } @@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - (*rcv_wscale) = 0; + *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { - space >>= 1; - (*rcv_wscale)++; - } + *rcv_wscale = clamp_t(int, ilog2(space) - 15, + 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ 
-1904,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited, u32 max_segs) + bool *is_cwnd_limited, + bool *is_rwnd_limited, + u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 age, send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; - - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - goto send_now; + s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer - * only if the last write was recent. + * only if the last write was recent (1 ms). + * Note that tp->tcp_wstamp_ns can be in the future if we have + * packets waiting in a qdisc or device for EDT delivery. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) + delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; + if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1968,15 +1971,33 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); + delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ - if (age < (tp->srtt_us >> 4)) + if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; - /* Ok, it looks like it is advisable to defer. */ + /* Ok, it looks like it is advisable to defer. + * Three cases are tracked : + * 1) We are cwnd-limited + * 2) We are rwnd-limited + * 3) We are application limited. + */ + if (cong_win < send_win) { + if (cong_win <= skb->len) { + *is_cwnd_limited = true; + return true; + } + } else { + if (send_win <= skb->len) { + *is_rwnd_limited = true; + return true; + } + } - if (cong_win < send_win && cong_win <= skb->len) - *is_cwnd_limited = true; + /* If this packet won't get more data, do not wait. */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || + TCP_SKB_CB(skb)->eor) + goto send_now; return true; @@ -2212,8 +2233,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + if (sk->sk_pacing_status == SK_PACING_NONE) + limit = min_t(unsigned long, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { @@ -2356,7 +2378,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) + &is_rwnd_limited, max_segs)) break; } @@ -2494,15 +2516,18 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); + if (unlikely(!skb)) { + WARN_ONCE(tp->packets_out, + "invalid inflight: %u state %u cwnd %u mss %d\n", + tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + inet_csk(sk)->icsk_pending = 0; + return; + } /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) goto rearm_timer; - /* Retransmit last segment. 
*/ - if (WARN_ON(!skb)) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2920,7 +2945,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 676020663ce8..f87dbc78b6bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; + s32 remaining; start_ts = tcp_retransmit_stamp(sk); if (!icsk->icsk_user_timeout || !start_ts) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - if (elapsed >= icsk->icsk_user_timeout) + remaining = icsk->icsk_user_timeout - elapsed; + if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ - else - return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); + + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } /** @@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk, (boundary - linear_backoff_thresh) * TCP_RTO_MAX; timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; + return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ @@ -376,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ @@ -482,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk) goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -501,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); @@ -740,7 +742,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6f8890c5bc7e..3fb0ed5e4789 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -380,15 +380,12 @@ static int compute_score(struct sock *sk, struct net *net, ipv6_only_sock(sk)) return -1; - score = (sk->sk_family == PF_INET) ? 2 : 1; - inet = inet_sk(sk); + if (sk->sk_rcv_saddr != daddr) + return -1; - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 
2 : 1; + inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; @@ -464,65 +461,30 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *sk, *result; + struct sock *result; unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); - struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + unsigned int hash2, slot2; + struct udp_hslot *hslot2; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness; - u32 hash = 0; - if (hslot->count > 10) { - hash2 = ipv4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, + exact_dif, hslot2, skb); + if (!result) { + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, + htonl(INADDR_ANY), hnum, dif, sdif, exact_dif, hslot2, skb); - if (!result) { - unsigned int old_slot2 = slot2; - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - /* avoid searching the same slot again. */ - if (unlikely(slot2 == old_slot2)) - return result; - - hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; - - result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); - } - if (unlikely(IS_ERR(result))) - return NULL; - return result; - } -begin: - result = NULL; - badness = 0; - sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); - if (score > badness) { - if (sk->sk_reuseport) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (unlikely(IS_ERR(result))) - return NULL; - if (result) - return result; - } - result = sk; - badness = score; - } } + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -587,7 +549,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { - static_branch_enable(&udp_encap_needed_key); + static_branch_inc(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); @@ -2524,7 +2486,7 @@ void udp_destroy_sock(struct sock *sk) encap_destroy(sk); } if (up->encap_enabled) - static_branch_disable(&udp_encap_needed_key); + static_branch_dec(&udp_encap_needed_key); } } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0646d61f4fa8..64f9715173ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -13,6 +13,7 @@ #include <linux/skbuff.h> #include <net/udp.h> #include <net/protocol.h> +#include <net/inet_common.h> static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -391,6 +392,8 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } +INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, 
udp_lookup_t lookup) { @@ -402,7 +405,8 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct sock *sk; rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); + sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, + udp4_lib_lookup_skb, skb, uh->source, uh->dest); if (!sk) goto out_unlock; @@ -451,8 +455,8 @@ out_unlock: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff *udp4_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); @@ -502,7 +506,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, uh->len = newlen; rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); + sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, + udp4_lib_lookup_skb, skb, uh->source, uh->dest); if (sk && udp_sk(sk)->gro_enabled) { err = udp_gro_complete_segment(skb); } else if (sk && udp_sk(sk)->gro_complete) { @@ -525,7 +530,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, } EXPORT_SYMBOL(udp_gro_complete); -static int udp4_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) { const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index d0c412fc56ad..be8b5b2157d8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } + udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 63a808d5af15..521e471f1cf9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -179,7 +179,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, bool send_na); -static void addrconf_dad_run(struct inet6_dev *idev); +static void addrconf_dad_run(struct inet6_dev *idev, bool restart); static void addrconf_rs_timer(struct timer_list *t); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -2820,7 +2820,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) dev = __dev_get_by_name(net, p.name); if (!dev) goto err_exit; - err = dev_open(dev); + err = dev_open(dev, NULL); } } #endif @@ -3439,6 +3439,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; struct netdev_notifier_changeupper_info *info; struct inet6_dev *idev = __in6_dev_get(dev); struct net *net = dev_net(dev); @@ -3513,7 +3514,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { if (idev->if_flags & IF_READY) { /* device is already configured - * but resend MLD reports, we might @@ -3521,6 +3522,9 @@ static 
int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + addrconf_dad_run(idev, true); rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } @@ -3555,7 +3559,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!IS_ERR_OR_NULL(idev)) { if (run_pending) - addrconf_dad_run(idev); + addrconf_dad_run(idev, false); /* Device has an address by now */ rt6_sync_up(dev, RTNH_F_DEAD); @@ -4173,16 +4177,19 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, addrconf_verify_rtnl(); } -static void addrconf_dad_run(struct inet6_dev *idev) +static void addrconf_dad_run(struct inet6_dev *idev, bool restart) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { spin_lock(&ifp->lock); - if (ifp->flags & IFA_F_TENTATIVE && - ifp->state == INET6_IFADDR_STATE_DAD) + if ((ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) || restart) { + if (restart) + ifp->state = INET6_IFADDR_STATE_PREDAD; addrconf_dad_kick(ifp); + } spin_unlock(&ifp->lock); } read_unlock_bh(&idev->lock); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 5eeeba7181a1..f3515ebe9b3a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,23 +99,16 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; - bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; - score = 1; - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, - dif, sdif); - if (!dev_match) + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score++; + score = 1; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -164,26 +157,12 @@ struct sock *inet6_lookup_listener(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - unsigned int hash = inet_lhashfn(net, hnum); - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - bool exact_dif = inet6_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; - struct sock *sk, *result = NULL; - int score, hiscore = 0; + struct sock *result = NULL; unsigned int hash2; - u32 phash = 0; - - if (ilb->count <= 10 || !hashinfo->lhash2) - goto port_lookup; - - /* Too many sk in the ilb bucket (which is hashed by port alone). - * Try lhash2 (which is hashed by port and addr) instead. 
- */ hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, @@ -192,33 +171,12 @@ struct sock *inet6_lookup_listener(struct net *net, goto done; /* Lookup lhash2 with in6addr_any */ - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, + saddr, sport, &in6addr_any, hnum, dif, sdif); - goto done; - -port_lookup: - sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); - if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - goto done; - } - result = sk; - hiscore = score; - } - } done: if (unlikely(IS_ERR(result))) return NULL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 81b69bcee714..229e55c99021 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1885,12 +1885,6 @@ static void ip6gre_tap_setup(struct net_device *dev) netif_keep_dst(dev); } -bool is_ip6gretap_dev(const struct net_device *dev) -{ - return dev->netdev_ops == &ip6gre_tap_netdev_ops; -} -EXPORT_SYMBOL_GPL(is_ip6gretap_dev); - static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 3c06cc9e9b79..c7ed2b6d5a1d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 70f525c33cb6..5c045691c302 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -20,6 +20,23 @@ #include "ip6_offload.h" +/* All GRO functions are always builtin, except UDP over ipv6, which lays in + * ipv6 module, as it depends on UDPv6 lookup function, so we need special care + * when ipv6 is built as a module + */ +#if IS_BUILTIN(CONFIG_IPV6) +#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif + +#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ +({ \ + unlikely(gro_recursion_inc_test(skb)) ? 
\ + NAPI_GRO_CB(skb)->flush |= 1, NULL : \ + INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ +}) + static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; @@ -164,8 +181,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff *ipv6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; @@ -260,7 +281,8 @@ not_same_flow: skb_gro_postpull_rcsum(skb, iph, nlen); - pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, + ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -301,7 +323,9 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, return inet_gro_receive(head, skb); } -static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); @@ -320,7 +344,8 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb, nhoff); + err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, + udp6_gro_complete, skb, nhoff); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89e0d5118afe..9d55ee33b7f9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; - if (opt) { - unsigned int head_room; + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + if (opt) + head_room += opt->opt_nflen + opt->opt_flen; - /* First: exthdrs may take lots of space (~8K for now) - MAX_HEADER is not enough. 
- */ - head_room = opt->opt_nflen + opt->opt_flen; - seg_len += head_room; - head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); - - if (skb_headroom(skb) < head_room) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -ENOBUFS; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; + if (unlikely(skb_headroom(skb) < head_room)) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (!skb2) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + if (opt) { + seg_len += opt->opt_nflen + opt->opt_flen; + if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); @@ -378,6 +378,13 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + return dst_output(net, sk, skb); } @@ -1245,6 +1252,7 @@ static int __ip6_append_data(struct sock *sk, { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1257,7 +1265,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1322,6 +1330,20 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1354,7 +1376,7 @@ emsgsize: unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ if (skb) @@ -1378,6 +1400,7 @@ alloc_new_skb: if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? 
mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) @@ -1439,12 +1462,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes */ @@ -1476,6 +1493,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1505,7 +1529,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1535,6 +1559,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1547,6 +1575,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index b283f293ee4a..3965d5396b0a 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; } + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e2ea691e42c6..34b8a90e6be2 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -655,7 +655,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) return NULL; } - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1968,7 +1968,7 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c, int vifi) + struct sk_buff *skb, int vifi) { struct ipv6hdr *ipv6h; struct vif_device *vif = &mrt->vif_table[vifi]; @@ -2134,15 +2134,14 @@ forward: if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ip6mr_forward2(net, mrt, skb2, - c, psend); + ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { - ip6mr_forward2(net, mrt, skb, c, psend); + ip6mr_forward2(net, mrt, skb, psend); return; } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 5ae8e1c51079..8b075f0bc351 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,7 +24,8 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = sk ? 
sk->sk_bound_dev_if : 0, + .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if : + rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0, .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 491f808e356a..29c7f1915a96 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -58,8 +58,12 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) - nf_nat_masquerade_ipv6_register_notifier(); + if (err) + return err; + + err = nf_nat_masquerade_ipv6_register_notifier(); + if (err) + xt_unregister_target(&masquerade_tg6_reg); return err; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d219979c3e52..181da2c40f9a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -341,7 +341,7 @@ static bool nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; u8 ecn; inet_frag_kill(&fq->q); @@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic return false; } + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return false; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 3e4bf2286abe..0ad0da5a2600 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -132,8 +132,8 @@ static void iterate_cleanup_work(struct work_struct *work) * of ipv6 addresses being deleted), we also need to add an upper * limit to the number of queued work items. 
*/ -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int masq_inet6_event(struct notifier_block *this, + unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; @@ -171,30 +171,53 @@ static int masq_inet_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, +static struct notifier_block masq_inet6_notifier = { + .notifier_call = masq_inet6_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv6_register_notifier(void) +int nf_nat_masquerade_ipv6_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier is already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; + + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; + + ret = register_inet6addr_notifier(&masq_inet6_notifier); + if (ret) + goto err_unregister; - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); void nf_nat_masquerade_ipv6_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; - unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_inet6addr_notifier(&masq_inet6_notifier); unregister_netdevice_notifier(&masq_dev_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index dd0122f3cffe..e06c82e9dfcd 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -70,7 +70,9 @@ static int __init nft_masq_ipv6_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv6_register_notifier(); + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv6_type); return ret; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c3c92713096..aa26c45486d9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; unsigned int nhoff; int sum_truesize; u8 ecn; @@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (payload_len > IPV6_MAXPLEN) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b2447b7c7303..194bc162866d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2232,8 +2232,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { - if (dst_hold_safe(&rt->dst)) - rt6_remove_exception_rt(rt); + rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; @@ -2360,10 +2359,13 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { + int oif = sk->sk_bound_dev_if; struct dst_entry *dst; - ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); + if (!oif && skb->dev) + oif = l3mdev_master_ifindex(skb->dev); + + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || @@ -3215,8 +3217,8 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; - if (dst_hold_safe(&rt->dst)) - rc = rt6_remove_exception_rt(rt); + + rc = rt6_remove_exception_rt(rt); out: return rc; } diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a8854dd3e9c5..8181ee7e1e27 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a3f559162521..b81eb7cb815e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -737,6 +737,7 @@ static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk_listener); @@ -744,7 +745,7 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ - if (!sk_listener->sk_bound_dev_if && + if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index e72947c99454..3179c425d7ff 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -9,14 +9,15 @@ * * TCPv6 GSO/GRO support */ +#include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff *tcp6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. 
*/ if (!NAPI_GRO_CB(skb)->flush && @@ -29,7 +30,7 @@ static struct sk_buff *tcp6_gro_receive(struct list_head *head, return tcp_gro_receive(head, skb); } -static int tcp6_gro_complete(struct sk_buff *skb, int thoff) +INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0c0cb1611aef..9cbf363172bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -125,6 +125,9 @@ static int compute_score(struct sock *sk, struct net *net, sk->sk_family != PF_INET6) return -1; + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score = 0; inet = inet_sk(sk); @@ -134,12 +137,6 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1; @@ -197,66 +194,32 @@ struct sock *__udp6_lib_lookup(struct net *net, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *sk, *result; unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); - struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + unsigned int hash2, slot2; + struct udp_hslot *hslot2; + struct sock *result; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness; - u32 hash = 0; - if (hslot->count > 10) { - hash2 = ipv6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp6_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, exact_dif, + hslot2, skb); + if (!result) { + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, - hslot2, skb); - if (!result) { - unsigned int old_slot2 = slot2; - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - /* avoid searching the same slot again. 
*/ - if (unlikely(slot2 == old_slot2)) - return result; - - hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; - - result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, - exact_dif, hslot2, - skb); - } - if (unlikely(IS_ERR(result))) - return NULL; - return result; - } -begin: - result = NULL; - badness = -1; - sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, - sdif, exact_dif); - if (score > badness) { - if (sk->sk_reuseport) { - hash = udp6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (unlikely(IS_ERR(result))) - return NULL; - if (result) - return result; - } - result = sk; - badness = score; - } + &in6addr_any, hnum, dif, sdif, + exact_dif, hslot2, + skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -326,6 +289,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; + struct udp_mib *mib; int is_udp4; if (flags & MSG_ERRQUEUE) @@ -349,6 +313,7 @@ try_again: msg->msg_flags |= MSG_TRUNC; is_udp4 = (skb->protocol == htons(ETH_P_IP)); + mib = __UDPX_MIB(sk, is_udp4); /* * If checksum is needed at all, try to do it while copying the @@ -377,24 +342,13 @@ try_again: if (unlikely(err)) { if (!peeked) { atomic_inc(&sk->sk_drops); - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } - if (!peeked) { - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - } + if (!peeked) + SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_ts_and_drops(msg, sk, skb); @@ -443,17 +397,8 @@ try_again: csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { - if (is_udp4) { - UDP_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } else { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } + SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -466,7 +411,7 @@ csum_copy_err: DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { - static_branch_enable(&udpv6_encap_needed_key); + static_branch_inc(&udpv6_encap_needed_key); } EXPORT_SYMBOL(udpv6_encap_enable); @@ -1597,7 +1542,7 @@ void udpv6_destroy_sock(struct sock *sk) encap_destroy(sk); } if (up->encap_enabled) - static_branch_disable(&udpv6_encap_needed_key); + static_branch_dec(&udpv6_encap_needed_key); } inet6_destroy_sock(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 828b2457f97b..83b11d0ac091 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -11,6 +11,7 @@ */ #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/udp.h> @@ -114,8 +115,8 @@ out: return segs; } -static struct sk_buff *udp6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { 
struct udphdr *uh = udp_gro_udphdr(skb); @@ -142,7 +143,7 @@ flush: return NULL; } -static int udp6_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0bed4cc20603..78ea5a739d10 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1873,30 +1873,26 @@ static void iucv_callback_txdone(struct iucv_path *path, struct sock *sk = path->private; struct sk_buff *this = NULL; struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; - struct sk_buff *list_skb = list->next; + struct sk_buff *list_skb; unsigned long flags; bh_lock_sock(sk); - if (!skb_queue_empty(list)) { - spin_lock_irqsave(&list->lock, flags); - while (list_skb != (struct sk_buff *)list) { - if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { - this = list_skb; - break; - } - list_skb = list_skb->next; + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk(list, list_skb) { + if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { + this = list_skb; + break; } - if (this) - __skb_unlink(this, list); - - spin_unlock_irqrestore(&list->lock, flags); + } + if (this) + __skb_unlink(this, list); + spin_unlock_irqrestore(&list->lock, flags); - if (this) { - kfree_skb(this); - /* wake up any process waiting for sending */ - iucv_sock_wake_msglim(sk); - } + if (this) { + kfree_skb(this); + /* wake up any process waiting for sending */ + iucv_sock_wake_msglim(sk); } if (sk->sk_state == IUCV_CLOSING) { @@ -2284,11 +2280,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, list = &iucv->send_skb_q; spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) - goto out_unlock; - list_skb = list->next; - nskb = list_skb->next; - while (list_skb != (struct sk_buff *)list) { + skb_queue_walk_safe(list, list_skb, nskb) { if (skb_shinfo(list_skb) == skb_shinfo(skb)) { switch (n) { case TX_NOTIFY_OK: @@ -2321,10 +2313,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, } break; } - list_skb = nskb; - nskb = nskb->next; } -out_unlock: spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 82cdf9020b53..26f1d435696a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1490,12 +1490,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, goto err_sock; } - sk = sock->sk; - - sock_hold(sk); - tunnel->sock = sk; tunnel->l2tp_net = net; - pn = l2tp_pernet(net); spin_lock_bh(&pn->l2tp_tunnel_list_lock); @@ -1510,6 +1505,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + sk = sock->sk; + sock_hold(sk); + tunnel->sock = sk; + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 04d9946dcdba..c03c6461f236 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -236,6 +236,10 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int skb->data[1] == PPP_UI) skb_pull(skb, 2); + /* Decompress protocol field if PFC is enabled */ + if ((*skb->data) & 0x1) + *(u8 *)skb_push(skb, 1) = 0; + if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 
8da86ceca33d..309dee76724e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -47,6 +47,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev) EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master + * device + * @net: network namespace for device index lookup + * @ifindex: targeted interface + */ +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, ifindex); + while (dev && !netif_is_l3_master(dev)) + dev = netdev_master_upper_dev_get(dev); + + return dev ? dev->ifindex : 0; +} +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); + +/** * l3mdev_fib_table - get FIB table id associated with an L3 * master interface * @dev: targeted interface diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 51622333d460..818aa0060349 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2891,7 +2891,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + - beacon->probe_resp_len; + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) @@ -2934,8 +2934,9 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } - if (beacon->ftm_responder) - new_beacon->ftm_responder = beacon->ftm_responder; + + /* might copy -1, meaning no changes requested */ + new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5836ddeac9e3..5f3c81e705c7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1015,6 +1015,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (local->open_count == 0) ieee80211_clear_tx_pending(local); + sdata->vif.bss_conf.beacon_int = 0; + /* * If the interface goes down while suspended, presumably because * the device was unplugged and that happens before our resume, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d2bc8d57c87e..bcf5ffc1567a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2766,6 +2766,7 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_info *sta; + bool result = true; sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; @@ -2778,15 +2779,18 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, sta = sta_info_get(sdata, bssid); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); - return false; + result = false; + goto out; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", bssid); - return false; + result = false; + goto out; } - mutex_unlock(&sdata->local->sta_mtx); - return true; +out: + mutex_unlock(&sdata->local->sta_mtx); + return result; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd3b5769797..428f7ad5f9b5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1403,6 +1403,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || 
+ ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; @@ -3063,7 +3064,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { @@ -3100,7 +3101,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } default: diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa4afbf0abaf..a794ca729000 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -964,6 +964,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) sta->status_stats.last_tdls_pkt_time = jiffies; + } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + return; } else { ieee80211_lost_packet(sta, info); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0ccee23fbcd..1f536ba573b4 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -439,8 +439,8 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; - /* no stations in PS mode */ - if (!atomic_read(&ps->num_sta_ps)) + /* no stations in PS mode and no buffered packets */ + if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 1dae77c54009..87505600dbb2 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -73,10 +73,15 @@ enum { #define NCSI_OEM_MFR_BCM_ID 0x113d /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ +/* Mellanox specific OEM Command */ +#define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ +#define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 +#define NCSI_OEM_MLX_CMD_GMA_LEN 8 /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 +#define MLX_MAC_ADDR_OFFSET 8 struct ncsi_channel_version { @@ -222,6 +227,10 @@ struct ncsi_package { unsigned int channel_num; /* Number of channels */ struct list_head channels; /* List of chanels */ struct list_head node; /* Form list of packages */ + + bool multi_channel; /* Enable multiple channels */ + u32 channel_whitelist; /* Channels to configure */ + struct ncsi_channel *preferred_channel; /* Primary channel */ }; struct ncsi_request { @@ -287,16 +296,16 @@ struct ncsi_dev_priv { #define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ #define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ #define NCSI_DEV_RESHUFFLE 4 +#define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ #if IS_ENABLED(CONFIG_IPV6) unsigned int inet6_addr_num; /* Number of IPv6 addresses */ #endif + unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ - struct ncsi_package *force_package; /* Force a specific package */ - struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request 
requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -309,6 +318,9 @@ struct ncsi_dev_priv { struct list_head node; /* Form NCSI device list */ #define NCSI_MAX_VLAN_VIDS 15 struct list_head vlan_vids; /* List of active VLAN IDs */ + + bool multi_package; /* Enable multiple packages */ + u32 package_whitelist; /* Packages to configure */ }; struct ncsi_cmd_arg { @@ -341,6 +353,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +int ncsi_reset_dev(struct ncsi_dev *nd); void ncsi_start_channel_monitor(struct ncsi_channel *nc); void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -361,6 +374,13 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); +bool ncsi_channel_has_link(struct ncsi_channel *channel); +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel); +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *np, + struct ncsi_channel *disable, + struct ncsi_channel *enable); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 25e483e8278b..26d67e27551f 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -50,13 +50,15 @@ static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h, static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, struct ncsi_aen_pkt_hdr *h) { - struct ncsi_aen_lsc_pkt *lsc; - struct ncsi_channel *nc; + struct ncsi_channel *nc, *tmp; struct ncsi_channel_mode *ncm; - bool chained; - int state; unsigned long old_data, data; + struct ncsi_aen_lsc_pkt *lsc; + struct ncsi_package *np; + bool had_link, has_link; unsigned long flags; + bool chained; + int state; /* Find the NCSI channel */ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); @@ -73,6 +75,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); + had_link = !!(old_data & 0x1); + has_link = !!(data & 0x1); + netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", nc->id, data & 0x1 ? 
"up" : "down"); @@ -80,22 +85,60 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); - if (!((old_data ^ data) & 0x1) || chained) - return 0; - if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) && - !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1))) + if (state == NCSI_CHANNEL_INACTIVE) + netdev_warn(ndp->ndev.dev, + "NCSI: Inactive channel %u received AEN!\n", + nc->id); + + if ((had_link == has_link) || chained) return 0; - if (!(ndp->flags & NCSI_DEV_HWA) && - state == NCSI_CHANNEL_ACTIVE) - ndp->flags |= NCSI_DEV_RESHUFFLE; + if (!ndp->multi_package && !nc->package->multi_channel) { + if (had_link) { + ndp->flags |= NCSI_DEV_RESHUFFLE; + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_process_next_channel(ndp); + } + /* Configured channel came up */ + return 0; + } - ncsi_stop_channel_monitor(nc); - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); + if (had_link) { + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (ncsi_channel_is_last(ndp, nc)) { + /* No channels left, reconfigure */ + return ncsi_reset_dev(&ndp->ndev); + } else if (ncm->enable) { + /* Need to failover Tx channel */ + ncsi_update_tx_channel(ndp, nc->package, nc, NULL); + } + } else if (has_link && nc->package->preferred_channel == nc) { + /* Return Tx to preferred channel */ + ncsi_update_tx_channel(ndp, nc->package, NULL, nc); + } else if (has_link) { + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* Enable Tx on this channel if the current Tx + * channel is down. + */ + ncm = &tmp->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable && + !ncsi_channel_has_link(tmp)) { + ncsi_update_tx_channel(ndp, nc->package, + tmp, nc); + break; + } + } + } + } - return ncsi_process_next_channel(ndp); + /* Leave configured channels active in a multi-channel scenario so + * AEN events are still received. 
+ */ + return 0; } static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bfc43b28c7a6..31359d5e14ad 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -28,6 +28,29 @@ LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); +bool ncsi_channel_has_link(struct ncsi_channel *channel) +{ + return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1); +} + +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + + NCSI_FOR_EACH_PACKAGE(ndp, np) + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc == channel) + continue; + if (nc->state == NCSI_CHANNEL_ACTIVE && + ncsi_channel_has_link(nc)) + return false; + } + + return true; +} + static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) { struct ncsi_dev *nd = &ndp->ndev; @@ -52,7 +75,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) continue; } - if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + if (ncsi_channel_has_link(nc)) { spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; @@ -113,10 +136,8 @@ static void ncsi_channel_monitor(struct timer_list *t) default: netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", nc->id); - if (!(ndp->flags & NCSI_DEV_HWA)) { - ncsi_report_link(ndp, true); - ndp->flags |= NCSI_DEV_RESHUFFLE; - } + ncsi_report_link(ndp, true); + ndp->flags |= NCSI_DEV_RESHUFFLE; ncsi_stop_channel_monitor(nc); @@ -269,6 +290,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, np->ndp = ndp; spin_lock_init(&np->lock); INIT_LIST_HEAD(&np->channels); + np->channel_whitelist = UINT_MAX; spin_lock_irqsave(&ndp->lock, flags); tmp = ncsi_find_package(ndp, id); @@ -442,12 +464,14 @@ static void ncsi_request_timeout(struct timer_list *t) static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; - struct ncsi_package *np = ndp->active_package; - struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_package *np; + struct ncsi_channel *nc, *tmp; struct ncsi_cmd_arg nca; unsigned long flags; int ret; + np = ndp->active_package; + nc = ndp->active_channel; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { @@ -523,6 +547,15 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* If there is another channel active on this package + * do not deselect the package. 
+ */ + if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) { + nd->state = ncsi_dev_state_suspend_done; + break; + } + } break; case ncsi_dev_state_suspend_deselect: ndp->pending_req_num = 1; @@ -541,8 +574,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); - ncsi_process_next_channel(ndp); - + if (ndp->flags & NCSI_DEV_RESET) + ncsi_reset_dev(nd); + else + ncsi_process_next_channel(ndp); break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", @@ -675,12 +710,38 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_GMA_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_GMA_LEN / sizeof(u32)]; + } u; + int ret = 0; + + nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA; + u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM; + + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { - { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm } + { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) @@ -717,13 +778,144 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) #endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ +/* Determine if a given channel from the channel_queue should be used for Tx */ +static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct ncsi_channel_mode *ncm; + struct ncsi_channel *channel; + struct ncsi_package *np; + + /* Check if any other channel has Tx enabled; a channel may have already + * been configured and removed from the channel queue. 
+ */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (!ndp->multi_package && np != nc->package) + continue; + NCSI_FOR_EACH_CHANNEL(np, channel) { + ncm = &channel->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable) + return false; + } + } + + /* This channel is the preferred channel and has link */ + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) { + np = channel->package; + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + return np->preferred_channel == nc; + } + } + + /* This channel has link */ + if (ncsi_channel_has_link(nc)) + return true; + + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) + if (ncsi_channel_has_link(channel)) + return false; + + /* No other channel has link; default to this one */ + return true; +} + +/* Change the active Tx channel in a multi-channel setup */ +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *package, + struct ncsi_channel *disable, + struct ncsi_channel *enable) +{ + struct ncsi_cmd_arg nca; + struct ncsi_channel *nc; + struct ncsi_package *np; + int ret = 0; + + if (!package->multi_channel && !ndp->multi_package) + netdev_warn(ndp->ndev.dev, + "NCSI: Trying to update Tx channel in single-channel mode\n"); + nca.ndp = ndp; + nca.req_flags = 0; + + /* Find current channel with Tx enabled */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (disable) + break; + if (!ndp->multi_package && np != package) + continue; + + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->modes[NCSI_MODE_TX_ENABLE].enable) { + disable = nc; + break; + } + } + + /* Find a suitable channel for Tx */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (enable) + break; + if (!ndp->multi_package && np != package) + continue; + if (!(ndp->package_whitelist & (0x1 << np->id))) + continue; + + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + enable = np->preferred_channel; + break; + } + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!(np->channel_whitelist & 0x1 << nc->id)) + continue; + if (nc->state != NCSI_CHANNEL_ACTIVE) + continue; + if (ncsi_channel_has_link(nc)) { + enable = nc; + break; + } + } + } + + if (disable == enable) + return -1; + + if (!enable) + return -1; + + if (disable) { + nca.channel = disable->id; + nca.package = disable->package->id; + nca.type = NCSI_PKT_CMD_DCNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending DCNT\n", + ret); + } + + netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id); + + nca.channel = enable->id; + nca.package = enable->package->id; + nca.type = NCSI_PKT_CMD_ECNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending ECNT\n", + ret); + + return ret; +} + static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_dev *nd = &ndp->ndev; - struct net_device *dev = nd->dev; struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_channel *hot_nc = NULL; + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; struct ncsi_cmd_arg nca; unsigned char index; unsigned long flags; @@ -845,20 +1037,29 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #if IS_ENABLED(CONFIG_IPV6) if (ndp->inet6_addr_num > 
0 && (nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) nd->state = ncsi_dev_state_config_egmf; - else - nd->state = ncsi_dev_state_config_ecnt; } else if (nd->state == ncsi_dev_state_config_egmf) { nca.type = NCSI_PKT_CMD_EGMF; nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { + if (np->preferred_channel && + nc != np->preferred_channel) + netdev_info(ndp->ndev.dev, + "NCSI: Tx failed over to channel %u\n", + nc->id); nca.type = NCSI_PKT_CMD_ECNT; nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_ec) { @@ -889,6 +1090,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_ACTIVE; + + if (ndp->flags & NCSI_DEV_RESET) { + /* A reset event happened during config, start it now */ + nc->reconfigure_needed = false; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_reset_dev(nd); + break; + } + if (nc->reconfigure_needed) { /* This channel's configuration has been updated * part-way during the config state - start the @@ -909,10 +1120,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { hot_nc = nc; - nc->state = NCSI_CHANNEL_ACTIVE; } else { hot_nc = NULL; - nc->state = NCSI_CHANNEL_INACTIVE; netdev_dbg(ndp->ndev.dev, "NCSI: channel %u link down after config\n", nc->id); @@ -940,43 +1149,35 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np, *force_package; - struct ncsi_channel *nc, *found, *hot_nc, *force_channel; + struct ncsi_channel *nc, *found, *hot_nc; struct ncsi_channel_mode *ncm; - unsigned long flags; + unsigned long flags, cflags; + struct ncsi_package *np; + bool with_link; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; - force_channel = ndp->force_channel; - force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); - /* Force a specific channel whether or not it has link if we have been - * configured to do so - */ - if (force_package && force_channel) { - found = force_channel; - ncm = &found->modes[NCSI_MODE_LINK]; - if (!(ncm->data[2] & 0x1)) - netdev_info(ndp->ndev.dev, - "NCSI: Channel %u forced, but it is link down\n", - found->id); - goto out; - } - - /* The search is done once an inactive channel with up - * link is found. + /* By default the search is done once an inactive channel with up + * link is found, unless a preferred channel is set. + * If multi_package or multi_channel are configured all channels in the + * whitelist are added to the channel queue. 
*/ found = NULL; + with_link = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { - if (ndp->force_package && np != ndp->force_package) + if (!(ndp->package_whitelist & (0x1 << np->id))) continue; NCSI_FOR_EACH_CHANNEL(np, nc) { - spin_lock_irqsave(&nc->lock, flags); + if (!(np->channel_whitelist & (0x1 << nc->id))) + continue; + + spin_lock_irqsave(&nc->lock, cflags); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_INACTIVE) { - spin_unlock_irqrestore(&nc->lock, flags); + spin_unlock_irqrestore(&nc->lock, cflags); continue; } @@ -988,32 +1189,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { - spin_unlock_irqrestore(&nc->lock, flags); found = nc; - goto out; + with_link = true; } - spin_unlock_irqrestore(&nc->lock, flags); + /* If multi_channel is enabled configure all valid + * channels whether or not they currently have link + * so they will have AENs enabled. + */ + if (with_link || np->multi_channel) { + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, + &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + nc->id, + ncm->data[2] & 0x1 ? "up" : "down"); + } + + spin_unlock_irqrestore(&nc->lock, cflags); + + if (with_link && !np->multi_channel) + break; } + if (with_link && !ndp->multi_package) + break; } - if (!found) { + if (list_empty(&ndp->channel_queue) && found) { + netdev_info(ndp->ndev.dev, + "NCSI: No channel with link found, configuring channel %u\n", + found->id); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + } else if (!found) { netdev_warn(ndp->ndev.dev, - "NCSI: No channel found with link\n"); + "NCSI: No channel found to configure!\n"); ncsi_report_link(ndp, true); return -ENODEV; } - ncm = &found->modes[NCSI_MODE_LINK]; - netdev_dbg(ndp->ndev.dev, - "NCSI: Channel %u added to queue (link %s)\n", - found->id, ncm->data[2] & 0x1 ? 
"up" : "down"); - -out: - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&found->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); - return ncsi_process_next_channel(ndp); } @@ -1050,35 +1268,6 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) return false; } -static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) -{ - struct ncsi_package *np; - struct ncsi_channel *nc; - unsigned long flags; - - /* Move all available channels to processing queue */ - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) { - NCSI_FOR_EACH_CHANNEL(np, nc) { - WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || - !list_empty(&nc->link)); - ncsi_stop_channel_monitor(nc); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - } - } - spin_unlock_irqrestore(&ndp->lock, flags); - - /* We can have no channels in extremely case */ - if (list_empty(&ndp->channel_queue)) { - netdev_err(ndp->ndev.dev, - "NCSI: No available channels for HWA\n"); - ncsi_report_link(ndp, false); - return -ENOENT; - } - - return ncsi_process_next_channel(ndp); -} - static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1110,70 +1299,28 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: - ndp->pending_req_num = 16; + ndp->pending_req_num = 1; - /* Select all possible packages */ nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - - /* Disable all possible packages */ - nca.type = NCSI_PKT_CMD_DP; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; nd->state = ncsi_dev_state_probe_channel; break; case ncsi_dev_state_probe_channel: - if (!ndp->active_package) - ndp->active_package = list_first_or_null_rcu( - &ndp->packages, struct ncsi_package, node); - else if (list_is_last(&ndp->active_package->node, - &ndp->packages)) - ndp->active_package = NULL; - else - ndp->active_package = list_next_entry( - ndp->active_package, node); - - /* All available packages and channels are enumerated. The - * enumeration happens for once when the NCSI interface is - * started. So we need continue to start the interface after - * the enumeration. - * - * We have to choose an active channel before configuring it. - * Note that we possibly don't have active channel in extreme - * situation. 
- */ + ndp->active_package = ncsi_find_package(ndp, + ndp->package_probe_id); if (!ndp->active_package) { - ndp->flags |= NCSI_DEV_PROBED; - if (ncsi_check_hwa(ndp)) - ncsi_enable_hwa(ndp); - else - ncsi_choose_active_channel(ndp); - return; + /* No response */ + nd->state = ncsi_dev_state_probe_dp; + schedule_work(&ndp->work); + break; } - - /* Select the active package */ - ndp->pending_req_num = 1; - nca.type = NCSI_PKT_CMD_SP; - nca.bytes[0] = 1; - nca.package = ndp->active_package->id; - nca.channel = NCSI_RESERVED_CHANNEL; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - nd->state = ncsi_dev_state_probe_cis; + schedule_work(&ndp->work); break; case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1222,22 +1369,35 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; - /* Deselect the active package */ + /* Deselect the current package */ nca.type = NCSI_PKT_CMD_DP; - nca.package = ndp->active_package->id; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; - /* Scan channels in next package */ - nd->state = ncsi_dev_state_probe_channel; + /* Probe next package */ + ndp->package_probe_id++; + if (ndp->package_probe_id >= 8) { + /* Probe finished */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + nd->state = ncsi_dev_state_probe_package; + ndp->active_package = NULL; break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", nd->state); } + if (ndp->flags & NCSI_DEV_PROBED) { + /* Check if all packages have HWA support */ + ncsi_check_hwa(ndp); + ncsi_choose_active_channel(ndp); + } + return; error: netdev_err(ndp->ndev.dev, @@ -1556,6 +1716,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, INIT_LIST_HEAD(&ndp->channel_queue); INIT_LIST_HEAD(&ndp->vlan_vids); INIT_WORK(&ndp->work, ncsi_dev_work); + ndp->package_whitelist = UINT_MAX; /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); @@ -1592,26 +1753,19 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) return -ENOTTY; if (!(ndp->flags & NCSI_DEV_PROBED)) { + ndp->package_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; } - if (ndp->flags & NCSI_DEV_HWA) { - netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n"); - ret = ncsi_enable_hwa(ndp); - } else { - ret = ncsi_choose_active_channel(ndp); - } - - return ret; + return ncsi_reset_dev(nd); } EXPORT_SYMBOL_GPL(ncsi_start_dev); @@ -1624,7 +1778,10 @@ void ncsi_stop_dev(struct ncsi_dev *nd) int old_state; unsigned long flags; - /* Stop the channel monitor and reset channel's state */ + /* Stop the channel monitor on any active channels. Don't reset the + * channel state so we know which were active when ncsi_start_dev() + * is next called. 
+ */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { ncsi_stop_channel_monitor(nc); @@ -1632,7 +1789,6 @@ void ncsi_stop_dev(struct ncsi_dev *nd) spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; - nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); WARN_ON_ONCE(chained || @@ -1645,6 +1801,92 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } EXPORT_SYMBOL_GPL(ncsi_stop_dev); +int ncsi_reset_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_channel *nc, *active, *tmp; + struct ncsi_package *np; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + + if (!(ndp->flags & NCSI_DEV_RESET)) { + /* Haven't been called yet, check states */ + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_registered: + case ncsi_dev_state_probe: + /* Not even probed yet - do nothing */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + case ncsi_dev_state_suspend: + case ncsi_dev_state_config: + /* Wait for the channel to finish its suspend/config + * operation; once it finishes it will check for + * NCSI_DEV_RESET and reset the state. + */ + ndp->flags |= NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } else { + switch (nd->state) { + case ncsi_dev_state_suspend_done: + case ncsi_dev_state_config_done: + case ncsi_dev_state_functional: + /* Ok */ + break; + default: + /* Current reset operation happening */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } + + if (!list_empty(&ndp->channel_queue)) { + /* Clear any channel queue we may have interrupted */ + list_for_each_entry_safe(nc, tmp, &ndp->channel_queue, link) + list_del_init(&nc->link); + } + spin_unlock_irqrestore(&ndp->lock, flags); + + active = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + + if (nc->state == NCSI_CHANNEL_ACTIVE) { + active = nc; + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); + break; + } + + spin_unlock_irqrestore(&nc->lock, flags); + } + if (active) + break; + } + + if (!active) { + /* Done */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags &= ~NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_choose_active_channel(ndp); + } + + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags |= NCSI_DEV_RESET; + ndp->active_channel = active; + ndp->active_package = active->package; + spin_unlock_irqrestore(&ndp->lock, flags); + + nd->state = ncsi_dev_state_suspend; + schedule_work(&ndp->work); + return 0; +} + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 33314381b4f5..5d782445d2fc 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -30,6 +30,9 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, + [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG }, + [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) @@ -69,7 +72,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb, nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); if (nc->state == 
NCSI_CHANNEL_ACTIVE) nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); - if (ndp->force_channel == nc) + if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); @@ -114,7 +117,7 @@ static int ncsi_write_package_info(struct sk_buff *skb, if (!pnest) return -ENOMEM; nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); - if (ndp->force_package == np) + if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { @@ -290,49 +293,58 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) { /* The user has set a package that does not exist */ - spin_unlock_irqrestore(&ndp->lock, flags); return -ERANGE; } channel = NULL; - if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { - /* Allow any channel */ - channel_id = NCSI_RESERVED_CHANNEL; - } else { + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(package, nc) - if (nc->id == channel_id) + if (nc->id == channel_id) { channel = nc; + break; + } + if (!channel) { + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } } - if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { - /* The user has set a channel that does not exist on this - * package - */ - spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", - channel_id); - return -ERANGE; - } - - ndp->force_package = package; - ndp->force_channel = channel; + spin_lock_irqsave(&ndp->lock, flags); + ndp->package_whitelist = 0x1 << package->id; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", - package_id, channel_id, - channel_id == NCSI_RESERVED_CHANNEL ? 
" (any)" : ""); + spin_lock_irqsave(&package->lock, flags); + package->multi_channel = false; + if (channel) { + package->channel_whitelist = 0x1 << channel->id; + package->preferred_channel = channel; + } else { + /* Allow any channel */ + package->channel_whitelist = UINT_MAX; + package->preferred_channel = NULL; + } + spin_unlock_irqrestore(&package->lock, flags); + + if (channel) + netdev_info(ndp->ndev.dev, + "Set package 0x%x, channel 0x%x as preferred\n", + package_id, channel_id); + else + netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n", + package_id); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -340,6 +352,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; + struct ncsi_package *np; unsigned long flags; if (!info || !info->attrs) @@ -353,16 +366,24 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) if (!ndp) return -ENODEV; - /* Clear any override */ + /* Reset any whitelists and disable multi mode */ spin_lock_irqsave(&ndp->lock, flags); - ndp->force_package = NULL; - ndp->force_channel = NULL; + ndp->package_whitelist = UINT_MAX; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + spin_lock_irqsave(&np->lock, flags); + np->multi_channel = false; + np->channel_whitelist = UINT_MAX; + np->preferred_channel = NULL; + spin_unlock_irqrestore(&np->lock, flags); + } netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -563,6 +584,138 @@ int ncsi_send_netlink_err(struct net_device *dev, return nlmsg_unicast(net->genl_sock, skb, snd_portid); } +static int ncsi_set_package_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + spin_lock_irqsave(&ndp->lock, flags); + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + if (ndp->flags & NCSI_DEV_HWA) { + ndp->multi_package = true; + rc = 0; + } else { + netdev_err(ndp->ndev.dev, + "NCSI: Can't use multiple packages without HWA\n"); + rc = -EPERM; + } + } else { + ndp->multi_package = false; + rc = 0; + } + + if (!rc) + ndp->package_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]); + spin_unlock_irqrestore(&ndp->lock, flags); + + if (!rc) { + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + } + + return rc; +} + +static int ncsi_set_channel_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + 
return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_CHANNEL_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) { + package = np; + break; + } + if (!package) + return -ERANGE; + + spin_lock_irqsave(&package->lock, flags); + + channel = NULL; + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->id == channel_id) { + channel = nc; + break; + } + if (!channel) { + spin_unlock_irqrestore(&package->lock, flags); + return -ERANGE; + } + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u set as preferred channel\n", + channel->id); + } + + package->channel_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]); + if (package->channel_whitelist == 0) + netdev_dbg(ndp->ndev.dev, + "NCSI: Package %u set to all channels disabled\n", + package->id); + + package->preferred_channel = channel; + + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + package->multi_channel = true; + netdev_info(ndp->ndev.dev, + "NCSI: Multi-channel enabled on package %u\n", + package_id); + } else { + package->multi_channel = false; + } + + spin_unlock_irqrestore(&package->lock, flags); + + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + + return 0; +} + static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, @@ -589,6 +742,18 @@ static const struct genl_ops ncsi_ops[] = { .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NCSI_CMD_SET_PACKAGE_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_package_mask_nl, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_SET_CHANNEL_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_channel_mask_nl, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family ncsi_genl_family __ro_after_init = { diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 4d3f06be38bd..2a6d83a596c9 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -165,6 +165,15 @@ struct ncsi_rsp_oem_pkt { unsigned char data[]; /* Payload data */ }; +/* Mellanox Response Data */ +struct ncsi_rsp_oem_mlx_pkt { + unsigned char cmd_rev; /* Command Revision */ + unsigned char cmd; /* Command ID */ + unsigned char param; /* Parameter */ + unsigned char optional; /* Optional data */ + unsigned char data[]; /* Data */ +}; + /* Broadcom Response Data */ struct ncsi_rsp_oem_bcm_pkt { unsigned char ver; /* Payload Version */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 77e07ba3f493..dc07fcc7938e 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -256,7 +256,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) if (!ncm->enable) return 0; - ncm->enable = 1; + ncm->enable = 0; return 0; } @@ -611,6 +611,45 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } +/* Response handler for Mellanox command Get Mac Address */ +static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt 
*)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + +/* Response handler for Mellanox card */ +static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_mlx_pkt *mlx; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + mlx = (struct ncsi_rsp_oem_mlx_pkt *)(rsp->data); + + if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA && + mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM) + return ncsi_rsp_handler_oem_mlx_gma(nr); + return 0; +} + /* Response handler for Broadcom command Get Mac Address */ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) { @@ -655,7 +694,7 @@ static struct ncsi_rsp_oem_handler { unsigned int mfr_id; int (*handler)(struct ncsi_request *nr); } ncsi_rsp_oem_handlers[] = { - { NCSI_OEM_MFR_MLX_ID, NULL }, + { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx }, { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm } }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 83395bf6dc35..432141f04af3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3980,6 +3980,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, +#ifdef CONFIG_IP_VS_IPV6 + .priority = ADDRCONF_NOTIFY_PRIORITY + 5, +#endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 02ca7df793f5..b6d0f6deea86 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -49,6 +49,7 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + bool dead; struct rcu_head rcu_head; }; @@ -106,15 +107,16 @@ nf_conncount_add(struct nf_conncount_list *list, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - spin_lock(&list->list_lock); + conn->dead = false; + spin_lock_bh(&list->list_lock); if (list->dead == true) { kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_SKIP; } list_add_tail(&conn->node, &list->head); list->count++; - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); @@ -132,19 +134,22 @@ static bool conn_free(struct nf_conncount_list *list, { bool free_entry = false; - spin_lock(&list->list_lock); + spin_lock_bh(&list->list_lock); - if (list->count == 0) { - spin_unlock(&list->list_lock); - return free_entry; + if (conn->dead) { + spin_unlock_bh(&list->list_lock); + return free_entry; } list->count--; + conn->dead = true; list_del_rcu(&conn->node); - if (list->count == 0) + if (list->count == 0) { + list->dead = true; free_entry = true; + } - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); call_rcu(&conn->rcu_head, __conn_free); return free_entry; } @@ -245,7 +250,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); - list->count = 1; + list->count = 0; list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -259,6 +264,7 
@@ bool nf_conncount_gc_list(struct net *net, struct nf_conn *found_ct; unsigned int collected = 0; bool free_entry = false; + bool ret = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); @@ -288,7 +294,15 @@ bool nf_conncount_gc_list(struct net *net, if (collected > CONNCOUNT_GC_MAX_NODES) return false; } - return false; + + spin_lock_bh(&list->list_lock); + if (!list->count) { + list->dead = true; + ret = true; + } + spin_unlock_bh(&list->list_lock); + + return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); @@ -309,11 +323,8 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - if (rbconn->list.count == 0 && rbconn->list.dead == false) { - rbconn->list.dead = true; - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); - } + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); spin_unlock(&rbconn->list.list_lock); } } @@ -414,6 +425,7 @@ insert_tree(struct net *net, nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; + rbconn->list.count = count; rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9b48dc8b4b88..2a5e56c6d8d9 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -43,24 +43,12 @@ #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> -enum grep_conntrack { - GRE_CT_UNREPLIED, - GRE_CT_REPLIED, - GRE_CT_MAX -}; - static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; static unsigned int proto_gre_net_id __read_mostly; -struct netns_proto_gre { - struct nf_proto_net nf; - rwlock_t keymap_lock; - struct list_head keymap_list; - unsigned int gre_timeouts[GRE_CT_MAX]; -}; static inline struct netns_proto_gre *gre_pernet(struct net *net) { @@ -402,6 +390,8 @@ static int __init nf_ct_proto_gre_init(void) { int ret; + BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0); + ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 42487d01a3ed..2e61aab6ed73 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2457,7 +2457,7 @@ err: static void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { - struct nft_expr *expr; + struct nft_expr *expr, *next; /* * Careful: some expressions might not be initialized in case this @@ -2465,8 +2465,9 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, */ expr = nft_expr_first(rule); while (expr != nft_expr_last(rule) && expr->ops) { + next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); - expr = nft_expr_next(expr); + expr = next; } kfree(rule); } @@ -2589,17 +2590,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (chain->use == UINT_MAX) return -EOVERFLOW; - } - - if (nla[NFTA_RULE_POSITION]) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) - return -EOPNOTSUPP; - pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); - if (IS_ERR(old_rule)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); - return PTR_ERR(old_rule); + if (nla[NFTA_RULE_POSITION]) { + 
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); + old_rule = __nft_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); + return PTR_ERR(old_rule); + } } } @@ -2669,21 +2667,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (!nft_is_active_next(net, old_rule)) { - err = -ENOENT; - goto err2; - } - trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, - old_rule); + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_deactivate_next(net, old_rule); - chain->use--; - - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { - err = -ENOMEM; + err = nft_delrule(&ctx, old_rule); + if (err < 0) { + nft_trans_destroy(trans); goto err2; } @@ -6324,7 +6315,7 @@ static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); } -static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain) +static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { struct nft_rule **g0, **g1; bool next_genbit; @@ -6441,11 +6432,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) /* step 2. Make rules_gen_X visible to packet path */ list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(chain, &table->chains, list) { - if (!nft_is_active_next(net, chain)) - continue; - nf_tables_commit_chain_active(net, chain); - } + list_for_each_entry(chain, &table->chains, list) + nf_tables_commit_chain(net, chain); } /* diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a518eb162344..109b0d27345a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -455,7 +455,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: + case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; case IPPROTO_DCCP: @@ -471,11 +472,21 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, timeouts = nf_sctp_pernet(net)->timeouts; #endif break; + case IPPROTO_GRE: +#ifdef CONFIG_NF_CT_PROTO_GRE + if (l4proto->net_id) { + struct netns_proto_gre *net_gre; + + net_gre = net_generic(net, *l4proto->net_id); + timeouts = net_gre->gre_timeouts; + } +#endif + break; case 255: timeouts = &nf_generic_pernet(net)->timeout; break; default: - WARN_ON_ONCE(1); + WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 43041f087eb3..1ce30efe6854 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1148,8 +1148,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; - entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])); - entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]); + __vlan_hwaccel_put_tag(entry->skb, + nla_get_be16(tb[NFQA_VLAN_PROTO]), + ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 9d0ede474224..7334e0b80a5e 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -520,6 +520,7 @@ __nft_match_destroy(const struct nft_ctx 
*ctx, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; + struct module *me = match->me; struct xt_mtdtor_param par; par.net = ctx->net; @@ -530,7 +531,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, par.match->destroy(&par); if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops))) - module_put(match->me); + module_put(me); } static void diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e82d9a966c45..974525eb92df 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -214,7 +214,9 @@ static int __init nft_flow_offload_module_init(void) { int err; - register_netdevice_notifier(&flow_offload_netdev_notifier); + err = register_netdevice_notifier(&flow_offload_netdev_notifier); + if (err) + goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) @@ -224,6 +226,7 @@ static int __init nft_flow_offload_module_init(void) register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); +err: return err; } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index dec843cadf46..9e05c86ba5c4 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -201,18 +201,8 @@ static __net_init int xt_rateest_net_init(struct net *net) return 0; } -static void __net_exit xt_rateest_net_exit(struct net *net) -{ - struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); - int i; - - for (i = 0; i < ARRAY_SIZE(xn->hash); i++) - WARN_ON_ONCE(!hlist_empty(&xn->hash[i])); -} - static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, - .exit = xt_rateest_net_exit, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3e7d259e5d8d..1ad4017f9b73 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -295,9 +295,10 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); - - if (ret) + if (ret) { + vfree(hinfo); return ret; + } hinfo->cfg.size = size; if (hinfo->cfg.max == 0) @@ -814,7 +815,6 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -830,7 +830,6 @@ hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; @@ -921,7 +920,6 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -940,7 +938,6 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a4660c48ff01..cd94f925495a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1166,7 +1166,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); if (err) { net_warn_ratelimited("openvswitch: zone: %u " - "execeeds conntrack limit\n", + "exceeds conntrack limit\n", info->zone.id); return err; } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index fa393815991e..57e07768c9d1 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -597,7 +597,7 @@ static int 
key_extract(struct sk_buff *skb, struct sw_flow_key *key) * skb_vlan_pop(), which will later shift the ethertype into * skb->protocol. */ - if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) skb->protocol = key->eth.cvlan.tpid; else skb->protocol = key->eth.type; diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 5aaf3babfc3f..acb6077b7478 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 0e72d95b0e8f..c38a62464b85 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -68,7 +68,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 7e6301b2ec4d..8f16f11f7ad3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -131,7 +131,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ec3095f13aae..a74650e98f42 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2394,7 +2394,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) void *ph; __u32 ts; - ph = skb_shinfo(skb)->destructor_arg; + ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); @@ -2461,7 +2461,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->mark = po->sk.sk_mark; skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); - skb_shinfo(skb)->destructor_arg = ph.raw; + skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 64362d078da8..a2522f9d71e2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -375,17 +375,36 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * getting ACKs from the server. Returns a number representing the life state * which can be compared to that returned by a previous call. * - * If this is a client call, ping ACKs will be sent to the server to find out - * whether it's still responsive and whether the call is still alive on the - * server. + * If the life state stalls, rxrpc_kernel_probe_life() should be called and + * then 2RTT waited. 
*/ -u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call) { return call->acks_latest; } EXPORT_SYMBOL(rxrpc_kernel_check_life); /** + * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive + * @sock: The socket the call is on + * @call: The call to check + * + * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to + * find out whether a call is still alive by pinging it. This should cause the + * life state to be bumped in about 2*RTT. + * + * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. + */ +void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) +{ + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_check_life); + rxrpc_send_ack_packet(call, true, NULL); +} +EXPORT_SYMBOL(rxrpc_kernel_probe_life); + +/** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on * @call: The call to query diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9c1b0729aebf..d4b8355737d8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,8 +21,6 @@ #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> -#include <linux/rhashtable.h> -#include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> @@ -1522,227 +1520,8 @@ out_module_put: return skb->len; } -struct tcf_action_net { - struct rhashtable egdev_ht; -}; - -static unsigned int tcf_action_net_id; - -struct tcf_action_egdev_cb { - struct list_head list; - tc_setup_cb_t *cb; - void *cb_priv; -}; - -struct tcf_action_egdev { - struct rhash_head ht_node; - const struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params tcf_action_egdev_ht_params = { - .key_offset = offsetof(struct tcf_action_egdev, dev), - .head_offset = offsetof(struct tcf_action_egdev, ht_node), - .key_len = sizeof(const struct net_device *), -}; - -static struct tcf_action_egdev * -tcf_action_egdev_lookup(const struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - return rhashtable_lookup_fast(&tan->egdev_ht, &dev, - tcf_action_egdev_ht_params); -} - -static struct tcf_action_egdev * -tcf_action_egdev_get(const struct net_device *dev) -{ - struct tcf_action_egdev *egdev; - struct tcf_action_net *tan; - - egdev = tcf_action_egdev_lookup(dev); - if (egdev) - goto inc_ref; - - egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); - if (!egdev) - return NULL; - INIT_LIST_HEAD(&egdev->cb_list); - egdev->dev = dev; - tan = net_generic(dev_net(dev), tcf_action_net_id); - rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, - tcf_action_egdev_ht_params); - -inc_ref: - egdev->refcnt++; - return egdev; -} - -static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) -{ - struct tcf_action_net *tan; - - if (--egdev->refcnt) - return; - tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); - rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, - tcf_action_egdev_ht_params); - kfree(egdev); -} - -static struct tcf_action_egdev_cb * -tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - list_for_each_entry(egdev_cb, &egdev->cb_list, list) - if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) - return egdev_cb; - 
return NULL; -} - -static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, - enum tc_setup_type type, - void *type_data, bool err_stop) -{ - struct tcf_action_egdev_cb *egdev_cb; - int ok_count = 0; - int err; - - list_for_each_entry(egdev_cb, &egdev->cb_list, list) { - err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); - if (err) { - if (err_stop) - return err; - } else { - ok_count++; - } - } - return ok_count; -} - -static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); - if (WARN_ON(egdev_cb)) - return -EEXIST; - egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); - if (!egdev_cb) - return -ENOMEM; - egdev_cb->cb = cb; - egdev_cb->cb_priv = cb_priv; - list_add(&egdev_cb->list, &egdev->cb_list); - return 0; -} - -static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); - if (WARN_ON(!egdev_cb)) - return; - list_del(&egdev_cb->list); - kfree(egdev_cb); -} - -static int __tc_setup_cb_egdev_register(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); - int err; - - if (!egdev) - return -ENOMEM; - err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); - if (err) - goto err_cb_add; - return 0; - -err_cb_add: - tcf_action_egdev_put(egdev); - return err; -} -int tc_setup_cb_egdev_register(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - int err; - - rtnl_lock(); - err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); - rtnl_unlock(); - return err; -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); - -static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); - - if (WARN_ON(!egdev)) - return; - tcf_action_egdev_cb_del(egdev, cb, cb_priv); - tcf_action_egdev_put(egdev); -} -void tc_setup_cb_egdev_unregister(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - rtnl_lock(); - __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); - -int tc_setup_cb_egdev_call(const struct net_device *dev, - enum tc_setup_type type, void *type_data, - bool err_stop) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); - - if (!egdev) - return 0; - return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); - -static __net_init int tcf_action_net_init(struct net *net) -{ - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); -} - -static void __net_exit tcf_action_net_exit(struct net *net) -{ - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - rhashtable_destroy(&tan->egdev_ht); -} - -static struct pernet_operations tcf_action_net_ops = { - .init = tcf_action_net_init, - .exit = tcf_action_net_exit, - .id = &tcf_action_net_id, - .size = sizeof(struct tcf_action_net), -}; - static int __init tc_action_init(void) { - int err; - - err = register_pernet_subsys(&tcf_action_net_ops); - if (err) - return err; - rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); 
rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1dae5f2b358f..c8cf4d10c435 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -258,7 +258,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; - + if (skb2->tc_from_ingress) + skb2->tstamp = 0; /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index da3dd0f68cc2..2b372a06b432 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -201,7 +201,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_release; } } else { - return err; + ret = err; + goto out_free; } p = to_pedit(*a); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 052855d47354..ec8ec55e0fe8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -27,10 +27,7 @@ struct tcf_police_params { u32 tcfp_ewma_rate; s64 tcfp_burst; u32 tcfp_mtu; - s64 tcfp_toks; - s64 tcfp_ptoks; s64 tcfp_mtu_ptoks; - s64 tcfp_t_c; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; @@ -41,6 +38,11 @@ struct tcf_police_params { struct tcf_police { struct tc_action common; struct tcf_police_params __rcu *params; + + spinlock_t tcfp_lock ____cacheline_aligned_in_smp; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_t_c; }; #define to_police(pc) ((struct tcf_police *)pc) @@ -83,7 +85,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { - int ret = 0, err; + int ret = 0, tcfp_result = TC_ACT_OK, err, size; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; @@ -91,7 +93,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_police_params *new; bool exists = false; - int size; if (nla == NULL) return -EINVAL; @@ -122,6 +123,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; + spin_lock_init(&(to_police(*a)->tcfp_lock)); } else if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; @@ -157,6 +159,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } + if (tb[TCA_POLICE_RESULT]) { + tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + err = -EINVAL; + goto failure; + } + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -164,6 +176,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } /* No failure allowed after this point */ + new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; @@ -186,28 +199,20 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); - new->tcfp_toks = new->tcfp_burst; - if (new->peak_present) { + if (new->peak_present) new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, new->tcfp_mtu); - new->tcfp_ptoks = new->tcfp_mtu_ptoks; - } if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); - if (tb[TCA_POLICE_RESULT]) { - new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - if 
(TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) { - NL_SET_ERR_MSG(extack, - "goto chain not allowed on fallback"); - err = -EINVAL; - goto failure; - } - } - spin_lock_bh(&police->tcf_lock); - new->tcfp_t_c = ktime_get_ns(); + spin_lock_bh(&police->tcfp_lock); + police->tcfp_t_c = ktime_get_ns(); + police->tcfp_toks = new->tcfp_burst; + if (new->peak_present) + police->tcfp_ptoks = new->tcfp_mtu_ptoks; + spin_unlock_bh(&police->tcfp_lock); police->tcf_action = parm->action; rcu_swap_protected(police->params, new, @@ -257,25 +262,28 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } now = ktime_get_ns(); - toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst); + spin_lock_bh(&police->tcfp_lock); + toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst); if (p->peak_present) { - ptoks = toks + p->tcfp_ptoks; + ptoks = toks + police->tcfp_ptoks; if (ptoks > p->tcfp_mtu_ptoks) ptoks = p->tcfp_mtu_ptoks; ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += p->tcfp_toks; + toks += police->tcfp_toks; if (toks > p->tcfp_burst) toks = p->tcfp_burst; toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { - p->tcfp_t_c = now; - p->tcfp_toks = toks; - p->tcfp_ptoks = ptoks; + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; } + spin_unlock_bh(&police->tcfp_lock); } inc_overlimits: diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 4cca8f274662..c3b90fadaff6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -210,9 +210,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + __be64 key_id = 0; int opts_len = 0; - __be64 key_id; - __be16 flags; + __be16 flags = 0; u8 tos, ttl; int ret = 0; int err; @@ -246,15 +246,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, case TCA_TUNNEL_KEY_ACT_RELEASE: break; case TCA_TUNNEL_KEY_ACT_SET: - if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { - NL_SET_ERR_MSG(extack, "Missing tunnel key id"); - ret = -EINVAL; - goto err_out; - } + if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + __be32 key32; - key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); + key_id = key32_to_tunnel_id(key32); + flags = TUNNEL_KEY; + } - flags = TUNNEL_KEY | TUNNEL_CSUM; + flags |= TUNNEL_CSUM; if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) flags &= ~TUNNEL_CSUM; @@ -508,10 +508,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + if (((key->tun_flags & TUNNEL_KEY) && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || - nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || + (key->tp_dst && + nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, + key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, !(key->tun_flags & TUNNEL_CSUM)) || tunnel_key_opts_dump(skb, info)) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ba677d54a7af..93fdaf707313 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* extract existing 
tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f427a1e00e7e..8ce2a0507970 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -25,6 +25,7 @@ #include <linux/kmod.h> #include <linux/slab.h> #include <linux/idr.h> +#include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain) } } +static struct tcf_block *tc_dev_ingress_block(struct net_device *dev) +{ + const struct Qdisc_class_ops *cops; + struct Qdisc *qdisc; + + if (!dev_ingress_queue(dev)) + return NULL; + + qdisc = dev_ingress_queue(dev)->qdisc_sleeping; + if (!qdisc) + return NULL; + + cops = qdisc->ops->cl_ops; + if (!cops) + return NULL; + + if (!cops->tcf_block) + return NULL; + + return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL); +} + +static struct rhashtable indr_setup_block_ht; + +struct tc_indr_block_dev { + struct rhash_head ht_node; + struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; + struct tcf_block *block; +}; + +struct tc_indr_block_cb { + struct list_head list; + void *cb_priv; + tc_indr_block_bind_cb_t *cb; + void *cb_ident; +}; + +static const struct rhashtable_params tc_indr_setup_block_ht_params = { + .key_offset = offsetof(struct tc_indr_block_dev, dev), + .head_offset = offsetof(struct tc_indr_block_dev, ht_node), + .key_len = sizeof(struct net_device *), +}; + +static struct tc_indr_block_dev * +tc_indr_block_dev_lookup(struct net_device *dev) +{ + return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, + tc_indr_setup_block_ht_params); +} + +static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev) +{ + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (indr_dev) + goto inc_ref; + + indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + INIT_LIST_HEAD(&indr_dev->cb_list); + indr_dev->dev = dev; + indr_dev->block = tc_dev_ingress_block(dev); + if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params)) { + kfree(indr_dev); + return NULL; + } + +inc_ref: + indr_dev->refcnt++; + return indr_dev; +} + +static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev) +{ + if (--indr_dev->refcnt) + return; + + rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params); + kfree(indr_dev); +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + if (indr_block_cb->cb == cb && + indr_block_cb->cb_ident == cb_ident) + return indr_block_cb; + return NULL; +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (indr_block_cb) + return ERR_PTR(-EEXIST); + + indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); + if (!indr_block_cb) + return ERR_PTR(-ENOMEM); + + indr_block_cb->cb_priv = cb_priv; + indr_block_cb->cb = cb; + 
indr_block_cb->cb_ident = cb_ident; + list_add(&indr_block_cb->list, &indr_dev->cb_list); + + return indr_block_cb; +} + +static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb) +{ + list_del(&indr_block_cb->list); + kfree(indr_block_cb); +} + +static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, + struct tc_indr_block_cb *indr_block_cb, + enum tc_block_command command) +{ + struct tc_block_offload bo = { + .command = command, + .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + .block = indr_dev->block, + }; + + if (!indr_dev->block) + return; + + indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + +int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + int err; + + indr_dev = tc_indr_block_dev_get(dev); + if (!indr_dev) + return -ENOMEM; + + indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); + err = PTR_ERR_OR_ZERO(indr_block_cb); + if (err) + goto err_dev_put; + + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND); + return 0; + +err_dev_put: + tc_indr_block_dev_put(indr_dev); + return err; +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register); + +int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + int err; + + rtnl_lock(); + err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident); + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_register); + +void __tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (!indr_block_cb) + return; + + /* Send unbind message if required to free any block cbs. */ + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND); + tc_indr_block_cb_del(indr_block_cb); + tc_indr_block_dev_put(indr_dev); +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister); + +void tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + rtnl_lock(); + __tc_indr_block_cb_unregister(dev, cb, cb_ident); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister); + +static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, + struct tcf_block_ext_info *ei, + enum tc_block_command command, + struct netlink_ext_ack *extack) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + struct tc_block_offload bo = { + .command = command, + .binder_type = ei->binder_type, + .block = block, + .extack = extack, + }; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_dev->block = command == TC_BLOCK_BIND ? 
block : NULL; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; - return err; + if (err) + return err; + + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); + return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) return -EOPNOTSUPP; block->nooffloaddevcnt++; + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); return 0; } @@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; + tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL); + if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); @@ -1023,29 +1270,6 @@ void tcf_block_cb_unregister(struct tcf_block *block, } EXPORT_SYMBOL(tcf_block_cb_unregister); -static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, - void *type_data, bool err_stop) -{ - struct tcf_block_cb *block_cb; - int ok_count = 0; - int err; - - /* Make sure all netdevs sharing this block are offload-capable. */ - if (block->nooffloaddevcnt && err_stop) - return -EOPNOTSUPP; - - list_for_each_entry(block_cb, &block->cb_list, list) { - err = block_cb->cb(type, type_data, block_cb->cb_priv); - if (err) { - if (err_stop) - return err; - } else { - ok_count++; - } - } - return ok_count; -} - /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. @@ -2268,54 +2492,26 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, - enum tc_setup_type type, - void *type_data, bool err_stop) +int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) { + struct tcf_block_cb *block_cb; int ok_count = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tc_action *a; - struct net_device *dev; - int i, ret; + int err; - if (!tcf_exts_has_actions(exts)) - return 0; + /* Make sure all netdevs sharing this block are offload-capable. 
*/ + if (block->nooffloaddevcnt && err_stop) + return -EOPNOTSUPP; - for (i = 0; i < exts->nr_actions; i++) { - a = exts->actions[i]; - if (!a->ops->get_dev) - continue; - dev = a->ops->get_dev(a); - if (!dev) - continue; - ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); - a->ops->put_dev(dev); - if (ret < 0) - return ret; - ok_count += ret; + list_for_each_entry(block_cb, &block->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } } -#endif - return ok_count; -} - -int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, - enum tc_setup_type type, void *type_data, bool err_stop) -{ - int ok_count; - int ret; - - ret = tcf_block_cb_call(block, type, type_data, err_stop); - if (ret < 0) - return ret; - ok_count = ret; - - if (!exts || ok_count) - return ok_count; - ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); - if (ret < 0) - return ret; - ok_count += ret; - return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); @@ -2355,6 +2551,11 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; + err = rhashtable_init(&indr_setup_block_ht, + &tc_indr_setup_block_ht_params); + if (err) + goto err_rhash_setup_block_ht; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, @@ -2366,6 +2567,8 @@ static int __init tc_filter_init(void) return 0; +err_rhash_setup_block_ht: + unregister_pernet_subsys(&tcf_net_ops); err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fa6fe2fe0f32..a95cb240a606 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -169,7 +169,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, if (oldprog) tcf_block_offload_dec(block, &oldprog->gen_flags); - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); if (prog) { if (err < 0) { cls_bpf_offload_cmd(tp, oldprog, prog, extack); @@ -234,7 +234,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp, cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false); + tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false); } static int cls_bpf_init(struct tcf_proto *tp) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9aada2d0ef06..1eb2e2c31dd5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -55,6 +55,8 @@ struct fl_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; struct flow_dissector_key_enc_opts enc_opts; + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -65,6 +67,7 @@ struct fl_flow_mask_range { struct fl_flow_mask { struct fl_flow_key key; struct fl_flow_mask_range range; + u32 flags; struct rhash_head ht_node; struct rhashtable ht; struct rhashtable_params filter_ht_params; @@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey) +static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.dst); + max_mask = htons(filter->mask->key.tp_max.dst); + min_val = htons(filter->key.tp_min.dst); + max_val = htons(filter->key.tp_max.dst); + + if (min_mask && max_mask) { + if (htons(key->tp.dst) < min_val || + htons(key->tp.dst) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.dst = filter->mkey.tp_min.dst; + mkey->tp_max.dst = filter->mkey.tp_max.dst; + } + return true; +} + +static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.src); + max_mask = htons(filter->mask->key.tp_max.src); + min_val = htons(filter->key.tp_min.src); + max_val = htons(filter->key.tp_max.src); + + if (min_mask && max_mask) { + if (htons(key->tp.src) < min_val || + htons(key->tp.src) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.src = filter->mkey.tp_min.src; + mkey->tp_max.src = filter->mkey.tp_max.src; + } + return true; +} + +static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey) { return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask), mask->filter_ht_params); } +static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + struct cls_fl_filter *filter, *f; + + list_for_each_entry_rcu(filter, &mask->filters, list) { + if (!fl_range_port_dst_cmp(filter, key, mkey)) + continue; + + if (!fl_range_port_src_cmp(filter, key, mkey)) + continue; + + f = __fl_lookup(mask, mkey); + if (f) + return f; + } + return NULL; +} + +static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) + return fl_lookup_range(mask, mkey, key); + + return __fl_lookup(mask, mkey); +} + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, fl_set_masked_key(&skb_mkey, &skb_key, mask); - f = fl_lookup(mask, &skb_mkey); + f = fl_lookup(mask, &skb_mkey, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -289,8 +368,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); tcf_block_offload_dec(block, &f->flags); } @@ -312,8 +390,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.exts = &f->exts; 
cls_flower.classid = f->res.classid; - err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { fl_hw_destroy_filter(tp, f, NULL); return err; @@ -339,8 +416,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; - tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, @@ -514,6 +590,31 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + fl_set_key_val(tb, &key->tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst)); + fl_set_key_val(tb, &key->tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst)); + fl_set_key_val(tb, &key->tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src)); + fl_set_key_val(tb, &key->tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src)); + + if ((mask->tp_min.dst && mask->tp_max.dst && + htons(key->tp_max.dst) <= htons(key->tp_min.dst)) || + (mask->tp_min.src && mask->tp_max.src && + htons(key->tp_max.src) <= htons(key->tp_min.src))) + return -EINVAL; + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask) @@ -709,11 +810,23 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct netlink_ext_ack *extack) { const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; - int option_len, key_depth, msk_depth = 0; + int err, option_len, key_depth, msk_depth = 0; + + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); } @@ -909,6 +1022,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->arp.tha)); } + if (key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) { + ret = fl_set_key_port_range(tb, key, mask); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1026,8 +1147,9 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + if (FL_KEY_IS_MASKED(mask, tp) || + FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max)) + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1074,6 
+1196,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, fl_mask_copy(newmask, mask); + if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) || + (newmask->key.tp_min.src && newmask->key.tp_max.src)) + newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE; + err = fl_init_mask_hashtable(newmask); if (err) goto errout_free; @@ -1226,18 +1352,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - if (!tc_skip_sw(fnew->flags)) { - if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { - err = -EEXIST; - goto errout_mask; - } - - err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - if (err) - goto errout_mask; + if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) { + err = -EEXIST; + goto errout_mask; } + err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); + if (err) + goto errout_mask; + if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, fnew, extack); if (err) @@ -1291,9 +1415,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; - if (!tc_skip_sw(f->flags)) - rhashtable_remove_fast(&f->mask->ht, &f->ht_node, - f->mask->filter_ht_params); + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); __fl_delete(tp, f, extack); *last = list_empty(&head->masks); return 0; @@ -1376,8 +1499,7 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain, /* We don't care if driver (any of them) fails to handle this * call. It serves just as a hint for it. */ - tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static void fl_hw_destroy_tmplt(struct tcf_chain *chain, @@ -1390,8 +1512,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain, cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; cls_flower.cookie = (unsigned long) tmplt; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, @@ -1464,6 +1585,26 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, + &mask->tp_min.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.dst)) || + fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX, + &mask->tp_max.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.dst)) || + fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN, + &mask->tp_min.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.src)) || + fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX, + &mask->tp_max.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.src))) + return -1; + + return 0; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -1800,6 +1941,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->arp.tha)))) goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) && + fl_dump_key_port_range(skb, key, mask)) + goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && 
(fl_dump_key_val(skb, &key->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 856fa79d4ffd..0e408ee9dcec 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -71,7 +71,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); + tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false); tcf_block_offload_dec(block, &head->flags); } @@ -90,8 +90,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.exts = &head->exts; cls_mall.cookie = cookie; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, - &cls_mall, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); if (err < 0) { mall_destroy_hw_filter(tp, head, cookie, NULL); return err; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4b28fd44576d..dcea21004604 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); + tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, @@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_clear_hw_hnode(tp, h, NULL); return err; @@ -533,7 +533,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); + tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); tcf_block_offload_dec(block, &n->flags); } @@ -558,11 +558,12 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_remove_hw_knode(tp, n, NULL); return err; @@ -1206,6 +1207,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f55bc50cd0a9..187a57e7d601 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -335,7 +335,6 @@ out: static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; - struct Qdisc *leaf; const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) @@ -344,8 +343,7 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) if (cl == 0) return NULL; - leaf = cops->leaf(p, cl); - return leaf; + return cops->leaf(p, cl); } /* Find queueing discipline by name */ @@ -860,6 +858,21 @@ void qdisc_offload_graft_helper(struct 
net_device *dev, struct Qdisc *sch, } EXPORT_SYMBOL(qdisc_offload_graft_helper); +static void qdisc_offload_graft_root(struct net_device *dev, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_root_qopt_offload graft_offload = { + .command = TC_ROOT_GRAFT, + .handle = new ? new->handle : 0, + .ingress = (new && new->flags & TCQ_F_INGRESS) || + (old && old->flags & TCQ_F_INGRESS), + }; + + qdisc_offload_graft_helper(dev, NULL, new, old, + TC_SETUP_ROOT_QDISC, &graft_offload, extack); +} + static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { @@ -1026,6 +1039,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); + qdisc_offload_graft_root(dev, new, old, extack); + if (new && new->ops->attach) goto skip; diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 1538d6fa8165..1150f22983df 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -30,7 +30,7 @@ struct etf_sched_data { int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ - struct rb_root head; + struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; @@ -104,7 +104,7 @@ static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; - p = rb_first(&q->head); + p = rb_first_cached(&q->head); if (!p) return NULL; @@ -117,8 +117,10 @@ static void reset_watchdog(struct Qdisc *sch) struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; - if (!skb) + if (!skb) { + qdisc_watchdog_cancel(&q->watchdog); return; + } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); @@ -154,8 +156,9 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node **p = &q->head.rb_node, *parent = NULL; + struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; + bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, @@ -168,13 +171,15 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) + if (ktime_after(txtime, skb->tstamp)) { p = &parent->rb_right; - else + leftmost = false; + } else { p = &parent->rb_left; + } } rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->head); + rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; @@ -185,12 +190,42 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } -static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, - bool drop) +static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, + ktime_t now) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *to_free = NULL; + struct sk_buff *tmp = NULL; + + skb_rbtree_walk_from_safe(skb, tmp) { + if (ktime_after(skb->tstamp, now)) + break; + + rb_erase_cached(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. 
+ */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_drop(skb, sch, &to_free); + qdisc_qstats_overlimit(sch); + sch->q.qlen--; + } + + kfree_skb_list(to_free); +} + +static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. @@ -201,19 +236,9 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, qdisc_qstats_backlog_dec(sch, skb); - if (drop) { - struct sk_buff *to_free = NULL; + qdisc_bstats_update(sch, skb); - report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); - - qdisc_drop(skb, sch, &to_free); - kfree_skb_list(to_free); - qdisc_qstats_overlimit(sch); - } else { - qdisc_bstats_update(sch, skb); - - q->last = skb->tstamp; - } + q->last = skb->tstamp; sch->q.qlen--; } @@ -232,7 +257,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { - timesortedlist_erase(sch, skb, true); + timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } @@ -241,7 +266,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) * txtime from deadline to (now + delta). */ if (q->deadline_mode) { - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } @@ -250,7 +275,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Dequeue only if now is within the [txtime - delta, txtime] range. 
*/ if (ktime_after(now, next)) - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); else skb = NULL; @@ -386,14 +411,14 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node *p = rb_first(&q->head); + struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4b1af706896c..1a662f2bb7bb 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_refill_delay; u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ + u64 ce_threshold; u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; @@ -107,6 +108,7 @@ struct fq_sched_data { u64 stat_gc_flows; u64 stat_internal_packets; u64 stat_throttled; + u64 stat_ce_mark; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -412,16 +414,21 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; unsigned long rate; u32 plen; + u64 now; + + if (!sch->q.qlen) + return NULL; skb = fq_dequeue_head(sch, &q->internal); if (skb) goto out; + + now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -454,6 +461,11 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + if (time_next_packet && + (s64)(now - time_next_packet - q->ce_threshold) > 0) { + INET_ECN_set_ce(skb); + q->stat_ce_mark++; + } } skb = fq_dequeue_head(sch, f); @@ -469,22 +481,29 @@ begin: goto begin; } prefetch(&skb->end); - f->credit -= qdisc_pkt_len(skb); + plen = qdisc_pkt_len(skb); + f->credit -= plen; - if (ktime_to_ns(skb->tstamp) || !q->rate_enable) + if (!q->rate_enable) goto out; rate = q->flow_max_rate; - if (skb->sk) - rate = min(skb->sk->sk_pacing_rate, rate); - - if (rate <= q->low_rate_threshold) { - f->credit = 0; - plen = qdisc_pkt_len(skb); - } else { - plen = max(qdisc_pkt_len(skb), q->quantum); - if (f->credit > 0) - goto out; + + /* If EDT time was provided for this skb, we need to + * update f->time_next_packet only if this qdisc enforces + * a flow max rate. 
+ */ + if (!skb->tstamp) { + if (skb->sk) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate <= q->low_rate_threshold) { + f->credit = 0; + } else { + plen = max(plen, q->quantum); + if (f->credit > 0) + goto out; + } } if (rate != ~0UL) { u64 len = (u64)plen * NSEC_PER_SEC; @@ -650,6 +669,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -729,6 +749,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + if (tb[TCA_FQ_CE_THRESHOLD]) + q->ce_threshold = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -779,6 +803,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + + /* Default ce_threshold of 4294 seconds */ + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) @@ -792,6 +820,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); + u64 ce_threshold = q->ce_threshold; struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); @@ -800,6 +829,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + do_div(ce_threshold, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || @@ -812,6 +843,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; @@ -841,6 +873,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.throttled_flows = q->throttled_flows; st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); + st.ce_mark = q->stat_ce_mark; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 4a042abf844c..234afbf9115b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -23,19 +23,23 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/red.h> #define GRED_DEF_PRIO (MAX_DPs / 2) #define GRED_VQ_MASK (MAX_DPs - 1) +#define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP) + struct gred_sched_data; struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop parameters */ - u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 red_flags; /* virtualQ version of red_flags */ + u64 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ @@ -139,14 +143,27 @@ static inline void gred_store_wred_set(struct gred_sched *table, table->wred_set.qidlestart 
= q->vars.qidlestart; } -static inline int gred_use_ecn(struct gred_sched *t) +static int gred_use_ecn(struct gred_sched_data *q) +{ + return q->red_flags & TC_RED_ECN; +} + +static int gred_use_harddrop(struct gred_sched_data *q) { - return t->red_flags & TC_RED_ECN; + return q->red_flags & TC_RED_HARDDROP; } -static inline int gred_use_harddrop(struct gred_sched *t) +static bool gred_per_vq_red_flags_used(struct gred_sched *table) { - return t->red_flags & TC_RED_HARDDROP; + unsigned int i; + + /* Local per-vq flags couldn't have been set unless global are 0 */ + if (table->red_flags) + return false; + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i] && table->tab[i]->red_flags) + return true; + return false; } static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -212,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } @@ -222,7 +239,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_HARD_MARK: qdisc_qstats_overlimit(sch); - if (gred_use_harddrop(t) || !gred_use_ecn(t) || + if (gred_use_harddrop(q) || !gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; @@ -295,15 +312,103 @@ static void gred_reset(struct Qdisc *sch) } } +static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) +{ + struct gred_sched *table = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_gred_qopt_offload opt = { + .command = command, + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + if (command == TC_GRED_REPLACE) { + unsigned int i; + + opt.set.grio_on = gred_rio_mode(table); + opt.set.wred_on = gred_wred_mode(table); + opt.set.dp_cnt = table->DPs; + opt.set.dp_def = table->def; + + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + if (!q) + continue; + opt.set.tab[i].present = true; + opt.set.tab[i].limit = q->limit; + opt.set.tab[i].prio = q->prio; + opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt.set.tab[i].is_ecn = gred_use_ecn(q); + opt.set.tab[i].is_harddrop = gred_use_harddrop(q); + opt.set.tab[i].probability = q->parms.max_P; + opt.set.tab[i].backlog = &q->backlog; + } + opt.set.qstats = &sch->qstats; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); +} + +static int gred_offload_dump_stats(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt_offload *hw_stats; + unsigned int i; + int ret; + + hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL); + if (!hw_stats) + return -ENOMEM; + + hw_stats->command = TC_GRED_STATS; + hw_stats->handle = sch->handle; + hw_stats->parent = sch->parent; + + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i]) + hw_stats->stats.xstats[i] = &table->tab[i]->stats; + + ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); + /* Even if driver returns failure adjust the stats - in case offload + * ended but driver still wants to adjust the values. 
+ */ + for (i = 0; i < MAX_DPs; i++) { + if (!table->tab[i]) + continue; + table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; + table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; + + _bstats_update(&sch->bstats, + hw_stats->stats.bstats[i].bytes, + hw_stats->stats.bstats[i].packets); + sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; + sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; + sch->qstats.drops += hw_stats->stats.qstats[i].drops; + sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; + sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; + } + + kfree(hw_stats); + return ret; +} + static inline void gred_destroy_vq(struct gred_sched_data *q) { kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) +static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; + bool red_flags_changed; int i; if (!dps) @@ -311,13 +416,28 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) sopt = nla_data(dps); - if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || - sopt->def_DP >= sopt->DPs) + if (sopt->DPs > MAX_DPs) { + NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high"); + return -EINVAL; + } + if (sopt->DPs == 0) { + NL_SET_ERR_MSG_MOD(extack, + "number of virtual queues can't be 0"); + return -EINVAL; + } + if (sopt->def_DP >= sopt->DPs) { + NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count"); return -EINVAL; + } + if (sopt->flags && gred_per_vq_red_flags_used(table)) { + NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used"); + return -EINVAL; + } sch_tree_lock(sch); table->DPs = sopt->DPs; table->def = sopt->def_DP; + red_flags_changed = table->red_flags != sopt->flags; table->red_flags = sopt->flags; /* @@ -337,6 +457,12 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) gred_disable_wred_mode(table); } + if (red_flags_changed) + for (i = 0; i < table->DPs; i++) + if (table->tab[i]) + table->tab[i]->red_flags = + table->red_flags & GRED_VQ_RED_FLAGS; + for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", @@ -346,25 +472,30 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) } } + gred_offload(sch, TC_GRED_REPLACE); return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, struct tc_gred_qopt *ctl, int prio, u8 *stab, u32 max_P, - struct gred_sched_data **prealloc) + struct gred_sched_data **prealloc, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; + } if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; if (!q) return -ENOMEM; + q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; } q->DP = dp; @@ -384,14 +515,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, return 0; } +static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = { + [TCA_GRED_VQ_DP] = { .type = NLA_U32 }, + [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy 
gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = { + [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED }, +}; + static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, [TCA_GRED_LIMIT] = { .type = NLA_U32 }, + [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED }, }; +static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + u32 dp; + + nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); + + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + + if (tb[TCA_GRED_VQ_FLAGS]) + table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); +} + +static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs) +{ + const struct nlattr *attr; + int rem; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + gred_vq_apply(table, attr); + break; + } + } +} + +static int gred_vq_validate(struct gred_sched *table, u32 cdp, + const struct nlattr *entry, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + int err; + u32 dp; + + err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, + extack); + if (err < 0) + return err; + + if (!tb[TCA_GRED_VQ_DP]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified"); + return -EINVAL; + } + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + if (dp >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds"); + return -EINVAL; + } + if (dp != cdp && !table->tab[dp]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_FLAGS]) { + u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); + + if (table->red_flags && table->red_flags != red_flags) { + NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used"); + return -EINVAL; + } + if (red_flags & ~GRED_VQ_RED_FLAGS) { + NL_SET_ERR_MSG_MOD(extack, + "invalid RED flags specified"); + return -EINVAL; + } + } + + return 0; +} + +static int gred_vqs_validate(struct gred_sched *table, u32 cdp, + struct nlattr *vqs, struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int rem, err; + + err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX, + gred_vqe_policy, extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + err = gred_vq_validate(table, cdp, attr, extack); + if (err) + return err; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes"); + return -EINVAL; + } + } + + if (rem > 0) { + NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list"); + return -EINVAL; + } + + return 0; +} + static int gred_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -406,29 +650,39 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], 
extack); } if (tb[TCA_GRED_PARMS] == NULL || tb[TCA_GRED_STAB] == NULL || - tb[TCA_GRED_LIMIT] != NULL) + tb[TCA_GRED_LIMIT] != NULL) { + NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time"); return -EINVAL; + } max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; - err = -EINVAL; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); - if (ctl->DP >= table->DPs) - goto errout; + if (ctl->DP >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_LIST]) { + err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST], + extack); + if (err) + return err; + } if (gred_rio_mode(table)) { if (ctl->prio == 0) { @@ -448,9 +702,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, + extack); if (err < 0) - goto errout_locked; + goto err_unlock_free; + + if (tb[TCA_GRED_VQ_LIST]) + gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]); if (gred_rio_mode(table)) { gred_disable_wred_mode(table); @@ -458,12 +716,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, gred_enable_wred_mode(table); } - err = 0; + sch_tree_unlock(sch); + kfree(prealloc); + + gred_offload(sch, TC_GRED_REPLACE); + return 0; -errout_locked: +err_unlock_free: sch_tree_unlock(sch); kfree(prealloc); -errout: return err; } @@ -476,12 +737,15 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; - if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) { + NL_SET_ERR_MSG_MOD(extack, + "virtual queue configuration can't be specified at initialization time"); return -EINVAL; + } if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); @@ -489,13 +753,13 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct nlattr *parms, *opts = NULL; + struct nlattr *parms, *vqs, *opts = NULL; int i; u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { @@ -505,6 +769,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; + if (gred_offload_dump_stats(sch)) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -522,6 +789,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) goto nla_put_failure; + /* Old style all-in-one dump of VQs */ parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -572,6 +840,58 @@ append_opt: nla_nest_end(skb, parms); + /* Dump the VQs again, in more structured way */ + vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST); + if (!vqs) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct nlattr *vq; + + if (!q) + continue; + + vq = nla_nest_start(skb, 
TCA_GRED_VQ_ENTRY); + if (!vq) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags)) + goto nla_put_failure; + + /* Stats */ + if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin, + TCA_GRED_VQ_PAD)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG, + gred_backlog(table, q, sch))) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP, + q->stats.prob_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK, + q->stats.prob_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP, + q->stats.forced_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK, + q->stats.forced_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other)) + goto nla_put_failure; + + nla_nest_end(skb, vq); + } + nla_nest_end(skb, vqs); + return nla_nest_end(skb, opts); nla_put_failure: @@ -588,6 +908,7 @@ static void gred_destroy(struct Qdisc *sch) if (table->tab[i]) gred_destroy_vq(table->tab[i]); } + gred_offload(sch, TC_GRED_DESTROY); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 1db5c1bf6ddd..203659bc3906 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -193,6 +193,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct tc_mq_qopt_offload graft_offload; struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) @@ -203,6 +204,14 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); + + graft_offload.handle = sch->handle; + graft_offload.graft_params.queue = cl - 1; + graft_offload.graft_params.child_handle = new ? 
new->handle : 0; + graft_offload.command = TC_MQ_GRAFT; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, + TC_SETUP_QDISC_MQ, &graft_offload, extack); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 57b3ad9394ad..75046ec72144 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,6 +77,10 @@ struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; + /* a linear queue; reduces rbtree rebalancing when jitter is low */ + struct sk_buff *t_head; + struct sk_buff *t_tail; + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch) rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } + + rtnl_kfree_skbs(q->t_head, q->t_tail); + q->t_head = NULL; + q->t_tail = NULL; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; - struct rb_node **p = &q->t_root.rb_node, *parent = NULL; - while (*p) { - struct sk_buff *skb; - - parent = *p; - skb = rb_to_skb(parent); - if (tnext >= netem_skb_cb(skb)->time_to_send) - p = &parent->rb_right; + if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { + if (q->t_tail) + q->t_tail->next = nskb; else - p = &parent->rb_left; + q->t_head = nskb; + q->t_tail = nskb; + } else { + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->t_root); } - rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->t_root); sch->q.qlen++; } @@ -431,6 +448,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int count = 1; int rc = NET_XMIT_SUCCESS; + /* Do not fool qdisc_drop_all() */ + skb->prev = NULL; + /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; @@ -530,9 +550,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || - t_last->time_to_send > last->time_to_send) { + t_last->time_to_send > last->time_to_send) + last = t_last; + } + if (q->t_tail) { + struct netem_skb_cb *t_last = + netem_skb_cb(q->t_tail); + + if (!last || + t_last->time_to_send > last->time_to_send) last = t_last; - } } if (last) { @@ -611,11 +638,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) q->slot.bytes_left = q->slot_config.max_bytes; } +static struct sk_buff *netem_peek(struct netem_sched_data *q) +{ + struct sk_buff *skb = skb_rb_first(&q->t_root); + u64 t1, t2; + + if (!skb) + return q->t_head; + if (!q->t_head) + return skb; + + t1 = netem_skb_cb(skb)->time_to_send; + t2 = netem_skb_cb(q->t_head)->time_to_send; + if (t1 < t2) + return skb; + return q->t_head; +} + +static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) +{ + if (skb == q->t_head) { + q->t_head = skb->next; + if (!q->t_head) + q->t_tail = NULL; + } else { + rb_erase(&skb->rbnode, &q->t_root); + } +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - struct rb_node *p; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); @@ -625,20 +679,18 @@ 
deliver: qdisc_bstats_update(sch, skb); return skb; } - p = rb_first(&q->t_root); - if (p) { + skb = netem_peek(q); + if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); - skb = rb_to_skb(p); - /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); - if (time_to_send <= now && q->slot.slot_next <= now) { - rb_erase(p, &q->t_root); + if (time_to_send <= now && q->slot.slot_next <= now) { + netem_erase_head(q, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; @@ -648,15 +700,6 @@ deliver: */ skb->dev = qdisc_dev(sch); -#ifdef CONFIG_NET_CLS_ACT - /* - * If it's at ingress let's pretend the delay is - * from the network (tstamp will be updated). - */ - if (skb->tc_redirected && skb->tc_from_ingress) - skb->tstamp = 0; -#endif - if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7682f7a618a1..9df9942340ea 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -166,7 +166,9 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.min = q->parms.qth_min >> q->parms.Wlog; opt.set.max = q->parms.qth_max >> q->parms.Wlog; opt.set.probability = q->parms.max_P; + opt.set.limit = q->limit; opt.set.is_ecn = red_use_ecn(q); + opt.set.is_harddrop = red_use_harddrop(q); opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; @@ -366,6 +368,21 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl, return 0; } +static void red_graft_offload(struct Qdisc *sch, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_red_qopt_offload graft_offload = { + .handle = sch->handle, + .parent = sch->parent, + .child_handle = new->handle, + .command = TC_RED_GRAFT, + }; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, + TC_SETUP_QDISC_RED, &graft_offload, extack); +} + static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { @@ -375,6 +392,8 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); + + red_graft_offload(sch, new, *old, extack); return 0; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 6a28b96e779e..201c888604e4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init( asoc->flowlabel = sp->flowlabel; asoc->dscp = sp->dscp; - /* Initialize default path MTU. */ - asoc->pathmtu = sp->pathmtu; - /* Set association default SACK delay */ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); asoc->sackfreq = sp->sackfreq; @@ -135,6 +132,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_burst = sp->max_burst; + asoc->subscribe = sp->subscribe; + /* initialize association timers */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; @@ -252,6 +251,10 @@ static struct sctp_association *sctp_association_init( 0, gfp)) goto fail_init; + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + sctp_assoc_update_frag_point(asoc); + /* Assume that peer would support both address types unless we are * told otherwise. 
*/ @@ -434,7 +437,7 @@ static void sctp_association_destroy(struct sctp_association *asoc) WARN_ON(atomic_read(&asoc->rmem_alloc)); - kfree(asoc); + kfree_rcu(asoc, rcu); SCTP_DBG_OBJCNT_DEC(assoc); } diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 7df3704982f5..ebf28adba789 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp, return match; } +int sctp_bind_addrs_check(struct sctp_sock *sp, + struct sctp_sock *sp2, int cnt2) +{ + struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; + struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; + struct sctp_sockaddr_entry *laddr, *laddr2; + bool exist = false; + int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { + if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && + laddr->valid && laddr2->valid) { + exist = true; + goto next; + } + } + cnt = 0; + break; +next: + cnt++; + } + rcu_read_unlock(); + + return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); +} + /* Does the address 'addr' conflict with any addresses in * the bp. */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index ce8087846f05..64bef313d436 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg) /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { + struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; - struct sctp_sock *sp; struct sctp_ulpevent *ev; - struct sctp_association *asoc = NULL; int error = 0, notify; /* If we failed, we may need to notify. */ @@ -108,9 +107,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) else error = asoc->outqueue.error; - sp = sctp_sk(asoc->base.sk); - notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED, - &sp->subscribe); + notify = sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED); } /* Generate a SEND FAILED event only if enabled. */ @@ -191,6 +189,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * the packet */ max_data = asoc->frag_point; + if (unlikely(!max_data)) { + max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), + sctp_datachk_len(&asoc->stream)); + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + __func__, asoc, max_data); + } /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with diff --git a/net/sctp/input.c b/net/sctp/input.c index 7ab08a5b36dc..d7a649d240e5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -57,6 +57,7 @@ #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> +#include <net/sock_reuseport.h> /* Forward declarations for internal helpers. 
*/ static int sctp_rcv_ootb(struct sk_buff *); @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp); -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr); +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *daddr); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb) asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport); if (!asoc) - ep = __sctp_rcv_lookup_endpoint(net, &dest); + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; @@ -721,43 +724,87 @@ discard: } /* Insert endpoint into the hash table. */ -static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); - struct sctp_ep_common *epb; + struct sock *sk = ep->base.sk; + struct net *net = sock_net(sk); struct sctp_hashbucket *head; + struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (sk->sk_reuseport) { + bool any = sctp_is_ep_boundall(sk); + struct sctp_ep_common *epb2; + struct list_head *list; + int cnt = 0, err = 1; + + list_for_each(list, &ep->base.bind_addr.address_list) + cnt++; + + sctp_for_each_hentry(epb2, &head->chain) { + struct sock *sk2 = epb2->sk; + + if (!net_eq(sock_net(sk2), net) || sk2 == sk || + !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !sk2->sk_reuseport) + continue; + + err = sctp_bind_addrs_check(sctp_sk(sk2), + sctp_sk(sk), cnt); + if (!err) { + err = reuseport_add_sock(sk, sk2, any); + if (err) + return err; + break; + } else if (err < 0) { + return err; + } + } + + if (err) { + err = reuseport_alloc(sk, any); + if (err) + return err; + } + } + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); write_unlock(&head->lock); + return 0; } /* Add an endpoint to the hash. Local BH-safe. */ -void sctp_hash_endpoint(struct sctp_endpoint *ep) +int sctp_hash_endpoint(struct sctp_endpoint *ep) { + int err; + local_bh_disable(); - __sctp_hash_endpoint(ep); + err = __sctp_hash_endpoint(ep); local_bh_enable(); + + return err; } /* Remove endpoint from the hash table. 
*/ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); + struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); + epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + write_lock(&head->lock); hlist_del_init(&epb->node); write_unlock(&head->lock); @@ -771,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep) local_bh_enable(); } +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, + const union sctp_addr *paddr, __u32 seed) +{ + __u32 addr; + + if (paddr->sa.sa_family == AF_INET6) + addr = jhash(&paddr->v6.sin6_addr, 16, seed); + else + addr = (__force __u32)paddr->v4.sin_addr.s_addr; + + return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | + (__force __u32)lport, net_hash_mix(net), seed); +} + /* Look up an endpoint. */ -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr) +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *paddr) { struct sctp_hashbucket *head; struct sctp_ep_common *epb; struct sctp_endpoint *ep; + struct sock *sk; + __be16 lport; int hash; - hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port)); + lport = laddr->v4.sin_port; + hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(epb, &head->chain) { @@ -792,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: + sk = ep->base.sk; + if (sk->sk_reuseport) { + __u32 phash = sctp_hashfn(net, lport, paddr, 0); + + sk = reuseport_select_sock(sk, phash, skb, + sizeof(struct sctphdr)); + if (sk) + ep = sctp_sk(sk)->ep; + } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; @@ -830,35 +905,17 @@ out: static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; - const union sctp_addr *paddr = &t->ipaddr; - const struct net *net = sock_net(t->asoc->base.sk); - __be16 lport = htons(t->asoc->base.bind_addr.port); - __u32 addr; - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(sock_net(t->asoc->base.sk), + htons(t->asoc->base.bind_addr.port), + &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; - const union sctp_addr *paddr = x->paddr; - const struct net *net = x->net; - __be16 lport = x->lport; - __u32 addr; - - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { diff --git a/net/sctp/output.c b/net/sctp/output.c index 67939ad99c01..025f48e14a91 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ 
-118,6 +118,9 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); + } else if (!sctp_transport_pmtu_check(tp)) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { @@ -396,25 +399,6 @@ finish: return retval; } -static void sctp_packet_release_owner(struct sk_buff *skb) -{ - sk_free(skb->sk); -} - -static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sctp_packet_release_owner; - - /* - * The data chunks have already been accounted for in sctp_sendmsg(), - * therefore only reserve a single byte to keep socket around until - * the packet has been transmitted. - */ - refcount_inc(&sk->sk_wmem_alloc); -} - static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) @@ -426,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; + refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } @@ -601,7 +586,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); - sctp_packet_set_owner_w(head, sk); + skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index c0817f7a8964..a8c4c33377bc 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -53,7 +53,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ - enum sctp_event event_type; union sctp_subtype subtype; \ + enum sctp_event_type event_type; union sctp_subtype subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4a4fd1971255..f4ac6c592e13 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, asoc->c.sinit_max_instreams, gfp)) goto clean_up; + /* Update frag_point when stream_interleave may get changed. */ + sctp_assoc_update_frag_point(asoc); + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 85d393090238..1d143bc3f73d 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -52,7 +52,7 @@ #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. 
*/ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, - enum sctp_event event_type, + enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) @@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * If you want to understand all of lksctp, this is a * good place to start. */ -int sctp_do_sm(struct net *net, enum sctp_event event_type, +int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) @@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -1285,7 +1285,7 @@ bail: ********************************************************************/ /* This is the side-effect interpreter. */ -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 691d9dc620e3..d239b94aa48c 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -79,7 +79,7 @@ static const struct sctp_sm_table_entry bug = { const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, - enum sctp_event event_type, + enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 739f3e50120d..f93c3cf9e567 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2230,7 +2230,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ - if (sp->subscribe.sctp_data_io_event) + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT)) sctp_ulpevent_read_sndrcvinfo(event, msg); err = copied; @@ -2304,22 +2304,33 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk, static int sctp_setsockopt_events(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_ulpevent *event; + int i; if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; - if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) + + if (copy_from_user(&subscribe, optval, optlen)) return -EFAULT; + for (i = 0; i < optlen; i++) + sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, + sn_type[i]); + + list_for_each_entry(asoc, &sp->ep->asocs, asocs) + asoc->subscribe = sctp_sk(sk)->subscribe; + /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. 
*/ - if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, - &sctp_sk(sk)->subscribe)) { - asoc = sctp_id2assoc(sk, 0); + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) { + struct sctp_ulpevent *event; + asoc = sctp_id2assoc(sk, 0); if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); @@ -3324,8 +3335,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, - datasize); + min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) @@ -3940,32 +3950,16 @@ static int sctp_setsockopt_pr_supported(struct sock *sk, unsigned int optlen) { struct sctp_assoc_value params; - struct sctp_association *asoc; - int retval = -EINVAL; if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (asoc) { - asoc->prsctp_enable = !!params.assoc_value; - } else if (!params.assoc_id) { - struct sctp_sock *sp = sctp_sk(sk); + return -EINVAL; - sp->ep->prsctp_enable = !!params.assoc_value; - } else { - goto out; - } + if (copy_from_user(&params, optval, optlen)) + return -EFAULT; - retval = 0; + sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value; -out: - return retval; + return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, @@ -4277,6 +4271,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, return 0; } +static int sctp_setsockopt_event(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_ulpevent *event; + struct sctp_event param; + int retval = 0; + + if (optlen < sizeof(param)) { + retval = -EINVAL; + goto out; + } + + optlen = sizeof(param); + if (copy_from_user(&param, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) { + retval = -EINVAL; + goto out; + } + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + if (!asoc) { + sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe, + param.se_type, param.se_on); + goto out; + } + + sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on); + + if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) { + if (sctp_outq_is_empty(&asoc->outqueue)) { + event = sctp_ulpevent_make_sender_dry_event(asoc, + GFP_USER | __GFP_NOWARN); + if (!event) { + retval = -ENOMEM; + goto out; + } + + asoc->stream.si->enqueue_event(&asoc->ulpq, event); + } + } + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4474,6 +4519,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_setsockopt_reuse_port(sk, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_setsockopt_event(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4722,7 +4770,7 @@ static int sctp_init_sock(struct sock *sk) /* Initialize default event subscriptions. By default, all the * options are off. */ - memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe)); + sp->subscribe = 0; /* Default Peer Address Parameters. 
These defaults can * be modified via SCTP_PEER_ADDR_PARAMS @@ -5267,14 +5315,24 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + int i; + if (len == 0) return -EINVAL; if (len > sizeof(struct sctp_event_subscribe)) len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) + + for (i = 0; i < len; i++) + sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe, + SCTP_SN_TYPE_BASE + i); + + if (copy_to_user(optval, &subscribe, len)) return -EFAULT; + return 0; } @@ -7409,6 +7467,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, + int __user *optlen) +{ + struct sctp_association *asoc; + struct sctp_event param; + __u16 subscribe; + + if (len < sizeof(param)) + return -EINVAL; + + len = sizeof(param); + if (copy_from_user(&param, optval, len)) + return -EFAULT; + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) + return -EINVAL; + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe; + param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type); + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &param, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7607,6 +7696,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_getsockopt_event(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7644,8 +7736,10 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { - bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); + struct sctp_sock *sp = sctp_sk(sk); + bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ + kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; int ret; @@ -7721,7 +7815,10 @@ pp_found: pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) + if ((pp->fastreuse && reuse && + sk->sk_state != SCTP_SS_LISTENING) || + (pp->fastreuseport && sk->sk_reuseport && + uid_eq(pp->fastuid, uid))) goto success; /* Run through the list of sockets bound to the port @@ -7735,16 +7832,18 @@ * that this socket is bound to, and find the first socket * in an endpoint.
*/ sk_for_each_bound(sk2, &pp->owner) { - struct sctp_endpoint *ep2; - ep2 = sctp_sk(sk2)->ep; + struct sctp_sock *sp2 = sctp_sk(sk2); + struct sctp_endpoint *ep2 = sp2->ep; if (sk == sk2 || - (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && - sk2->sk_state != SCTP_SS_LISTENING)) + (reuse && (sk2->sk_reuse || sp2->reuse) && + sk2->sk_state != SCTP_SS_LISTENING) || + (sk->sk_reuseport && sk2->sk_reuseport && + uid_eq(uid, sock_i_uid(sk2)))) continue; - if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, - sctp_sk(sk2), sctp_sk(sk))) { + if (sctp_bind_addr_conflict(&ep2->base.bind_addr, + addr, sp2, sp)) { ret = (long)sk2; goto fail_unlock; } @@ -7767,19 +7866,32 @@ pp_not_found: pp->fastreuse = 1; else pp->fastreuse = 0; - } else if (pp->fastreuse && - (!reuse || sk->sk_state == SCTP_SS_LISTENING)) - pp->fastreuse = 0; + + if (sk->sk_reuseport) { + pp->fastreuseport = 1; + pp->fastuid = uid; + } else { + pp->fastreuseport = 0; + } + } else { + if (pp->fastreuse && + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) + pp->fastreuse = 0; + + if (pp->fastreuseport && + (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid))) + pp->fastreuseport = 0; + } /* We are set, so fill up all the data in the hash table * entry, tie the socket list information with the rest of the * sockets FIXME: Blurry, NPI (ipg). */ success: - if (!sctp_sk(sk)->bind_hash) { + if (!sp->bind_hash) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &pp->owner); - sctp_sk(sk)->bind_hash = pp; + sp->bind_hash = pp; } ret = 0; @@ -7852,8 +7964,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } sk->sk_max_ack_backlog = backlog; - sctp_hash_endpoint(ep); - return 0; + return sctp_hash_endpoint(ep); } /* diff --git a/net/sctp/stream.c b/net/sctp/stream.c index ffb940d3b57c..3892e7630f3a 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -535,7 +535,6 @@ int sctp_send_add_streams(struct sctp_association *asoc, goto out; } - stream->incnt = incnt; stream->outcnt = outcnt; asoc->strreset_outstanding = !!out + !!in; diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 0a78cdf86463..a6bf21579466 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -140,7 +140,7 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { @@ -166,23 +166,30 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && - MID_lt(event->mid, cevent->mid))) + MID_lt(event->mid, cevent->mid))) { + loc = pos; break; - + } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || - event->fsn < cevent->fsn)) + event->fsn < cevent->fsn)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( @@ -383,7 +390,7 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { @@ -403,18 
+410,25 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; - if (cevent->stream > event->stream) + if (cevent->stream > event->stream) { + loc = pos; break; - + } if (cevent->stream == event->stream && - MID_lt(event->mid, cevent->mid)) + MID_lt(event->mid, cevent->mid)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, @@ -489,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, sk_incoming_cpu_update(sk); } - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; if (skb_list) @@ -980,17 +994,19 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; - if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { + struct sctp_sock *sp = sctp_sk(sk); + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); - if (!sctp_sk(sk)->data_ready_signalled) { - sctp_sk(sk)->data_ready_signalled = 1; + if (!sp->data_ready_signalled) { + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 331cc734e3db..5dde92101743 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -219,7 +219,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. 
*/ - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until @@ -1129,16 +1129,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; - struct sock *sk; struct sctp_sock *sp; + struct sock *sk; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; sp = sctp_sk(sk); - if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, 0, 0, 0, gfp); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 80e2119f1c70..63f08b4e51d6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -127,6 +127,8 @@ static int smc_release(struct socket *sock) smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ + if (smc->connect_info && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); flush_work(&smc->connect_work); kfree(smc->connect_info); smc->connect_info = NULL; @@ -299,14 +301,17 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, optionally send confirm_rkey msg to register with peer */ +/* register a new rmb, send confirm_rkey msg to register with peer */ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, bool conf_rkey) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->wr_reg) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + rmb_desc->regerr = 1; + return -EFAULT; + } + rmb_desc->wr_reg = 1; } if (!conf_rkey) return 0; @@ -335,8 +340,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_rc) @@ -363,8 +368,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; } /* send add link reject message, only one link supported for now */ @@ -533,7 +538,8 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT, + CLC_WAIT_TIME); } /* setup for RDMA connection of client */ @@ -547,7 +553,8 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, &aclc->lcl, NULL, 0); + ibport, ntoh24(aclc->qpn), &aclc->lcl, + NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -580,8 +587,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, local_contact); } else { - if (!smc->conn.rmb_desc->reused && - smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, local_contact); } @@ -618,7 +624,7 @@ static int smc_connect_ism(struct smc_sock *smc, int rc = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, + local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, NULL, ismdev, aclc->gid); if (local_contact < 0) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); @@ -965,8 +971,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_resp_rc) @@ -986,8 +992,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; } smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); @@ -1083,7 +1089,7 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, + *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) @@ -1107,7 +1113,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_smcd *pclc_smcd; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, + *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, ismdev, pclc_smcd->gid); if (*local_contact < 0) { if (*local_contact == -ENOMEM) @@ -1142,10 +1148,8 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { - if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_ERR_REGRMB; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1181,7 +1185,6 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, return 0; decline: - mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } @@ -1222,7 +1225,7 @@ static void smc_listen_work(struct work_struct *work) */ pclc = (struct smc_clc_msg_proposal *)&buf; reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL); + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (reason_code) { smc_listen_decline(new_smc, reason_code, 0); return; @@ -1272,7 +1275,7 @@ static void smc_listen_work(struct work_struct *work) /* receive SMC Confirm CLC message */ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); + SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (reason_code) { mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); @@ -1281,8 +1284,10 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ism_supported) { - if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) { + mutex_unlock(&smc_create_lgr_pending); return; + } } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); @@ -1354,7 +1359,6 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); sock_hold(sk); /* sock_hold in tcp_listen_worker */ if (!schedule_work(&smc->tcp_listen_work)) sock_put(sk); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index ed5dcf03fe0b..db83332ac1c8 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -81,7 +81,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, + offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length 
on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, @@ -177,23 +177,24 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) int smcd_cdc_msg_send(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + union smc_host_cursor curs; struct smcd_cdc_msg cdc; int rc, diff; memset(&cdc, 0, sizeof(cdc)); cdc.common.type = SMC_CDC_MSG_TYPE; - cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap; - cdc.prod_count = conn->local_tx_ctrl.prod.count; - - cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap; - cdc.cons_count = conn->local_tx_ctrl.cons.count; - cdc.prod_flags = conn->local_tx_ctrl.prod_flags; - cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs); + cdc.prod.wrap = curs.wrap; + cdc.prod.count = curs.count; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs); + cdc.cons.wrap = curs.wrap; + cdc.cons.count = curs.count; + cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); if (rc) return rc; - smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons, - conn); + smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -331,13 +332,16 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) static void smcd_cdc_rx_tsklet(unsigned long data) { struct smc_connection *conn = (struct smc_connection *)data; + struct smcd_cdc_msg *data_cdc; struct smcd_cdc_msg cdc; struct smc_sock *smc; if (!conn) return; - memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); + data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; + smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn); + smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn); smc = container_of(conn, struct smc_sock, conn); smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 934df4473a7c..b5bfe38c7f9b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,21 +48,31 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __packed; /* format defined in RFC7609 */ +}; + +/* SMC-D cursor format */ +union smcd_cdc_cursor { + struct { + u16 wrap; + u32 count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + } __packed; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); /* CDC message for SMC-D */ struct smcd_cdc_msg { struct smc_wr_rx_hdr common; /* Type = 0xFE */ u8 res1[7]; - u16 prod_wrap; - u32 prod_count; - u8 res2[2]; - u16 cons_wrap; - u32 cons_count; - struct smc_cdc_producer_flags prod_flags; - struct smc_cdc_conn_state_flags conn_state_flags; + union smcd_cdc_cursor prod; + union smcd_cdc_cursor cons; u8 res3[8]; -} __packed; +} __aligned(8); static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { @@ -135,6 +145,21 @@ static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt, #endif } +static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt, + union smcd_cdc_cursor *src, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + 
unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + tgt->acurs = src->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); +#endif +} + /* calculate cursor difference between old and new, where old <= new */ static inline int smc_curs_diff(unsigned int size, union smc_host_cursor *old, @@ -222,12 +247,17 @@ static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, struct smcd_cdc_msg *peer) { - local->prod.wrap = peer->prod_wrap; - local->prod.count = peer->prod_count; - local->cons.wrap = peer->cons_wrap; - local->cons.count = peer->cons_count; - local->prod_flags = peer->prod_flags; - local->conn_state_flags = peer->conn_state_flags; + union smc_host_cursor temp; + + temp.wrap = peer->prod.wrap; + temp.count = peer->prod.count; + atomic64_set(&local->prod.acurs, atomic64_read(&temp.acurs)); + + temp.wrap = peer->cons.wrap; + temp.count = peer->cons.count; + atomic64_set(&local->cons.acurs, atomic64_read(&temp.acurs)); + local->prod_flags = peer->cons.prod_flags; + local->conn_state_flags = peer->cons.conn_state_flags; } static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 89c3a8c7859a..776e9dfc915d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -265,7 +265,7 @@ out: * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. */ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type) + u8 expected_type, unsigned long timeout) { long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; @@ -285,7 +285,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + clc_sk->sk_rcvtimeo = timeout; iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -297,7 +297,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clc_sk->sk_err) { reason_code = -clc_sk->sk_err; - smc->sk.sk_err = clc_sk->sk_err; + if (clc_sk->sk_err == EAGAIN && + expected_type == SMC_CLC_DECLINE) + clc_sk->sk_err = 0; /* reset for fallback usage */ + else + smc->sk.sk_err = clc_sk->sk_err; goto out; } if (!len) { /* peer has performed orderly shutdown */ @@ -306,7 +310,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } if (len < 0) { - smc->sk.sk_err = -len; + if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) + smc->sk.sk_err = -len; reason_code = len; goto out; } @@ -346,7 +351,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; + clc_sk->sk_rcvtimeo = rcvtimeo; return reason_code; } @@ -374,10 +379,8 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(struct smc_clc_msg_decline)); if (len < sizeof(struct smc_clc_msg_decline)) - smc->sk.sk_err = EPROTO; - if (len < 0) - smc->sk.sk_err = -len; - return sock_error(&smc->sk); + len = -EPROTO; + return len > 0 ? 
0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ @@ -536,7 +539,6 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) struct smc_link *link; struct msghdr msg; struct kvec vec; - int rc = 0; int len; memset(&aclc, 0, sizeof(aclc)); @@ -589,13 +591,8 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) vec.iov_len = ntohs(aclc.hdr.length); len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, ntohs(aclc.hdr.length)); - if (len < ntohs(aclc.hdr.length)) { - if (len >= 0) - new_smc->sk.sk_err = EPROTO; - else - new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; - rc = sock_error(&new_smc->sk); - } + if (len < ntohs(aclc.hdr.length)) + len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; - return rc; + return len > 0 ? 0 : len; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 18da89b681c2..24658e8c0de4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -27,6 +27,7 @@ #define SMC_TYPE_D 1 /* SMC-D only */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ +#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ #define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ @@ -182,7 +183,7 @@ struct smcd_dev; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type); + u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 18daebcef181..35c1cdc93e1c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -149,6 +149,8 @@ static int smc_link_send_delete(struct smc_link *lnk) return -ENOTCONN; } +static void smc_lgr_free(struct smc_link_group *lgr); + static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), @@ -171,8 +173,11 @@ free: spin_unlock_bh(&smc_lgr_list.lock); if (!lgr->is_smcd && !lgr->terminating) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* try to send del link msg, on error free lgr immediately */ - if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { + if (lnk->state == SMC_LNK_ACTIVE && + !smc_link_send_delete(lnk)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); return; @@ -184,6 +189,8 @@ free: if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) smc_llc_link_inactive(lnk); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); } } @@ -293,8 +300,13 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; + if (!lgr->is_smcd) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey( + &lgr->lnk[SMC_SINGLE_LINK], + conn->rmb_desc); + } } else { /* buf registration failed, reuse not possible */ write_lock_bh(&lgr->rmbs_lock); @@ -408,7 +420,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) } /* remove a link group */ -void smc_lgr_free(struct smc_link_group *lgr) +static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); 
if (lgr->is_smcd) @@ -485,7 +497,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) } /* Called when SMC-D device is terminated or peer is lost */ -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); @@ -495,7 +507,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (lgr->is_smcd && lgr->smcd == dev && (!peer_gid || lgr->peer_gid == peer_gid) && - !list_empty(&lgr->list)) { + (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { __smc_lgr_terminate(lgr); list_move(&lgr->list, &lgr_free_list); } @@ -506,6 +518,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); cancel_delayed_work_sync(&lgr->free_work); + if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); } } @@ -559,7 +573,7 @@ out: static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, - enum smc_lgr_role role) + enum smc_lgr_role role, u32 clcqpn) { return !memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) && @@ -567,7 +581,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, SMC_GID_SIZE) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, sizeof(lcl->mac)) && - lgr->role == role; + lgr->role == role && + (lgr->role == SMC_SERV || + lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -578,7 +594,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, + struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid) { @@ -603,7 +619,7 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role)) && + smcr_lgr_match(lgr, lcl, role, clcqpn)) && !lgr->sync_err && lgr->vlan_id == vlan_id && (role == SMC_CLNT || @@ -1024,6 +1040,8 @@ void smc_core_exit(void) smc_llc_link_inactive(lnk); } cancel_delayed_work_sync(&lgr->free_work); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); /* free link group */ } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c156674733c9..b00287989a3d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -109,6 +109,9 @@ struct smc_link { int llc_testlink_time; /* testlink interval */ struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ + struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ + int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol @@ -127,7 +130,7 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 reused : 1; /* new created / reused */ + u8 wr_reg : 1; /* mem region registered */ u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ @@ -243,11 +246,11 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, + unsigned short vlan); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, @@ -262,7 +265,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, + struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid); void smcd_conn_free(struct smc_connection *conn); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e36f21ce7252..2fff79db1a59 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -187,22 +187,28 @@ struct smc_ism_event_work { #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 +#define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 +union smcd_sw_event_info { + u64 info; + struct { + u8 uid[SMC_LGR_ID_SIZE]; + unsigned short vlan_id; + u16 code; + }; +}; + static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - union { - u64 info; - struct { - u32 uid; - unsigned short vlanid; - u16 code; - }; - } ev_info; + union smcd_sw_event_info ev_info; + ev_info.info = wrk->event.info; switch (wrk->event.code) { + case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); + break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - ev_info.info = wrk->event.info; if (ev_info.code == ISM_EVENT_REQUEST) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, @@ -215,6 +221,21 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) } } +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc; + union smcd_sw_event_info ev_info; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); + return rc; +} + /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -223,7 +244,7 @@ static void smc_ism_event_work(struct work_struct *work) switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok); + smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; @@ -289,7 +310,7 @@ void smcd_unregister_dev(struct smcd_dev *smcd) spin_unlock(&smcd_dev_list.lock); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0); + 
smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index aee45b860b79..4da946cbfa29 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -45,4 +45,5 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, void *data, size_t len); +int smc_ism_signal_shutdown(struct smc_link_group *lgr); #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9c916c709ca7..a6d3623d06f4 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -238,6 +238,29 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, return rc; } +/* send LLC delete rkey request */ +static int smc_llc_send_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_msg_delete_rkey *rkeyllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; + rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->num_rkeys = 1; + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + /* prepare an add link message */ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, struct smc_link *link, u8 mac[], u8 gid[], @@ -509,7 +532,9 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, int i, max; if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + link->llc_delete_rkey_rc = llc->hd.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey); } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { @@ -610,6 +635,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_add); init_completion(&link->llc_add_resp); init_completion(&link->llc_confirm_rkey); + init_completion(&link->llc_delete_rkey); + mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -650,8 +677,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, { int rc; + /* protected by mutex smc_create_lgr_pending */ reinit_completion(&link->llc_confirm_rkey); - smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(link, rmb_desc); + if (rc) + return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, SMC_LLC_WAIT_TIME); @@ -660,6 +690,29 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, return 0; } +/* unregister an rtoken at the remote peer */ +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + int rc; + + mutex_lock(&link->llc_delete_rkey_mutex); + reinit_completion(&link->llc_delete_rkey); + rc = smc_llc_send_delete_rkey(link, rmb_desc); + if (rc) + goto out; + /* receive DELETE RKEY response from server over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, + SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_delete_rkey_rc) + rc = -EFAULT; + else + rc = 0; +out: + mutex_unlock(&link->llc_delete_rkey_mutex); + return rc; +} + 
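Editor's note: the smc_llc_do_delete_rkey() helper added above pairs a per-link mutex (llc_delete_rkey_mutex) with a completion: re-arm the completion, send the DELETE_RKEY LLC message, then sleep with a timeout until the LLC receive handler stores the peer's result flag and completes the waiter; a timeout or a set negative flag both become -EFAULT, much as smc_llc_do_confirm_rkey() handles rkey registration. The sketch below is a plain userspace analogue of that wait-for-reply pattern using POSIX threads; it is only an illustration, and names such as struct link_state, do_exchange() and EXCHANGE_TIMEOUT_SEC are invented for it, not part of the SMC code.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Roughly mirrors llc_delete_rkey_mutex, the llc_delete_rkey completion
 * and llc_delete_rkey_rc in struct smc_link (userspace stand-in). */
struct link_state {
	pthread_mutex_t lock;      /* one exchange in flight at a time */
	pthread_mutex_t wait_lock; /* protects done and rc             */
	pthread_cond_t done_cond;  /* signalled by the "RX handler"    */
	int done;                  /* completion flag                  */
	int rc;                    /* result code filled in by RX side */
};

#define EXCHANGE_TIMEOUT_SEC 2     /* stands in for SMC_LLC_WAIT_TIME  */

/* What the RX handler does: record the answer, complete the waiter. */
static void rx_handler(struct link_state *l, int peer_rc)
{
	pthread_mutex_lock(&l->wait_lock);
	l->rc = peer_rc;
	l->done = 1;
	pthread_cond_signal(&l->done_cond);
	pthread_mutex_unlock(&l->wait_lock);
}

/* Analogue of smc_llc_do_delete_rkey(): send, then wait with a timeout. */
static int do_exchange(struct link_state *l)
{
	struct timespec deadline;
	int err = 0;

	pthread_mutex_lock(&l->lock);

	pthread_mutex_lock(&l->wait_lock);
	l->done = 0;                       /* reinit_completion()       */
	l->rc = 0;
	pthread_mutex_unlock(&l->wait_lock);

	/* A real implementation would send the request here and return
	 * early on a send failure, as smc_llc_send_delete_rkey() does. */

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += EXCHANGE_TIMEOUT_SEC;

	pthread_mutex_lock(&l->wait_lock);
	while (!l->done && !err)
		err = pthread_cond_timedwait(&l->done_cond, &l->wait_lock,
					     &deadline);
	/* Timeout and a negative result code collapse into one error. */
	err = (!l->done || l->rc) ? -EFAULT : 0;
	pthread_mutex_unlock(&l->wait_lock);

	pthread_mutex_unlock(&l->lock);
	return err;
}

static void *responder(void *arg)
{
	rx_handler(arg, 0);                /* peer accepted the request */
	return NULL;
}

int main(void)
{
	struct link_state l = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait_lock = PTHREAD_MUTEX_INITIALIZER,
		.done_cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, responder, &l);
	printf("exchange result: %d\n", do_exchange(&l));
	pthread_join(tid, NULL);
	return 0;
}

Build with cc -pthread; the responder thread plays the role of the LLC receive handler. In the kernel, the confirm-rkey path additionally relies on the smc_create_lgr_pending mutex, as the comment added in smc_llc_do_confirm_rkey() above notes.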
/***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9e2ff088e301..461c0c3ef76e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -49,6 +49,8 @@ void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 3c458d279855..c2694750a6a8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -215,12 +215,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); if (pend->idx < link->wr_tx_cnt) { + u32 idx = pend->idx; + /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pend->idx], 0, sizeof(link->wr_tx_pends[pend->idx])); memset(&link->wr_tx_bufs[pend->idx], 0, sizeof(link->wr_tx_bufs[pend->idx])); - test_and_clear_bit(pend->idx, link->wr_tx_mask); + test_and_clear_bit(idx, link->wr_tx_mask); return 1; } diff --git a/net/socket.c b/net/socket.c index 593826e11a53..334fcc617ef2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -853,7 +853,7 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct socket *sock = file->private_data; if (unlikely(!sock->ops->splice_read)) - return -EINVAL; + return generic_file_splice_read(file, ppos, pipe, len, flags); return sock->ops->splice_read(sock, ppos, pipe, len, flags); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index d8831b988b1e..ab4a3be1542a 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -281,13 +281,7 @@ static bool generic_key_to_expire(struct rpc_cred *cred) { struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - bool ret; - - get_rpccred(cred); - ret = test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); - put_rpccred(cred); - - return ret; + return test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); } static const struct rpc_credops generic_credops = { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 30f970cdc7f6..ba765473d1f0 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1239,36 +1239,59 @@ gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) return &gss_auth->rpc_auth; } +static struct gss_cred * +gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) +{ + struct gss_cred *new; + + /* Make a copy of the cred so that we can reference count it */ + new = kzalloc(sizeof(*gss_cred), GFP_NOIO); + if (new) { + struct auth_cred acred = { + .uid = gss_cred->gc_base.cr_uid, + }; + struct gss_cl_ctx *ctx = + rcu_dereference_protected(gss_cred->gc_ctx, 1); + + rpcauth_init_cred(&new->gc_base, &acred, + &gss_auth->rpc_auth, + &gss_nullops); + new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + new->gc_service = gss_cred->gc_service; + new->gc_principal = gss_cred->gc_principal; + kref_get(&gss_auth->kref); + rcu_assign_pointer(new->gc_ctx, ctx); + gss_get_ctx(ctx); + } + return new; +} + /* - * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call + * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field 
set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. */ -static int -gss_destroying_context(struct rpc_cred *cred) +static void +gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); + struct gss_cred *new; struct rpc_task *task; - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) - return 0; + new = gss_dup_cred(gss_auth, gss_cred); + if (new) { + ctx->gc_proc = RPC_GSS_PROC_DESTROY; - ctx->gc_proc = RPC_GSS_PROC_DESTROY; - cred->cr_ops = &gss_nullops; + task = rpc_call_null(gss_auth->client, &new->gc_base, + RPC_TASK_ASYNC|RPC_TASK_SOFT); + if (!IS_ERR(task)) + rpc_put_task(task); - /* Take a reference to ensure the cred will be destroyed either - * by the RPC call or by the put_rpccred() below */ - get_rpccred(cred); - - task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT); - if (!IS_ERR(task)) - rpc_put_task(task); - - put_rpccred(cred); - return 1; + put_rpccred(&new->gc_base); + } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure @@ -1330,8 +1353,8 @@ static void gss_destroy_cred(struct rpc_cred *cred) { - if (gss_destroying_context(cred)) - return; + if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) + gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } @@ -1768,6 +1791,7 @@ priv_release_snd_buf(struct rpc_rqst *rqstp) for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); + rqstp->rq_release_snd_buf = NULL; } static int @@ -1776,6 +1800,9 @@ alloc_enc_pages(struct rpc_rqst *rqstp) struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; + if (rqstp->rq_release_snd_buf) + rqstp->rq_release_snd_buf(rqstp); + if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ae3b8145da35..c6782aa47525 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1915,6 +1915,13 @@ call_connect_status(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; + /* Check if the task was already transmitted */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + xprt_end_transmit(task); + task->tk_action = call_transmit_status; + return; + } + dprint_status(task); trace_rpc_connect_status(task); @@ -2302,6 +2309,7 @@ out_retry: task->tk_status = 0; /* Note: rpc_verify_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { + xdr_free_bvec(&req->rq_rcv_buf); req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 9062967575c4..7e55cfc69697 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -175,7 +175,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2bbb8d38d2bf..f302c6eb8779 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -546,7 +546,7 @@ 
EXPORT_SYMBOL_GPL(xdr_commit_encode); static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) { - static __be32 *p; + __be32 *p; int space_left; int frag1bytes, frag2bytes; @@ -673,11 +673,10 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) WARN_ON_ONCE(xdr->iov); return; } - if (fraglen) { + if (fraglen) xdr->end = head->iov_base + head->iov_len; - xdr->page_ptr--; - } /* (otherwise assume xdr->end is already set) */ + xdr->page_ptr--; head->iov_len = len; buf->len = len; xdr->p = head->iov_base + head->iov_len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 86bea4520c4d..ce927002862a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -826,8 +826,15 @@ void xprt_connect(struct rpc_task *task) return; if (xprt_test_and_set_connecting(xprt)) return; - xprt->stat.connect_start = jiffies; - xprt->ops->connect(xprt, task); + /* Race breaker */ + if (!xprt_connected(xprt)) { + xprt->stat.connect_start = jiffies; + xprt->ops->connect(xprt, task); + } else { + xprt_clear_connecting(xprt); + task->tk_status = 0; + rpc_wake_up_queued_task(&xprt->pending, task); + } } xprt_release_write(xprt, task); } @@ -1623,6 +1630,8 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.buflen = 0; req->rq_rcv_buf.len = 0; req->rq_rcv_buf.buflen = 0; + req->rq_snd_buf.bvec = NULL; + req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ae77c71c1f64..8a5e823e0b33 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -330,18 +330,16 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) { size_t i,n; - if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES)) return want; - if (want > buf->page_len) - want = buf->page_len; n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < n; i++) { if (buf->pages[i]) continue; buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); if (!buf->pages[i]) { - buf->page_len = (i * PAGE_SIZE) - buf->page_base; - return buf->page_len; + i *= PAGE_SIZE; + return i > buf->page_base ? 
i - buf->page_base : 0; } } return want; @@ -378,8 +376,8 @@ static ssize_t xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { - struct kvec kvec = { 0 }; - return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); + iov_iter_discard(&msg->msg_iter, READ, count); + return sock_recvmsg(sock, msg, flags); } static ssize_t @@ -398,16 +396,17 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; seek = 0; } else { seek -= buf->head[0].iov_len; offset += buf->head[0].iov_len; } - if (seek < buf->page_len) { - want = xs_alloc_sparse_pages(buf, - min_t(size_t, count - offset, buf->page_len), - GFP_NOWAIT); + + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + if (seek < want) { ret = xs_read_bvec(sock, msg, flags, buf->bvec, xdr_buf_pagecount(buf), want + buf->page_base, @@ -418,12 +417,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; seek = 0; } else { - seek -= buf->page_len; - offset += buf->page_len; + seek -= want; + offset += want; } + if (seek < buf->tail[0].iov_len) { want = min_t(size_t, count - offset, buf->tail[0].iov_len); ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); @@ -433,17 +433,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; } else offset += buf->tail[0].iov_len; ret = -EMSGSIZE; - msg->msg_flags |= MSG_TRUNC; out: *read = offset - seek_init; return ret; -eagain: - ret = -EAGAIN; - goto out; sock_err: offset += seek; goto out; @@ -486,19 +482,20 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, if (transport->recv.offset == transport->recv.len) { if (xs_read_stream_request_done(transport)) msg->msg_flags |= MSG_EOR; - return transport->recv.copied; + return read; } switch (ret) { + default: + break; + case -EFAULT: case -EMSGSIZE: - return transport->recv.copied; + msg->msg_flags |= MSG_TRUNC; + return read; case 0: return -ESHUTDOWN; - default: - if (ret < 0) - return ret; } - return -EAGAIN; + return ret < 0 ? 
ret : read; } static size_t @@ -537,7 +534,7 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) - xprt_complete_bc_request(req, ret); + xprt_complete_bc_request(req, transport->recv.copied); return ret; } @@ -570,7 +567,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) - xprt_complete_rqst(req->rq_task, ret); + xprt_complete_rqst(req->rq_task, transport->recv.copied); xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); @@ -591,10 +588,8 @@ xs_read_stream(struct sock_xprt *transport, int flags) if (ret <= 0) goto out_err; transport->recv.offset = ret; - if (ret != want) { - ret = -EAGAIN; - goto out_err; - } + if (transport->recv.offset != want) + return transport->recv.offset; transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & RPC_FRAGMENT_SIZE_MASK; transport->recv.offset -= sizeof(transport->recv.fraghdr); @@ -602,6 +597,9 @@ xs_read_stream(struct sock_xprt *transport, int flags) } switch (be32_to_cpu(transport->recv.calldir)) { + default: + msg.msg_flags |= MSG_TRUNC; + break; case RPC_CALL: ret = xs_read_stream_call(transport, &msg, flags); break; @@ -616,6 +614,9 @@ xs_read_stream(struct sock_xprt *transport, int flags) goto out_err; read += ret; if (transport->recv.offset < transport->recv.len) { + if (!(msg.msg_flags & MSG_TRUNC)) + return read; + msg.msg_flags = 0; ret = xs_read_discard(transport->sock, &msg, flags, transport->recv.len - transport->recv.offset); if (ret <= 0) @@ -623,7 +624,7 @@ xs_read_stream(struct sock_xprt *transport, int flags) transport->recv.offset += ret; read += ret; if (transport->recv.offset != transport->recv.len) - return -EAGAIN; + return read; } if (xs_read_stream_request_done(transport)) { trace_xs_stream_read_request(transport); @@ -633,13 +634,7 @@ xs_read_stream(struct sock_xprt *transport, int flags) transport->recv.len = 0; return read; out_err: - switch (ret) { - case 0: - case -ESHUTDOWN: - xprt_force_disconnect(&transport->xprt); - return -ESHUTDOWN; - } - return ret; + return ret != 0 ? 
ret : -ESHUTDOWN; } static void xs_stream_data_receive(struct sock_xprt *transport) @@ -648,12 +643,12 @@ static void xs_stream_data_receive(struct sock_xprt *transport) ssize_t ret = 0; mutex_lock(&transport->recv_mutex); + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); if (transport->sock == NULL) goto out; - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { ret = xs_read_stream(transport, MSG_DONTWAIT); - if (ret <= 0) + if (ret < 0) break; read += ret; cond_resched(); @@ -1345,10 +1340,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport) int err; mutex_lock(&transport->recv_mutex); + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); sk = transport->inet; if (sk == NULL) goto out; - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); if (skb == NULL) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 74b9d916a58b..5df9d1138ac9 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -353,34 +353,35 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return 0; } -static int __switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) +static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, + struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_add) - return ops->switchdev_port_obj_add(dev, obj, trans); + int rc; + int err; - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to add object on - * each port. - */ + struct switchdev_notifier_port_obj_info obj_info = { + .obj = obj, + .trans = trans, + .handled = false, + }; - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = __switchdev_port_obj_add(lower_dev, obj, trans); - if (err) - break; + rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack); + err = notifier_to_errno(rc); + if (err) { + WARN_ON(!obj_info.handled); + return err; } - - return err; + if (!obj_info.handled) + return -EOPNOTSUPP; + return 0; } static int switchdev_port_obj_add_now(struct net_device *dev, - const struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack) { struct switchdev_trans trans; int err; @@ -397,7 +398,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = true; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans, extack); if (err) { /* Prepare phase failed: abort the transaction. 
Any * resources reserved in the prepare phase are @@ -416,7 +418,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = false; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans, extack); WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); switchdev_trans_items_warn_destroy(dev, &trans); @@ -429,7 +432,7 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, const struct switchdev_obj *obj = data; int err; - err = switchdev_port_obj_add_now(dev, obj); + err = switchdev_port_obj_add_now(dev, obj, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); @@ -459,38 +462,21 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); - return switchdev_port_obj_add_now(dev, obj); + return switchdev_port_obj_add_now(dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_del) - return ops->switchdev_port_obj_del(dev, obj); - - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to delete object on - * each port. - */ - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_del_now(lower_dev, obj); - if (err) - break; - } - - return err; + return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, + dev, obj, NULL, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, @@ -535,6 +521,7 @@ int switchdev_port_obj_del(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_obj_del); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); +static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier @@ -572,10 +559,38 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info) { info->dev = dev; + info->extack = NULL; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +int register_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_register(chain, nb); +} +EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); + +int unregister_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_unregister(chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); + +int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack) +{ + info->dev = dev; + info->extack = extack; + return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, + val, info); +} +EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); + bool 
switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { @@ -595,3 +610,109 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); + +static int __switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack)) +{ + struct netlink_ext_ack *extack; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + extack = switchdev_notifier_info_to_extack(&port_obj_info->info); + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return add_cb(dev, port_obj_info->obj, port_obj_info->trans, + extack); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. + */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, + check_cb, add_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack)) +{ + int err; + + err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, + add_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); + +static int __switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return del_cb(dev, port_obj_info->obj); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. 
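
The blocking notifier chain and the switchdev_handle_port_obj_add() dispatch helper introduced above are consumed from driver code roughly as sketched below. This is a hedged illustration rather than any in-tree driver: the foo_* names and the empty foo_netdev_ops are hypothetical, while the switchdev_*, notifier_* and NL_SET_ERR_MSG() calls are existing kernel interfaces (the switchdev ones being those added by this patch).

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/notifier.h>
#include <net/switchdev.h>

/* Hypothetical: drivers typically recognise their own ports by netdev_ops. */
static const struct net_device_ops foo_netdev_ops;

static bool foo_port_dev_check(const struct net_device *dev)
{
	return dev->netdev_ops == &foo_netdev_ops;
}

static int foo_port_obj_add(struct net_device *dev,
			    const struct switchdev_obj *obj,
			    struct switchdev_trans *trans,
			    struct netlink_ext_ack *extack)
{
	switch (obj->id) {
	case SWITCHDEV_OBJ_ID_PORT_VLAN:
		/* A real driver would program the VLAN here; the extack lets
		 * it report a human-readable reason on failure.
		 */
		NL_SET_ERR_MSG(extack, "VLAN offload not implemented in this sketch");
		return -EOPNOTSUPP;
	default:
		return -EOPNOTSUPP;
	}
}

static int foo_switchdev_blocking_event(struct notifier_block *nb,
					unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
	int err;

	switch (event) {
	case SWITCHDEV_PORT_OBJ_ADD:
		err = switchdev_handle_port_obj_add(dev, ptr,
						    foo_port_dev_check,
						    foo_port_obj_add);
		return notifier_from_errno(err);
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block foo_switchdev_blocking_nb = {
	.notifier_call = foo_switchdev_blocking_event,
};

static int __init foo_switchdev_init(void)
{
	/* The chain is blocking, so registration and the callbacks run in
	 * process context and may sleep.
	 */
	return register_switchdev_blocking_notifier(&foo_switchdev_blocking_nb);
}

Dispatching through switchdev_handle_port_obj_add() lets the helper walk lower devices (e.g. a LAG stacked above the ports), so the driver only has to recognise its own netdevs in foo_port_dev_check().
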
+ */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, + check_cb, del_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + int err; + + err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, + del_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 2830709957bd..c138d68e8a69 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -166,7 +166,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, /* Apply trial address if we just left trial period */ if (!trial && !self) { - tipc_net_finalize(net, tn->trial_addr); + tipc_sched_net_finalize(net, tn->trial_addr); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } @@ -300,14 +301,12 @@ static void tipc_disc_timeout(struct timer_list *t) goto exit; } - /* Trial period over ? */ - if (!time_before(jiffies, tn->addr_trial_end)) { - /* Did we just leave it ? */ - if (!tipc_own_addr(net)) - tipc_net_finalize(net, tn->trial_addr); - - msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); - msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net)); + /* Did we just leave trial period ? */ + if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { + mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); + spin_unlock_bh(&d->lock); + tipc_sched_net_finalize(net, tn->trial_addr); + return; } /* Adjust timeout interval according to discovery phase */ @@ -319,6 +318,8 @@ static void tipc_disc_timeout(struct timer_list *t) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); diff --git a/net/tipc/link.c b/net/tipc/link.c index 201c3b5bc96b..9e265eb89726 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -105,7 +105,7 @@ struct tipc_stats { * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages - * @last_retransmitted: sequence number of most recently retransmitted message + * @prev_from: sequence number of most previous retransmission request * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released @@ -163,7 +163,7 @@ struct tipc_link { u16 limit; } backlog[5]; u16 snd_nxt; - u16 last_retransm; + u16 prev_from; u16 window; u16 stale_cnt; unsigned long stale_limit; @@ -186,9 +186,6 @@ struct tipc_link { u16 acked; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; - unsigned long prev_retr; - u16 prev_from; - u16 prev_to; u8 nack_state; bool bc_peer_is_up; @@ -210,7 +207,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIMIT 10 /* [ms] */ +#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ /* * Interval between NACKs when packets arrive out of order @@ -1036,10 +1033,12 @@ static int tipc_link_retrans(struct tipc_link *l, 
struct tipc_link *r, if (!skb) return 0; + if (less(to, from)) + return 0; /* Detect repeated retransmit failures on same packet */ - if (r->last_retransm != buf_seqno(skb)) { - r->last_retransm = buf_seqno(skb); + if (r->prev_from != from) { + r->prev_from = from; r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { @@ -1055,6 +1054,11 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; + if (link_is_bc_sndlink(l)) { + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) + continue; + TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + } _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1594,14 +1598,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; - /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ - if (msg_peer_stopping(hdr)) + /* If peer is going down we want full re-establish cycle */ + if (msg_peer_stopping(hdr)) { rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - else if ((mtyp == RESET_MSG) || !link_is_up(l)) + break; + } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ + if (mtyp == RESET_MSG || !link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); /* ACTIVATE_MSG takes up link if it was already locally reset */ - if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING)) + if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); @@ -1734,42 +1741,6 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) l->rcv_nxt = peers_snd_nxt; } -/* link_bc_retr eval()- check if the indicated range can be retransmitted now - * - Adjust permitted range if there is overlap with previous retransmission - */ -static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to) -{ - unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr); - - if (less(*to, *from)) - return false; - - /* New retransmission request */ - if ((elapsed > TIPC_BC_RETR_LIMIT) || - less(*to, l->prev_from) || more(*from, l->prev_to)) { - l->prev_from = *from; - l->prev_to = *to; - l->prev_retr = jiffies; - return true; - } - - /* Inside range of previous retransmit */ - if (!less(*from, l->prev_from) && !more(*to, l->prev_to)) - return false; - - /* Fully or partially outside previous range => exclude overlap */ - if (less(*from, l->prev_from)) { - *to = l->prev_from - 1; - l->prev_from = *from; - } - if (more(*to, l->prev_to)) { - *from = l->prev_to + 1; - l->prev_to = *to; - } - l->prev_retr = jiffies; - return true; -} - /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, @@ -1800,8 +1771,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - if (link_bc_retr_eval(snd_l, &from, &to)) - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a2879e6ec5b6..a0924956bb61 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -105,6 +105,7 @@ struct tipc_skb_cb { u32 bytes_read; u32 orig_member; struct sk_buff *tail; + unsigned long nxt_retr; bool validated; u16 
chain_imp; u16 ackers; diff --git a/net/tipc/net.c b/net/tipc/net.c index 62199cf5a56c..f076edb74338 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,6 +104,14 @@ * - A local spin_lock protecting the queue of subscriber events. */ +struct tipc_net_work { + struct work_struct work; + struct net *net; + u32 addr; +}; + +static void tipc_net_finalize(struct net *net, u32 addr); + int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { @@ -119,17 +127,38 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) return 0; } -void tipc_net_finalize(struct net *net, u32 addr) +static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); - if (!cmpxchg(&tn->node_addr, 0, addr)) { - tipc_set_node_addr(net, addr); - tipc_named_reinit(net); - tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); - } + if (cmpxchg(&tn->node_addr, 0, addr)) + return; + tipc_set_node_addr(net, addr); + tipc_named_reinit(net); + tipc_sk_reinit(net); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); +} + +static void tipc_net_finalize_work(struct work_struct *work) +{ + struct tipc_net_work *fwork; + + fwork = container_of(work, struct tipc_net_work, work); + tipc_net_finalize(fwork->net, fwork->addr); + kfree(fwork); +} + +void tipc_sched_net_finalize(struct net *net, u32 addr) +{ + struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC); + + if (!fwork) + return; + INIT_WORK(&fwork->work, tipc_net_finalize_work); + fwork->net = net; + fwork->addr = addr; + schedule_work(&fwork->work); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/net.h b/net/tipc/net.h index 09ad02b50bb1..b7f2e364eb99 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -42,7 +42,7 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); -void tipc_net_finalize(struct net *net, u32 addr); +void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/node.c b/net/tipc/node.c index 2afc4f8c37a7..32556f480a60 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -584,12 +584,15 @@ static void tipc_node_clear_links(struct tipc_node *node) /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ -static int tipc_node_cleanup(struct tipc_node *peer) +static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; - spin_lock_bh(&tn->node_list_lock); + /* If lock held by tipc_node_stop() the node will be deleted anyway */ + if (!spin_trylock_bh(&tn->node_list_lock)) + return false; + tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { @@ -621,6 +624,12 @@ static void tipc_node_timeout(struct timer_list *t) __skb_queue_head_init(&xmitq); + /* Initial node interval to value larger (10 seconds), then it will be + * recalculated with link lowest tolerance + */ + tipc_node_read_lock(n); + n->keepalive_intv = 10000; + tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 636e6131769d..b57b1be7252b 100644 --- a/net/tipc/socket.c +++ 
b/net/tipc/socket.c @@ -1555,16 +1555,17 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) /** * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info - * @msg: received message header + * @skb: received message buffer * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ -static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, +static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { + struct tipc_msg *msg; u32 anc_data[3]; u32 err; u32 dest_type; @@ -1573,6 +1574,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (likely(m->msg_controllen == 0)) return 0; + msg = buf_msg(skb); /* Optionally capture errored message object(s) */ err = msg ? msg_errcode(msg) : 0; @@ -1583,6 +1585,9 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (res) return res; if (anc_data[1]) { + if (skb_linearize(skb)) + return -ENOMEM; + msg = buf_msg(skb); res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], msg_data(msg)); if (res) @@ -1744,9 +1749,10 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Collect msg meta data, including error code and rejected data */ tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (unlikely(rc)) goto exit; + hdr = buf_msg(skb); /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { @@ -1856,9 +1862,10 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (rc) break; + hdr = buf_msg(skb); } /* Copy data if msg ok, otherwise return error/partial data */ diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7b1af8b59cd2..d4ecc66464e6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -687,6 +687,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sock *sk_redir; struct tls_rec *rec; int err = 0, send; + u32 delta = 0; bool enospc; psock = sk_psock_get(sk); @@ -694,8 +695,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (delta < msg->sg.size) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; @@ -743,7 +750,7 @@ more_data: msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); - *copied -= send; + *copied -= (send + delta); err = -EACCES; } diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 12b3edf70a7b..1615e503f8e3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -272,11 +272,11 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); - for (i = 0; i<sizeof(*ht_capa); i++) + for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } -/* Do a logical ht_capa &= ht_capa_mask. */ +/* Do a logical vht_capa &= vht_capa_mask. 
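
The tipc_sk_anc_data_recv() change above (and the hdr = buf_msg(skb) re-fetch in its callers) follows a general rule: skb_linearize() may reallocate the skb head, so the payload has to be made contiguous before a flat copy and any pointer previously derived from skb->data must be recomputed afterwards. A minimal, hypothetical sketch of that pattern (example_copy_flat() is not kernel code):

#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static int example_copy_flat(struct sk_buff *skb, void *dst, unsigned int len)
{
	/* Make the payload contiguous before copying it with memcpy(). */
	if (skb_linearize(skb))
		return -ENOMEM;

	/* skb_linearize() may have reallocated the head, so recompute any
	 * pointer that was previously derived from skb->data (in the TIPC
	 * case, buf_msg(skb)).
	 */
	if (len > skb->len)
		return -EMSGSIZE;
	memcpy(dst, skb->data, len);
	return 0;
}
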
*/ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 744b5851bbf9..8d763725498c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7870,6 +7870,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) } memset(¶ms, 0, sizeof(params)); + params.beacon_csa.ftm_responder = -1; if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d536b07582f8..f741d8376a46 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -642,11 +642,15 @@ static bool cfg80211_is_all_idle(void) * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. + * Also if there is any other active beaconing interface we + * need not issue a disconnect hint and reset any info such + * as chan dfs state, etc. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss) + if (wdev->conn || wdev->current_bss || + cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); } @@ -1171,6 +1175,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); + cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, + rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; diff --git a/net/wireless/util.c b/net/wireless/util.c index ef14d80ca03e..d473bd135da8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1421,6 +1421,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); + else + break; } } else { pos = skip_ie(ies, ielen, pos); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..5121729b8b63 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb, } len = *skb->data; - needed = 1 + (len >> 4) + (len & 0x0f); + needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims @@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr, sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, + !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* @@ -688,11 +688,15 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - len = strlen(addr->sx25_addr.x25_addr); - for (i = 0; i < len; i++) { - if (!isdigit(addr->sx25_addr.x25_addr[i])) { - rc = -EINVAL; - goto out; + /* check for the null_x25_address */ + if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } } } diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 3c12cae32001..afb26221d8a8 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -142,6 +142,15 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->sk_state_change(sk); break; } + case X25_CALL_REQUEST: + /* call collision */ + 
x25->causediag.cause = 0x01; + x25->causediag.diagnostic = 0x48; + + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, EISCONN, 0x01, 0x48); + break; + case X25_CLEAR_REQUEST: if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) goto out_clear;
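
The x25_parse_address_block() fix above reflects how X.121 addresses are carried: the first octet holds the called and calling digit counts in its high and low nibbles, and the digits themselves are BCD-packed two per octet, so the block occupies 1 + ceil((called + calling) / 2) bytes. A small, hedged userspace-style illustration of the arithmetic (the helper name is hypothetical; only the formula mirrors the patched code):

#include <stdio.h>

/* Bytes needed for an X.25 address block: the length octet itself plus the
 * BCD-packed digits, two digits per octet, rounded up.
 */
static unsigned int x25_addr_block_len(unsigned char len_octet)
{
	unsigned int called  = len_octet >> 4;		/* called address digits */
	unsigned int calling = len_octet & 0x0f;	/* calling address digits */

	return 1 + (called + calling + 1) / 2;
}

int main(void)
{
	/* 0xA5 -> 10 + 5 = 15 digits -> 8 packed octets -> 9 bytes in total. */
	printf("0xA5 needs %u bytes\n", x25_addr_block_len(0xA5));
	return 0;
}

The pre-fix formula counted one octet per digit (1 + called + calling, i.e. 16 bytes for this example), so pskb_may_pull() could demand more data than the address block actually occupies and fail on otherwise valid packets.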