diff options
Diffstat (limited to 'net')
387 files changed, 10024 insertions, 6165 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 09f1ec589b80..785a7bb6a539 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -412,8 +412,9 @@ static void p9_tag_cleanup(struct p9_client *c) /** * p9_client_cb - call back from transport to client - * c: client state - * req: request received + * @c: client state + * @req: request received + * @status: request status, one of REQ_STATUS_* * */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) @@ -555,6 +556,7 @@ out_err: * p9_check_zc_errors - check 9p packet for error return and process it * @c: current client instance * @req: request to parse and check for error conditions + * @uidata: external buffer containing error * @in_hdrlen: Size of response protocol buffer. * * returns error code if one is discovered, otherwise returns 0 diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index 3dff68f05fb9..6ea5ea548cd4 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -17,7 +17,9 @@ #include "trans_common.h" /** - * p9_release_pages - Release pages after the transaction. + * p9_release_pages - Release pages after the transaction. + * @pages: array of pages to be put + * @nr_pages: size of array */ void p9_release_pages(struct page **pages, int nr_pages) { diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 8f528e783a6c..fa158397bb63 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -45,7 +45,7 @@ static struct p9_trans_module p9_fd_trans; * @rfd: file descriptor for reading (trans=fd) * @wfd: file descriptor for writing (trans=fd) * @port: port to connect to (trans=tcp) - * + * @privport: port is privileged */ struct p9_fd_opts { @@ -95,6 +95,8 @@ struct p9_poll_wait { * @err: error state * @req_list: accounting for requests which have been sent * @unsent_req_list: accounting for requests that haven't been sent + * @rreq: read request + * @wreq: write request * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header * @rc: temporary fcall for reading current frame diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 2885ff9c76f0..af0a8a6cd3fd 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -99,6 +99,7 @@ struct p9_rdma_req; /** * struct p9_rdma_context - Keeps track of in-process WR * + * @cqe: completion queue entry * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) @@ -115,6 +116,7 @@ struct p9_rdma_context { /** * struct p9_rdma_opts - Collection of mount options * @port: port of connection + * @privport: Whether a privileged port may be used * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index a3cd90a74012..93f2f8654882 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -50,7 +50,11 @@ static atomic_t vp_pinned = ATOMIC_INIT(0); * @client: client instance * @vdev: virtio dev associated with this channel * @vq: virtio queue associated with this channel + * @ring_bufs_avail: flag to indicate there is some available in the ring buf + * @vc_wq: wait queue for waiting for thing to be added to ring buf + * @p9_max_pages: maximum number of pinned pages * @sg: scatter gather list which is used to pack a request (protected?) + * @chan_list: linked list of channels * * We keep all per-channel information in a structure. * This structure is allocated within the devices dev->mem space. @@ -74,8 +78,8 @@ struct virtio_chan { unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; - /* - * tag name to identify a mount null terminated + /** + * @tag: name to identify a mount null terminated */ char *tag; @@ -204,6 +208,7 @@ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at + * @limit: maximum number of pages in sg list. * @pdata: a list of pages to add into sg. * @nr_pages: number of pages to pack into the scatter/gather list * @offs: amount of data in the beginning of first page _not_ to pack diff --git a/net/Kconfig b/net/Kconfig index d6567162c1cf..f4c32d982af6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -386,8 +386,6 @@ source "net/mac80211/Kconfig" endif # WIRELESS -source "net/wimax/Kconfig" - source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" diff --git a/net/Makefile b/net/Makefile index 5744bf1997fd..d96b0aa8f39f 100644 --- a/net/Makefile +++ b/net/Makefile @@ -66,7 +66,6 @@ obj-$(CONFIG_MAC802154) += mac802154/ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif -obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 45f584171de7..be18af481d7d 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -44,15 +44,15 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; /* Lists of aarp entries */ /** * struct aarp_entry - AARP entry - * @last_sent - Last time we xmitted the aarp request - * @packet_queue - Queue of frames wait for resolution - * @status - Used for proxy AARP - * expires_at - Entry expiry time - * target_addr - DDP Address - * dev - Device to use - * hwaddr - Physical i/f address of target/router - * xmit_count - When this hits 10 we give up - * next - Next entry in chain + * @last_sent: Last time we xmitted the aarp request + * @packet_queue: Queue of frames wait for resolution + * @status: Used for proxy AARP + * @expires_at: Entry expiry time + * @target_addr: DDP Address + * @dev: Device to use + * @hwaddr: Physical i/f address of target/router + * @xmit_count: When this hits 10 we give up + * @next: Next entry in chain */ struct aarp_entry { /* These first two are only used for unresolved entries */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 1d48708c5a2e..ca1a0d07a087 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1407,9 +1407,10 @@ drop: /** * atalk_rcv - Receive a packet (in skb) from device dev - * @skb - packet received - * @dev - network device where the packet comes from - * @pt - packet type + * @skb: packet received + * @dev: network device where the packet comes from + * @pt: packet type + * @orig_dev: the original receive net device * * Receive a packet (in skb) from device dev. This has come from the SNAP * decoder, and on entry skb->transport_header is the DDP header, skb->len diff --git a/net/atm/lec.c b/net/atm/lec.c index dbabb65d8b67..7226c784dbe0 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -954,9 +954,8 @@ static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct lec_state *state = seq->private; - v = lec_get_idx(state, 1); - *pos += !!PTR_ERR(v); - return v; + ++*pos; + return lec_get_idx(state, 1); } static int lec_seq_show(struct seq_file *seq, void *v) diff --git a/net/atm/raw.c b/net/atm/raw.c index b3ba44aab0ee..2b5f78a7ec3e 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -54,6 +54,8 @@ static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb) kfree_skb(skb); return -EADDRNOTAVAIL; } + if (vcc->dev->ops->send_bh) + return vcc->dev->ops->send_bh(vcc, skb); return vcc->dev->ops->send(vcc, skb); } @@ -71,7 +73,10 @@ int atm_init_aal34(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } @@ -80,7 +85,10 @@ int atm_init_aal5(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } EXPORT_SYMBOL(atm_init_aal5); diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9a47ef8b95c4..1f1f5b0873b2 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -391,6 +391,7 @@ out: /** * batadv_frag_create() - create a fragment from skb + * @net_dev: outgoing device for fragment * @skb: skb to create fragment from * @frag_head: header to use in new fragment * @fragment_size: size of new fragment @@ -401,22 +402,25 @@ out: * * Return: the new fragment, NULL on error. */ -static struct sk_buff *batadv_frag_create(struct sk_buff *skb, +static struct sk_buff *batadv_frag_create(struct net_device *net_dev, + struct sk_buff *skb, struct batadv_frag_packet *frag_head, unsigned int fragment_size) { + unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev); + unsigned int tailroom = net_dev->needed_tailroom; struct sk_buff *skb_fragment; unsigned int header_size = sizeof(*frag_head); unsigned int mtu = fragment_size + header_size; - skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); + skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom); if (!skb_fragment) goto err; skb_fragment->priority = skb->priority; /* Eat the last mtu-bytes of the skb */ - skb_reserve(skb_fragment, header_size + ETH_HLEN); + skb_reserve(skb_fragment, ll_reserved + header_size); skb_split(skb, skb_fragment, skb->len - fragment_size); /* Add the header */ @@ -439,11 +443,12 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) { + struct net_device *net_dev = neigh_node->if_incoming->net_dev; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned int mtu = net_dev->mtu; unsigned int header_size = sizeof(frag_header); unsigned int max_fragment_size, num_fragments; int ret; @@ -503,7 +508,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto put_primary_if; } - skb_fragment = batadv_frag_create(skb, &frag_header, + skb_fragment = batadv_frag_create(net_dev, skb, &frag_header, max_fragment_size); if (!skb_fragment) { ret = -ENOMEM; @@ -522,13 +527,14 @@ int batadv_frag_send_packet(struct sk_buff *skb, frag_header.no++; } - /* Make room for the fragment header. */ - if (batadv_skb_head_push(skb, header_size) < 0 || - pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) { - ret = -ENOMEM; + /* make sure that there is at least enough head for the fragmentation + * and ethernet headers + */ + ret = skb_cow_head(skb, ETH_HLEN + header_size); + if (ret < 0) goto put_primary_if; - } + skb_push(skb, header_size); memcpy(skb->data, &frag_header, header_size); /* Send the last fragment */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index dad99641df2a..33904595fc56 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -554,6 +554,9 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); + /* fragmentation headers don't strip the unicast/... header */ + needed_headroom += sizeof(struct batadv_frag_packet); + soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index a67b2b091447..c0ca5fbe5b08 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -180,6 +180,7 @@ static const struct file_operations batadv_log_fops = { .read = batadv_log_read, .poll = batadv_log_poll, .llseek = no_llseek, + .owner = THIS_MODULE, }; /** diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 8579bfeb2836..4b39534a14a1 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -12,12 +12,13 @@ struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; + struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; - __u8 evt_prefix[0]; + __u8 evt_prefix[]; } __packed; struct msft_data { diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 80879196560c..3c8ded7d3e84 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -73,3 +73,14 @@ config BRIDGE_MRP Say N to exclude this support and reduce the binary size. If unsure, say N. + +config BRIDGE_CFM + bool "CFM protocol" + depends on BRIDGE + help + If you say Y here, then the Ethernet bridge will be able to run CFM + protocol according to 802.1Q section 12.14 + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. diff --git a/net/bridge/Makefile b/net/bridge/Makefile index ccb394236fbd..4702702a74d3 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -27,3 +27,5 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o + +bridge-$(CONFIG_BRIDGE_CFM) += br_cfm.o br_cfm_netlink.o diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c new file mode 100644 index 000000000000..001064f7583d --- /dev/null +++ b/net/bridge/br_cfm.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/cfm_bridge.h> +#include <uapi/linux/cfm_bridge.h> +#include "br_private_cfm.h" + +static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance) +{ + struct br_cfm_mep *mep; + + hlist_for_each_entry(mep, &br->mep_list, head) + if (mep->instance == instance) + return mep; + + return NULL; +} + +static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br, + u32 ifindex) +{ + struct br_cfm_mep *mep; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head, + lockdep_rtnl_is_held()) + if (mep->create.ifindex == ifindex) + return mep; + + return NULL; +} + +static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep, + u32 mepid) +{ + struct br_cfm_peer_mep *peer_mep; + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head, + lockdep_rtnl_is_held()) + if (peer_mep->mepid == mepid) + return peer_mep; + + return NULL; +} + +static struct net_bridge_port *br_mep_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) + if (port->dev->ifindex == ifindex) + return port; + + return NULL; +} + +/* Calculate the CCM interval in us. */ +static u32 interval_to_us(enum br_cfm_ccm_interval interval) +{ + switch (interval) { + case BR_CFM_CCM_INTERVAL_NONE: + return 0; + case BR_CFM_CCM_INTERVAL_3_3_MS: + return 3300; + case BR_CFM_CCM_INTERVAL_10_MS: + return 10 * 1000; + case BR_CFM_CCM_INTERVAL_100_MS: + return 100 * 1000; + case BR_CFM_CCM_INTERVAL_1_SEC: + return 1000 * 1000; + case BR_CFM_CCM_INTERVAL_10_SEC: + return 10 * 1000 * 1000; + case BR_CFM_CCM_INTERVAL_1_MIN: + return 60 * 1000 * 1000; + case BR_CFM_CCM_INTERVAL_10_MIN: + return 10 * 60 * 1000 * 1000; + } + return 0; +} + +/* Convert the interface interval to CCM PDU value. */ +static u32 interval_to_pdu(enum br_cfm_ccm_interval interval) +{ + switch (interval) { + case BR_CFM_CCM_INTERVAL_NONE: + return 0; + case BR_CFM_CCM_INTERVAL_3_3_MS: + return 1; + case BR_CFM_CCM_INTERVAL_10_MS: + return 2; + case BR_CFM_CCM_INTERVAL_100_MS: + return 3; + case BR_CFM_CCM_INTERVAL_1_SEC: + return 4; + case BR_CFM_CCM_INTERVAL_10_SEC: + return 5; + case BR_CFM_CCM_INTERVAL_1_MIN: + return 6; + case BR_CFM_CCM_INTERVAL_10_MIN: + return 7; + } + return 0; +} + +/* Convert the CCM PDU value to interval on interface. */ +static u32 pdu_to_interval(u32 value) +{ + switch (value) { + case 0: + return BR_CFM_CCM_INTERVAL_NONE; + case 1: + return BR_CFM_CCM_INTERVAL_3_3_MS; + case 2: + return BR_CFM_CCM_INTERVAL_10_MS; + case 3: + return BR_CFM_CCM_INTERVAL_100_MS; + case 4: + return BR_CFM_CCM_INTERVAL_1_SEC; + case 5: + return BR_CFM_CCM_INTERVAL_10_SEC; + case 6: + return BR_CFM_CCM_INTERVAL_1_MIN; + case 7: + return BR_CFM_CCM_INTERVAL_10_MIN; + } + return BR_CFM_CCM_INTERVAL_NONE; +} + +static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) +{ + u32 interval_us; + + interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval); + /* Function ccm_rx_dwork must be called with 1/4 + * of the configured CC 'expected_interval' + * in order to detect CCM defect after 3.25 interval. + */ + queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, + usecs_to_jiffies(interval_us / 4)); +} + +static void br_cfm_notify(int event, const struct net_bridge_port *port) +{ + u32 filter = RTEXT_FILTER_CFM_STATUS; + + return br_info_notify(event, port->br, NULL, filter); +} + +static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) +{ + memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status)); + peer_mep->ccm_rx_count_miss = 0; + + ccm_rx_timer_start(peer_mep); +} + +static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep) +{ + cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); +} + +static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep, + const struct br_cfm_cc_ccm_tx_info *const tx_info) + +{ + struct br_cfm_common_hdr *common_hdr; + struct net_bridge_port *b_port; + struct br_cfm_maid *maid; + u8 *itu_reserved, *e_tlv; + struct ethhdr *eth_hdr; + struct sk_buff *skb; + __be32 *status_tlv; + __be32 *snumber; + __be16 *mepid; + + skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + rcu_read_lock(); + b_port = rcu_dereference(mep->b_port); + if (!b_port) { + kfree_skb(skb); + rcu_read_unlock(); + return NULL; + } + skb->dev = b_port->dev; + rcu_read_unlock(); + /* The device cannot be deleted until the work_queue functions has + * completed. This function is called from ccm_tx_work_expired() + * that is a work_queue functions. + */ + + skb->protocol = htons(ETH_P_CFM); + skb->priority = CFM_FRAME_PRIO; + + /* Ethernet header */ + eth_hdr = skb_put(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr); + ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr); + eth_hdr->h_proto = htons(ETH_P_CFM); + + /* Common CFM Header */ + common_hdr = skb_put(skb, sizeof(*common_hdr)); + common_hdr->mdlevel_version = mep->config.mdlevel << 5; + common_hdr->opcode = BR_CFM_OPCODE_CCM; + common_hdr->flags = (mep->rdi << 7) | + interval_to_pdu(mep->cc_config.exp_interval); + common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET; + + /* Sequence number */ + snumber = skb_put(skb, sizeof(*snumber)); + if (tx_info->seq_no_update) { + *snumber = cpu_to_be32(mep->ccm_tx_snumber); + mep->ccm_tx_snumber += 1; + } else { + *snumber = 0; + } + + mepid = skb_put(skb, sizeof(*mepid)); + *mepid = cpu_to_be16((u16)mep->config.mepid); + + maid = skb_put(skb, sizeof(*maid)); + memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data)); + + /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */ + itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE); + memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE); + + /* Generel CFM TLV format: + * TLV type: one byte + * TLV value length: two bytes + * TLV value: 'TLV value length' bytes + */ + + /* Port status TLV. The value length is 1. Total of 4 bytes. */ + if (tx_info->port_tlv) { + status_tlv = skb_put(skb, sizeof(*status_tlv)); + *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) | + (1 << 8) | /* Value length */ + (tx_info->port_tlv_value & 0xFF)); + } + + /* Interface status TLV. The value length is 1. Total of 4 bytes. */ + if (tx_info->if_tlv) { + status_tlv = skb_put(skb, sizeof(*status_tlv)); + *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) | + (1 << 8) | /* Value length */ + (tx_info->if_tlv_value & 0xFF)); + } + + /* End TLV */ + e_tlv = skb_put(skb, sizeof(*e_tlv)); + *e_tlv = CFM_ENDE_TLV_TYPE; + + return skb; +} + +static void ccm_frame_tx(struct sk_buff *skb) +{ + skb_reset_network_header(skb); + dev_queue_xmit(skb); +} + +/* This function is called with the configured CC 'expected_interval' + * in order to drive CCM transmission when enabled. + */ +static void ccm_tx_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work; + struct br_cfm_mep *mep; + struct sk_buff *skb; + u32 interval_us; + + del_work = to_delayed_work(work); + mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork); + + if (time_before_eq(mep->ccm_tx_end, jiffies)) { + /* Transmission period has ended */ + mep->cc_ccm_tx_info.period = 0; + return; + } + + skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info); + if (skb) + ccm_frame_tx(skb); + + interval_us = interval_to_us(mep->cc_config.exp_interval); + queue_delayed_work(system_wq, &mep->ccm_tx_dwork, + usecs_to_jiffies(interval_us)); +} + +/* This function is called with 1/4 of the configured CC 'expected_interval' + * in order to detect CCM defect after 3.25 interval. + */ +static void ccm_rx_work_expired(struct work_struct *work) +{ + struct br_cfm_peer_mep *peer_mep; + struct net_bridge_port *b_port; + struct delayed_work *del_work; + + del_work = to_delayed_work(work); + peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork); + + /* After 13 counts (4 * 3,25) then 3.25 intervals are expired */ + if (peer_mep->ccm_rx_count_miss < 13) { + /* 3.25 intervals are NOT expired without CCM reception */ + peer_mep->ccm_rx_count_miss++; + + /* Start timer again */ + ccm_rx_timer_start(peer_mep); + } else { + /* 3.25 intervals are expired without CCM reception. + * CCM defect detected + */ + peer_mep->cc_status.ccm_defect = true; + + /* Change in CCM defect status - notify */ + rcu_read_lock(); + b_port = rcu_dereference(peer_mep->mep->b_port); + if (b_port) + br_cfm_notify(RTM_NEWLINK, b_port); + rcu_read_unlock(); + } +} + +static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index, + struct br_cfm_peer_mep *peer_mep) +{ + __be32 *s_tlv; + __be32 _s_tlv; + u32 h_s_tlv; + u8 *e_tlv; + u8 _e_tlv; + + e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv); + if (!e_tlv) + return 0; + + /* TLV is present - get the status TLV */ + s_tlv = skb_header_pointer(skb, + index, + sizeof(_s_tlv), &_s_tlv); + if (!s_tlv) + return 0; + + h_s_tlv = ntohl(*s_tlv); + if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) { + /* Interface status TLV */ + peer_mep->cc_status.tlv_seen = true; + peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF); + } + + if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) { + /* Port status TLV */ + peer_mep->cc_status.tlv_seen = true; + peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF); + } + + /* The Sender ID TLV is not handled */ + /* The Organization-Specific TLV is not handled */ + + /* Return the length of this tlv. + * This is the length of the value field plus 3 bytes for size of type + * field and length field + */ + return ((h_s_tlv >> 8) & 0xFFFF) + 3; +} + +/* note: already called with rcu_read_lock */ +static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb) +{ + u32 mdlevel, interval, size, index, max; + const struct br_cfm_common_hdr *hdr; + struct br_cfm_peer_mep *peer_mep; + const struct br_cfm_maid *maid; + struct br_cfm_common_hdr _hdr; + struct br_cfm_maid _maid; + struct br_cfm_mep *mep; + struct net_bridge *br; + __be32 *snumber; + __be32 _snumber; + __be16 *mepid; + __be16 _mepid; + + if (port->state == BR_STATE_DISABLED) + return 0; + + hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr); + if (!hdr) + return 1; + + br = port->br; + mep = br_mep_find_ifindex(br, port->dev->ifindex); + if (unlikely(!mep)) + /* No MEP on this port - must be forwarded */ + return 0; + + mdlevel = hdr->mdlevel_version >> 5; + if (mdlevel > mep->config.mdlevel) + /* The level is above this MEP level - must be forwarded */ + return 0; + + if ((hdr->mdlevel_version & 0x1F) != 0) { + /* Invalid version */ + mep->status.version_unexp_seen = true; + return 1; + } + + if (mdlevel < mep->config.mdlevel) { + /* The level is below this MEP level */ + mep->status.rx_level_low_seen = true; + return 1; + } + + if (hdr->opcode == BR_CFM_OPCODE_CCM) { + /* CCM PDU received. */ + /* MA ID is after common header + sequence number + MEP ID */ + maid = skb_header_pointer(skb, + CFM_CCM_PDU_MAID_OFFSET, + sizeof(_maid), &_maid); + if (!maid) + return 1; + if (memcmp(maid->data, mep->cc_config.exp_maid.data, + sizeof(maid->data))) + /* MA ID not as expected */ + return 1; + + /* MEP ID is after common header + sequence number */ + mepid = skb_header_pointer(skb, + CFM_CCM_PDU_MEPID_OFFSET, + sizeof(_mepid), &_mepid); + if (!mepid) + return 1; + peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid)); + if (!peer_mep) + return 1; + + /* Interval is in common header flags */ + interval = hdr->flags & 0x07; + if (mep->cc_config.exp_interval != pdu_to_interval(interval)) + /* Interval not as expected */ + return 1; + + /* A valid CCM frame is received */ + if (peer_mep->cc_status.ccm_defect) { + peer_mep->cc_status.ccm_defect = false; + + /* Change in CCM defect status - notify */ + br_cfm_notify(RTM_NEWLINK, port); + + /* Start CCM RX timer */ + ccm_rx_timer_start(peer_mep); + } + + peer_mep->cc_status.seen = true; + peer_mep->ccm_rx_count_miss = 0; + + /* RDI is in common header flags */ + peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false; + + /* Sequence number is after common header */ + snumber = skb_header_pointer(skb, + CFM_CCM_PDU_SEQNR_OFFSET, + sizeof(_snumber), &_snumber); + if (!snumber) + return 1; + if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1)) + /* Unexpected sequence number */ + peer_mep->cc_status.seq_unexp_seen = true; + + mep->ccm_rx_snumber = ntohl(*snumber); + + /* TLV end is after common header + sequence number + MEP ID + + * MA ID + ITU reserved + */ + index = CFM_CCM_PDU_TLV_OFFSET; + max = 0; + do { /* Handle all TLVs */ + size = ccm_tlv_extract(skb, index, peer_mep); + index += size; + max += 1; + } while (size != 0 && max < 4); /* Max four TLVs possible */ + + return 1; + } + + mep->status.opcode_unexp_seen = true; + + return 1; +} + +static struct br_frame_type cfm_frame_type __read_mostly = { + .type = cpu_to_be16(ETH_P_CFM), + .frame_handler = br_cfm_frame_rx, +}; + +int br_cfm_mep_create(struct net_bridge *br, + const u32 instance, + struct br_cfm_mep_create *const create, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + if (create->domain == BR_CFM_VLAN) { + NL_SET_ERR_MSG_MOD(extack, + "VLAN domain not supported"); + return -EINVAL; + } + if (create->domain != BR_CFM_PORT) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid domain value"); + return -EINVAL; + } + if (create->direction == BR_CFM_MEP_DIRECTION_UP) { + NL_SET_ERR_MSG_MOD(extack, + "Up-MEP not supported"); + return -EINVAL; + } + if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid direction value"); + return -EINVAL; + } + p = br_mep_get_port(br, create->ifindex); + if (!p) { + NL_SET_ERR_MSG_MOD(extack, + "Port is not related to bridge"); + return -EINVAL; + } + mep = br_mep_find(br, instance); + if (mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance already exists"); + return -EEXIST; + } + + /* In PORT domain only one instance can be created per port */ + if (create->domain == BR_CFM_PORT) { + mep = br_mep_find_ifindex(br, create->ifindex); + if (mep) { + NL_SET_ERR_MSG_MOD(extack, + "Only one Port MEP on a port allowed"); + return -EINVAL; + } + } + + mep = kzalloc(sizeof(*mep), GFP_KERNEL); + if (!mep) + return -ENOMEM; + + mep->create = *create; + mep->instance = instance; + rcu_assign_pointer(mep->b_port, p); + + INIT_HLIST_HEAD(&mep->peer_mep_list); + INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired); + + if (hlist_empty(&br->mep_list)) + br_add_frame(br, &cfm_frame_type); + + hlist_add_tail_rcu(&mep->head, &br->mep_list); + + return 0; +} + +static void mep_delete_implementation(struct net_bridge *br, + struct br_cfm_mep *mep) +{ + struct br_cfm_peer_mep *peer_mep; + struct hlist_node *n_store; + + ASSERT_RTNL(); + + /* Empty and free peer MEP list */ + hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) { + cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); + hlist_del_rcu(&peer_mep->head); + kfree_rcu(peer_mep, rcu); + } + + cancel_delayed_work_sync(&mep->ccm_tx_dwork); + + RCU_INIT_POINTER(mep->b_port, NULL); + hlist_del_rcu(&mep->head); + kfree_rcu(mep, rcu); + + if (hlist_empty(&br->mep_list)) + br_del_frame(br, &cfm_frame_type); +} + +int br_cfm_mep_delete(struct net_bridge *br, + const u32 instance, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep_delete_implementation(br, mep); + + return 0; +} + +int br_cfm_mep_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_mep_config *const config, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep->config = *config; + + return 0; +} + +int br_cfm_cc_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_cc_config *const config, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + /* Check for no change in configuration */ + if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0) + return 0; + + if (config->enable && !mep->cc_config.enable) + /* CC is enabled */ + hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) + cc_peer_enable(peer_mep); + + if (!config->enable && mep->cc_config.enable) + /* CC is disabled */ + hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) + cc_peer_disable(peer_mep); + + mep->cc_config = *config; + mep->ccm_rx_snumber = 0; + mep->ccm_tx_snumber = 1; + + return 0; +} + +int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, + u32 mepid, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + peer_mep = br_peer_mep_find(mep, mepid); + if (peer_mep) { + NL_SET_ERR_MSG_MOD(extack, + "Peer MEP-ID already exists"); + return -EEXIST; + } + + peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL); + if (!peer_mep) + return -ENOMEM; + + peer_mep->mepid = mepid; + peer_mep->mep = mep; + INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired); + + if (mep->cc_config.enable) + cc_peer_enable(peer_mep); + + hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list); + + return 0; +} + +int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, + u32 mepid, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + peer_mep = br_peer_mep_find(mep, mepid); + if (!peer_mep) { + NL_SET_ERR_MSG_MOD(extack, + "Peer MEP-ID does not exists"); + return -ENOENT; + } + + cc_peer_disable(peer_mep); + + hlist_del_rcu(&peer_mep->head); + kfree_rcu(peer_mep, rcu); + + return 0; +} + +int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, + const bool rdi, struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep->rdi = rdi; + + return 0; +} + +int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, + const struct br_cfm_cc_ccm_tx_info *const tx_info, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) { + /* No change in tx_info. */ + if (mep->cc_ccm_tx_info.period == 0) + /* Transmission is not enabled - just return */ + return 0; + + /* Transmission is ongoing, the end time is recalculated */ + mep->ccm_tx_end = jiffies + + usecs_to_jiffies(tx_info->period * 1000000); + return 0; + } + + if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0) + /* Some change in info and transmission is not ongoing */ + goto save; + + if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) { + /* Some change in info and transmission is ongoing + * The end time is recalculated + */ + mep->ccm_tx_end = jiffies + + usecs_to_jiffies(tx_info->period * 1000000); + + goto save; + } + + if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) { + cancel_delayed_work_sync(&mep->ccm_tx_dwork); + goto save; + } + + /* Start delayed work to transmit CCM frames. It is done with zero delay + * to send first frame immediately + */ + mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); + queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); + +save: + mep->cc_ccm_tx_info = *tx_info; + + return 0; +} + +int br_cfm_mep_count(struct net_bridge *br, u32 *count) +{ + struct br_cfm_mep *mep; + + *count = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mep, &br->mep_list, head) + *count += 1; + rcu_read_unlock(); + + return 0; +} + +int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + *count = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mep, &br->mep_list, head) + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) + *count += 1; + rcu_read_unlock(); + + return 0; +} + +bool br_cfm_created(struct net_bridge *br) +{ + return !hlist_empty(&br->mep_list); +} + +/* Deletes the CFM instances on a specific bridge port + */ +void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port) +{ + struct hlist_node *n_store; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head) + if (mep->create.ifindex == port->dev->ifindex) + mep_delete_implementation(br, mep); +} diff --git a/net/bridge/br_cfm_netlink.c b/net/bridge/br_cfm_netlink.c new file mode 100644 index 000000000000..5c4c369f8536 --- /dev/null +++ b/net/bridge/br_cfm_netlink.c @@ -0,0 +1,726 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/genetlink.h> + +#include "br_private.h" +#include "br_private_cfm.h" + +static const struct nla_policy +br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR, + [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7), + [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), +}; + +static const struct nla_policy +br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = { + .type = NLA_BINARY, .len = CFM_MAID_LENGTH }, +}; + +static const struct nla_policy +br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), +}; + +static const struct nla_policy +br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR, + [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 }, +}; + +static const struct nla_policy +br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = { + [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CREATE] = + NLA_POLICY_NESTED(br_cfm_mep_create_policy), + [IFLA_BRIDGE_CFM_MEP_DELETE] = + NLA_POLICY_NESTED(br_cfm_mep_delete_policy), + [IFLA_BRIDGE_CFM_MEP_CONFIG] = + NLA_POLICY_NESTED(br_cfm_mep_config_policy), + [IFLA_BRIDGE_CFM_CC_CONFIG] = + NLA_POLICY_NESTED(br_cfm_cc_config_policy), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = + NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = + NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), + [IFLA_BRIDGE_CFM_CC_RDI] = + NLA_POLICY_NESTED(br_cfm_cc_rdi_policy), + [IFLA_BRIDGE_CFM_CC_CCM_TX] = + NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy), +}; + +static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1]; + struct br_cfm_mep_create create; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr, + br_cfm_mep_create_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute"); + return -EINVAL; + } + + memset(&create, 0, sizeof(create)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]); + create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]); + create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]); + create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]); + + return br_cfm_mep_create(br, instance, &create, extack); +} + +static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1]; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr, + br_cfm_mep_delete_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing INSTANCE attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]); + + return br_cfm_mep_delete(br, instance, extack); +} + +static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1]; + struct br_cfm_mep_config config; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr, + br_cfm_mep_config_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) { + NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute"); + return -EINVAL; + } + + memset(&config, 0, sizeof(config)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]); + nla_memcpy(&config.unicast_mac.addr, + tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC], + sizeof(config.unicast_mac.addr)); + config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]); + config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]); + + return br_cfm_mep_config_set(br, instance, &config, extack); +} + +static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1]; + struct br_cfm_cc_config config; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr, + br_cfm_cc_config_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute"); + return -EINVAL; + } + + memset(&config, 0, sizeof(config)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]); + config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]); + config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]); + nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID], + sizeof(config.exp_maid.data)); + + return br_cfm_cc_config_set(br, instance, &config, extack); +} + +static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; + u32 instance, peer_mep_id; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, + br_cfm_cc_peer_mep_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); + peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); + + return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack); +} + +static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; + u32 instance, peer_mep_id; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, + br_cfm_cc_peer_mep_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); + peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); + + return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack); +} + +static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1]; + u32 instance, rdi; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr, + br_cfm_cc_rdi_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) { + NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); + rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]); + + return br_cfm_cc_rdi_set(br, instance, rdi, extack); +} + +static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1]; + struct br_cfm_cc_ccm_tx_info tx_info; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr, + br_cfm_cc_ccm_tx_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute"); + return -EINVAL; + } + + memset(&tx_info, 0, sizeof(tx_info)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); + nla_memcpy(&tx_info.dmac.addr, + tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC], + sizeof(tx_info.dmac.addr)); + tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]); + tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]); + tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]); + tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]); + tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]); + tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]); + + return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack); +} + +int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1]; + int err; + + /* When this function is called for a port then the br pointer is + * invalid, therefor set the br to point correctly + */ + if (p) + br = p->br; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr, + br_cfm_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) { + err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) { + err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) { + err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) { + err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) { + err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) { + err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_RDI]) { + err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) { + err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX], + extack); + if (err) + return err; + } + + return 0; +} + +int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + struct nlattr *tb; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head) { + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, + mep->create.domain)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, + mep->create.direction)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, + mep->create.ifindex)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, + sizeof(mep->config.unicast_mac.addr), + mep->config.unicast_mac.addr)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, + mep->config.mdlevel)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, + mep->config.mepid)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, + mep->cc_config.enable)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, + mep->cc_config.exp_interval)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, + sizeof(mep->cc_config.exp_maid.data), + mep->cc_config.exp_maid.data)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI, + mep->rdi)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, + sizeof(mep->cc_ccm_tx_info.dmac), + mep->cc_ccm_tx_info.dmac.addr)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, + mep->cc_ccm_tx_info.seq_no_update)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, + mep->cc_ccm_tx_info.period)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, + mep->cc_ccm_tx_info.if_tlv)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, + mep->cc_ccm_tx_info.if_tlv_value)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, + mep->cc_ccm_tx_info.port_tlv)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, + mep->cc_ccm_tx_info.port_tlv_value)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { + tb = nla_nest_start(skb, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID, + peer_mep->mepid)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + } + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + return -EMSGSIZE; +} + +int br_cfm_status_fill_info(struct sk_buff *skb, + struct net_bridge *br, + bool getlink) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + struct nlattr *tb; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head) { + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, + mep->status.opcode_unexp_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, + mep->status.version_unexp_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, + mep->status.rx_level_low_seen)) + goto nla_put_failure; + + /* Only clear if this is a GETLINK */ + if (getlink) { + /* Clear all 'seen' indications */ + mep->status.opcode_unexp_seen = false; + mep->status.version_unexp_seen = false; + mep->status.rx_level_low_seen = false; + } + + nla_nest_end(skb, tb); + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { + tb = nla_nest_start(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, + peer_mep->mepid)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, + peer_mep->cc_status.ccm_defect)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, + peer_mep->cc_status.rdi)) + goto nla_put_failure; + + if (nla_put_u8(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, + peer_mep->cc_status.port_tlv_value)) + goto nla_put_failure; + + if (nla_put_u8(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, + peer_mep->cc_status.if_tlv_value)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, + peer_mep->cc_status.seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, + peer_mep->cc_status.tlv_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, + peer_mep->cc_status.seq_unexp_seen)) + goto nla_put_failure; + + if (getlink) { /* Only clear if this is a GETLINK */ + /* Clear all 'seen' indications */ + peer_mep->cc_status.seen = false; + peer_mep->cc_status.tlv_seen = false; + peer_mep->cc_status.seq_unexp_seen = false; + } + + nla_nest_end(skb, tb); + } + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + return -EMSGSIZE; +} diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 6f742fee874a..adb674a860d3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -30,7 +30,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; - struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; const unsigned char *dest; @@ -45,10 +44,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } - u64_stats_update_begin(&brstats->syncp); - brstats->tx_packets++; - brstats->tx_bytes += skb->len; - u64_stats_update_end(&brstats->syncp); + dev_sw_netstats_tx_add(dev, 1, skb->len); br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; @@ -93,7 +89,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) br_multicast_flood(mdst, skb, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); @@ -119,26 +115,26 @@ static int br_dev_init(struct net_device *dev) struct net_bridge *br = netdev_priv(dev); int err; - br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!br->stats) + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) return -ENOMEM; err = br_fdb_hash_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); return err; } err = br_mdb_hash_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_fdb_hash_fini(br); return err; } err = br_vlan_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; @@ -146,7 +142,7 @@ static int br_dev_init(struct net_device *dev) err = br_multicast_init_stats(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); @@ -165,7 +161,7 @@ static void br_dev_uninit(struct net_device *dev) br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); - free_percpu(br->stats); + free_percpu(dev->tstats); } static int br_dev_open(struct net_device *dev) @@ -202,14 +198,6 @@ static int br_dev_stop(struct net_device *dev) return 0; } -static void br_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - struct net_bridge *br = netdev_priv(dev); - - dev_fetch_sw_netstats(stats, br->stats); -} - static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); @@ -403,7 +391,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_init = br_dev_init, .ndo_uninit = br_dev_uninit, .ndo_start_xmit = br_dev_xmit, - .ndo_get_stats64 = br_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = br_set_mac_address, .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, @@ -454,8 +442,12 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); + INIT_HLIST_HEAD(&br->frame_type_list); #if IS_ENABLED(CONFIG_BRIDGE_MRP) - INIT_LIST_HEAD(&br->mrp_list); + INIT_HLIST_HEAD(&br->mrp_list); +#endif +#if IS_ENABLED(CONFIG_BRIDGE_CFM) + INIT_HLIST_HEAD(&br->mep_list); #endif spin_lock_init(&br->hash_lock); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a0e9a7937412..f7d2f472ae24 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -334,6 +334,7 @@ static void del_nbp(struct net_bridge_port *p) spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); + br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 59a318b9f646..8ca1f1bc6d12 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -35,12 +35,8 @@ static int br_pass_frame_up(struct sk_buff *skb) struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; - struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); - u64_stats_update_begin(&brstats->syncp); - brstats->rx_packets++; - brstats->rx_bytes += skb->len; - u64_stats_update_end(&brstats->syncp); + dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Bridge is just like any other port. Make sure the @@ -134,7 +130,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb case BR_PKT_MULTICAST: mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) { + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; @@ -254,6 +250,21 @@ frame_finish: return RX_HANDLER_CONSUMED; } +/* Return 0 if the frame was not processed otherwise 1 + * note: already called with rcu_read_lock + */ +static int br_process_frame_type(struct net_bridge_port *p, + struct sk_buff *skb) +{ + struct br_frame_type *tmp; + + hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) + if (unlikely(tmp->type == skb->protocol)) + return tmp->frame_handler(p, skb); + + return 0; +} + /* * Return NULL if skb is handled * note: already called with rcu_read_lock @@ -343,7 +354,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } - if (unlikely(br_mrp_process(p, skb))) + if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: @@ -380,3 +391,19 @@ rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) return br_handle_frame; } + +void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) +{ + hlist_add_head_rcu(&ft->list, &br->frame_type_list); +} + +void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) +{ + struct br_frame_type *tmp; + + hlist_for_each_entry(tmp, &br->frame_type_list, list) + if (ft == tmp) { + hlist_del_rcu(&ft->list); + return; + } +} diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e15bab19a012..8846c5bcd075 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -87,6 +87,8 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif + default: + ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } @@ -174,9 +176,11 @@ static int __mdb_fill_info(struct sk_buff *skb, if (mp->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (mp->addr.proto == htons(ETH_P_IPV6)) + else if (mp->addr.proto == htons(ETH_P_IPV6)) e.addr.u.ip6 = mp->addr.dst.ip6; #endif + else + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); @@ -210,6 +214,8 @@ static int __mdb_fill_info(struct sk_buff *skb, } break; #endif + default: + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) @@ -562,9 +568,12 @@ void br_mdb_notify(struct net_device *dev, if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) - else + else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); #endif + else + ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr); + mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: @@ -693,6 +702,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return false; } #endif + } else if (entry->addr.proto == 0) { + /* L2 mdb */ + if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { + NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast"); + return false; + } } else { NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); return false; @@ -831,6 +846,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group __rcu **pp; struct br_ip group, star_group; unsigned long now = jiffies; + unsigned char flags = 0; u8 filter_mode; int err; @@ -849,6 +865,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } } + if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) { + NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); + return -EINVAL; + } + mp = br_mdb_ip_get(br, &group); if (!mp) { mp = br_multicast_new_group(br, &group); @@ -884,7 +905,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE : MCAST_INCLUDE; - p = br_multicast_new_port_group(port, &group, *pp, entry->state, NULL, + if (entry->state == MDB_PERMANENT) + flags |= MDB_PG_FLAGS_PERMANENT; + + p = br_multicast_new_port_group(port, &group, *pp, flags, NULL, filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index b36689e6e7cb..cec2c4e4561d 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -6,6 +6,13 @@ static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 }; +static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); + +static struct br_frame_type mrp_frame_type __read_mostly = { + .type = cpu_to_be16(ETH_P_MRP), + .frame_handler = br_mrp_process, +}; + static bool br_mrp_is_ring_port(struct net_bridge_port *p_port, struct net_bridge_port *s_port, struct net_bridge_port *port) @@ -47,8 +54,8 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (mrp->ring_id == ring_id) { res = mrp; break; @@ -63,8 +70,8 @@ static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id) struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (mrp->in_id == in_id) { res = mrp; break; @@ -78,8 +85,8 @@ static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) { struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { struct net_bridge_port *p; p = rtnl_dereference(mrp->p_port); @@ -104,8 +111,8 @@ static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (rcu_access_pointer(mrp->p_port) == p || rcu_access_pointer(mrp->s_port) == p || rcu_access_pointer(mrp->i_port) == p) { @@ -443,8 +450,11 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) rcu_assign_pointer(mrp->i_port, NULL); } - list_del_rcu(&mrp->list); + hlist_del_rcu(&mrp->list); kfree_rcu(mrp, rcu); + + if (hlist_empty(&br->mrp_list)) + br_del_frame(br, &mrp_frame_type); } /* Adds a new MRP instance. @@ -493,9 +503,12 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) spin_unlock_bh(&br->lock); rcu_assign_pointer(mrp->s_port, p); + if (hlist_empty(&br->mrp_list)) + br_add_frame(br, &mrp_frame_type); + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired); - list_add_tail_rcu(&mrp->list, &br->mrp_list); + hlist_add_tail_rcu(&mrp->list, &br->mrp_list); err = br_mrp_switchdev_add(br, mrp); if (err) @@ -845,7 +858,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb) if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST || hdr->type == BR_MRP_TLV_HEADER_IN_TOPO || hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN || - hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP) + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) return true; return false; @@ -1113,9 +1127,9 @@ static int br_mrp_rcv(struct net_bridge_port *p, goto no_forward; } } else { - /* MIM should forward IntLinkChange and + /* MIM should forward IntLinkChange/Status and * IntTopoChange between ring ports but MIM - * should not forward IntLinkChange and + * should not forward IntLinkChange/Status and * IntTopoChange if the frame was received at * the interconnect port */ @@ -1142,6 +1156,17 @@ static int br_mrp_rcv(struct net_bridge_port *p, in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN)) goto forward; + /* MIC should forward IntLinkStatus frames only to + * interconnect port if it was received on a ring port. + * If it is received on interconnect port then, it + * should be forward on both ring ports + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) { + p_dst = NULL; + s_dst = NULL; + } + /* Should forward the InTopo frames only between the * ring ports */ @@ -1172,20 +1197,18 @@ no_forward: * normal forwarding. * note: already called with rcu_read_lock */ -int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) { /* If there is no MRP instance do normal forwarding */ if (likely(!(p->flags & BR_MRP_AWARE))) goto out; - if (unlikely(skb->protocol == htons(ETH_P_MRP))) - return br_mrp_rcv(p, skb, p->dev); - + return br_mrp_rcv(p, skb, p->dev); out: return 0; } bool br_mrp_enabled(struct net_bridge *br) { - return !list_empty(&br->mrp_list); + return !hlist_empty(&br->mrp_list); } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 2a2fdf3500c5..ce6f63c77cc0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -453,7 +453,7 @@ int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) if (!mrp_tb) return -EMSGSIZE; - list_for_each_entry_rcu(mrp, &br->mrp_list, list) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) { struct net_bridge_port *p; tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index eae898c3cff7..484820c223a3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -179,7 +179,8 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, break; #endif default: - return NULL; + ip.proto = 0; + ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); @@ -1203,6 +1204,10 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } + + if (br_group_is_l2(&mp->addr)) + return; + mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); } @@ -1254,8 +1259,8 @@ __br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, - RTPROT_KERNEL); + p = br_multicast_new_port_group(port, group, *pp, 0, src, + filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; @@ -3690,7 +3695,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(ð, 0, sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, ð); + ret = br_multicast_querier_exists(br, ð, NULL); unlock: rcu_read_unlock(); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 04c3f9a82650..8edfb98ae1d5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -735,6 +735,11 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; @@ -835,8 +840,6 @@ static unsigned int br_nf_post_routing(void *priv, else return NF_ACCEPT; - /* We assume any code from br_dev_queue_push_xmit onwards doesn't care - * about the value of skb->pkt_type. */ if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 92d64abffa87..49700ce0e919 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,6 +16,7 @@ #include "br_private.h" #include "br_private_stp.h" +#include "br_private_cfm.h" #include "br_private_tunnel.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, @@ -93,9 +94,11 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, { struct net_bridge_vlan_group *vg = NULL; struct net_bridge_port *p = NULL; - struct net_bridge *br; - int num_vlan_infos; + struct net_bridge *br = NULL; + u32 num_cfm_peer_mep_infos; + u32 num_cfm_mep_infos; size_t vinfo_sz = 0; + int num_vlan_infos; rcu_read_lock(); if (netif_is_bridge_port(dev)) { @@ -114,6 +117,49 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); + if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) + return vinfo_sz; + + if (!br) + return vinfo_sz; + + /* CFM status info must be added */ + br_cfm_mep_count(br, &num_cfm_mep_infos); + br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos); + + vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */ + /* For each status struct the MEP instance (u32) is added */ + /* MEP instance (u32) + br_cfm_mep_status */ + vinfo_sz += num_cfm_mep_infos * + /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */ + (nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */ + + nla_total_size(sizeof(u32))); + /* MEP instance (u32) + br_cfm_cc_peer_status */ + vinfo_sz += num_cfm_peer_mep_infos * + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */ + (nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */ + + nla_total_size(sizeof(u8)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */ + + nla_total_size(sizeof(u8)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */ + + nla_total_size(sizeof(u32))); + return vinfo_sz; } @@ -377,7 +423,8 @@ nla_put_failure: static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, - u32 filter_mask, const struct net_device *dev) + u32 filter_mask, const struct net_device *dev, + bool getlink) { u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct nlattr *af = NULL; @@ -426,7 +473,9 @@ static int br_fill_ifinfo(struct sk_buff *skb, if (filter_mask & (RTEXT_FILTER_BRVLAN | RTEXT_FILTER_BRVLAN_COMPRESSED | - RTEXT_FILTER_MRP)) { + RTEXT_FILTER_MRP | + RTEXT_FILTER_CFM_CONFIG | + RTEXT_FILTER_CFM_STATUS)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; @@ -475,6 +524,36 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } + if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) { + struct nlattr *cfm_nest = NULL; + int err; + + if (!br_cfm_created(br) || port) + goto done; + + cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM); + if (!cfm_nest) + goto nla_put_failure; + + if (filter_mask & RTEXT_FILTER_CFM_CONFIG) { + rcu_read_lock(); + err = br_cfm_config_fill_info(skb, br); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + } + + if (filter_mask & RTEXT_FILTER_CFM_STATUS) { + rcu_read_lock(); + err = br_cfm_status_fill_info(skb, br, getlink); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + } + + nla_nest_end(skb, cfm_nest); + } + done: if (af) nla_nest_end(skb, af); @@ -486,11 +565,9 @@ nla_put_failure: return -EMSGSIZE; } -/* Notify listeners of a change in bridge or port information */ -void br_ifinfo_notify(int event, const struct net_bridge *br, - const struct net_bridge_port *port) +void br_info_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port, u32 filter) { - u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; @@ -515,7 +592,7 @@ void br_ifinfo_notify(int event, const struct net_bridge *br, if (skb == NULL) goto errout; - err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev); + err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -528,6 +605,15 @@ errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } +/* Notify listeners of a change in bridge or port information */ +void br_ifinfo_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port) +{ + u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; + + return br_info_notify(event, br, port, filter); +} + /* * Dump information about all ports, in response to GETLINK */ @@ -538,11 +624,13 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && - !(filter_mask & RTEXT_FILTER_MRP)) + !(filter_mask & RTEXT_FILTER_MRP) && + !(filter_mask & RTEXT_FILTER_CFM_CONFIG) && + !(filter_mask & RTEXT_FILTER_CFM_STATUS)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, - filter_mask, dev); + filter_mask, dev, true); } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, @@ -700,6 +788,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_CFM: + err = br_cfm_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } @@ -1631,7 +1724,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; - struct br_vlan_stats stats; + struct pcpu_sw_netstats stats; if (++vl_idx < *prividx) continue; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 345118e35c42..d538ccec0acd 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -89,14 +89,6 @@ struct bridge_mcast_stats { }; #endif -struct br_vlan_stats { - u64 rx_bytes; - u64 rx_packets; - u64 tx_bytes; - u64 tx_packets; - struct u64_stats_sync syncp; -}; - struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst *tunnel_dst; @@ -137,7 +129,7 @@ struct net_bridge_vlan { u16 flags; u16 priv_flags; u8 state; - struct br_vlan_stats __percpu *stats; + struct pcpu_sw_netstats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; @@ -383,9 +375,8 @@ enum net_bridge_opts { struct net_bridge { spinlock_t lock; spinlock_t hash_lock; - struct list_head port_list; + struct hlist_head frame_type_list; struct net_device *dev; - struct pcpu_sw_netstats __percpu *stats; unsigned long options; /* These fields are accessed on each packet */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING @@ -395,6 +386,7 @@ struct net_bridge { #endif struct rhashtable fdb_hash_tbl; + struct list_head port_list; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; @@ -481,7 +473,10 @@ struct net_bridge { struct hlist_head fdb_list; #if IS_ENABLED(CONFIG_BRIDGE_MRP) - struct list_head mrp_list; + struct hlist_head mrp_list; +#endif +#if IS_ENABLED(CONFIG_BRIDGE_CFM) + struct hlist_head mep_list; #endif }; @@ -755,6 +750,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); +struct br_frame_type { + __be16 type; + int (*frame_handler)(struct net_bridge_port *port, + struct sk_buff *skb); + struct hlist_node list; +}; + +void br_add_frame(struct net_bridge *br, struct br_frame_type *ft); +void br_del_frame(struct net_bridge *br, struct br_frame_type *ft); + static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); @@ -840,6 +845,11 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); +static inline bool br_group_is_l2(const struct br_ip *group) +{ + return group->proto == 0; +} + #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -871,7 +881,8 @@ __br_multicast_querier_exists(struct net_bridge *br, } static inline bool br_multicast_querier_exists(struct net_bridge *br, - struct ethhdr *eth) + struct ethhdr *eth, + const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): @@ -883,7 +894,7 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, &br->ip6_other_query, true); #endif default: - return false; + return !!mdb && br_group_is_l2(&mdb->addr); } } @@ -993,7 +1004,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br) } static inline bool br_multicast_querier_exists(struct net_bridge *br, - struct ethhdr *eth) + struct ethhdr *eth, + const struct net_bridge_mdb_entry *mdb) { return false; } @@ -1072,7 +1084,7 @@ void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats); + struct pcpu_sw_netstats *stats); void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); @@ -1268,7 +1280,7 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( } static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats) + struct pcpu_sw_netstats *stats) { } @@ -1417,7 +1429,6 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) #if IS_ENABLED(CONFIG_BRIDGE_MRP) int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); -int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); bool br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); @@ -1429,11 +1440,6 @@ static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return -EOPNOTSUPP; } -static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) -{ - return 0; -} - static inline bool br_mrp_enabled(struct net_bridge *br) { return false; @@ -1451,12 +1457,67 @@ static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) #endif +/* br_cfm.c */ +#if IS_ENABLED(CONFIG_BRIDGE_CFM) +int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +bool br_cfm_created(struct net_bridge *br); +void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p); +int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br); +int br_cfm_status_fill_info(struct sk_buff *skb, + struct net_bridge *br, + bool getlink); +int br_cfm_mep_count(struct net_bridge *br, u32 *count); +int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count); +#else +static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline bool br_cfm_created(struct net_bridge *br) +{ + return false; +} + +static inline void br_cfm_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} + +static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_status_fill_info(struct sk_buff *skb, + struct net_bridge *br, + bool getlink) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) +{ + return -EOPNOTSUPP; +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); +void br_info_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port, u32 filter); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); diff --git a/net/bridge/br_private_cfm.h b/net/bridge/br_private_cfm.h new file mode 100644 index 000000000000..a43a5e7fa2c3 --- /dev/null +++ b/net/bridge/br_private_cfm.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_CFM_H_ +#define _BR_PRIVATE_CFM_H_ + +#include "br_private.h" +#include <uapi/linux/cfm_bridge.h> + +struct br_cfm_mep_create { + enum br_cfm_domain domain; /* Domain for this MEP */ + enum br_cfm_mep_direction direction; /* Up or Down MEP direction */ + u32 ifindex; /* Residence port */ +}; + +int br_cfm_mep_create(struct net_bridge *br, + const u32 instance, + struct br_cfm_mep_create *const create, + struct netlink_ext_ack *extack); + +int br_cfm_mep_delete(struct net_bridge *br, + const u32 instance, + struct netlink_ext_ack *extack); + +struct br_cfm_mep_config { + u32 mdlevel; + u32 mepid; /* MEPID for this MEP */ + struct mac_addr unicast_mac; /* The MEP unicast MAC */ +}; + +int br_cfm_mep_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_mep_config *const config, + struct netlink_ext_ack *extack); + +struct br_cfm_maid { + u8 data[CFM_MAID_LENGTH]; +}; + +struct br_cfm_cc_config { + /* Expected received CCM PDU MAID. */ + struct br_cfm_maid exp_maid; + + /* Expected received CCM PDU interval. */ + /* Transmitting CCM PDU interval when CCM tx is enabled. */ + enum br_cfm_ccm_interval exp_interval; + + bool enable; /* Enable/disable CCM PDU handling */ +}; + +int br_cfm_cc_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_cc_config *const config, + struct netlink_ext_ack *extack); + +int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, + u32 peer_mep_id, + struct netlink_ext_ack *extack); +int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, + u32 peer_mep_id, + struct netlink_ext_ack *extack); + +/* Transmitted CCM Remote Defect Indication status set. + * This RDI is inserted in transmitted CCM PDUs if CCM transmission is enabled. + * See br_cfm_cc_ccm_tx() with interval != BR_CFM_CCM_INTERVAL_NONE + */ +int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, + const bool rdi, struct netlink_ext_ack *extack); + +/* OAM PDU Tx information */ +struct br_cfm_cc_ccm_tx_info { + struct mac_addr dmac; + /* The CCM will be transmitted for this period in seconds. + * Call br_cfm_cc_ccm_tx before timeout to keep transmission alive. + * When period is zero any ongoing transmission will be stopped. + */ + u32 period; + + bool seq_no_update; /* Update Tx CCM sequence number */ + bool if_tlv; /* Insert Interface Status TLV */ + u8 if_tlv_value; /* Interface Status TLV value */ + bool port_tlv; /* Insert Port Status TLV */ + u8 port_tlv_value; /* Port Status TLV value */ + /* Sender ID TLV ?? + * Organization-Specific TLV ?? + */ +}; + +int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, + const struct br_cfm_cc_ccm_tx_info *const tx_info, + struct netlink_ext_ack *extack); + +struct br_cfm_mep_status { + /* Indications that an OAM PDU has been seen. */ + bool opcode_unexp_seen; /* RX of OAM PDU with unexpected opcode */ + bool version_unexp_seen; /* RX of OAM PDU with unexpected version */ + bool rx_level_low_seen; /* Rx of OAM PDU with level low */ +}; + +struct br_cfm_cc_peer_status { + /* This CCM related status is based on the latest received CCM PDU. */ + u8 port_tlv_value; /* Port Status TLV value */ + u8 if_tlv_value; /* Interface Status TLV value */ + + /* CCM has not been received for 3.25 intervals */ + u8 ccm_defect:1; + + /* (RDI == 1) for last received CCM PDU */ + u8 rdi:1; + + /* Indications that a CCM PDU has been seen. */ + u8 seen:1; /* CCM PDU received */ + u8 tlv_seen:1; /* CCM PDU with TLV received */ + /* CCM PDU with unexpected sequence number received */ + u8 seq_unexp_seen:1; +}; + +struct br_cfm_mep { + /* list header of MEP instances */ + struct hlist_node head; + u32 instance; + struct br_cfm_mep_create create; + struct br_cfm_mep_config config; + struct br_cfm_cc_config cc_config; + struct br_cfm_cc_ccm_tx_info cc_ccm_tx_info; + /* List of multiple peer MEPs */ + struct hlist_head peer_mep_list; + struct net_bridge_port __rcu *b_port; + unsigned long ccm_tx_end; + struct delayed_work ccm_tx_dwork; + u32 ccm_tx_snumber; + u32 ccm_rx_snumber; + struct br_cfm_mep_status status; + bool rdi; + struct rcu_head rcu; +}; + +struct br_cfm_peer_mep { + struct hlist_node head; + struct br_cfm_mep *mep; + struct delayed_work ccm_rx_dwork; + u32 mepid; + struct br_cfm_cc_peer_status cc_status; + u32 ccm_rx_count_miss; + struct rcu_head rcu; +}; + +#endif /* _BR_PRIVATE_CFM_H_ */ diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index af0e9eff6549..1883118aae55 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -8,7 +8,7 @@ struct br_mrp { /* list of mrp instances */ - struct list_head list; + struct hlist_node list; struct net_bridge_port __rcu *p_port; struct net_bridge_port __rcu *s_port; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3e493eb85bb2..d07008678d32 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -270,7 +270,8 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, goto out_filt; v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { - v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + v->stats = + netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; @@ -421,7 +422,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { - struct br_vlan_stats *stats; + struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; @@ -474,7 +475,7 @@ static bool __allowed_ingress(const struct net_bridge *br, struct sk_buff *skb, u16 *vid, u8 *state) { - struct br_vlan_stats *stats; + struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; @@ -708,7 +709,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, if (!vlan) return -ENOMEM; - vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; @@ -853,15 +854,25 @@ EXPORT_SYMBOL_GPL(br_vlan_get_proto); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { + struct switchdev_attr attr = { + .orig_dev = br->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.vlan_protocol = ntohs(proto), + }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; - __be16 oldproto; + __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; + err = switchdev_port_attr_set(br->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); @@ -872,7 +883,6 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) } } - oldproto = br->vlan_proto; br->vlan_proto = proto; recalculate_group_addr(br); @@ -888,6 +898,9 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) return 0; err_filt: + attr.u.vlan_protocol = ntohs(oldproto); + switchdev_port_attr_set(br->dev, &attr); + list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); @@ -1262,14 +1275,14 @@ void nbp_vlan_flush(struct net_bridge_port *port) } void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats) + struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; - struct br_vlan_stats *cpu_stats; + struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); @@ -1585,7 +1598,7 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) static bool br_vlan_stats_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - struct br_vlan_stats stats; + struct pcpu_sw_netstats stats; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 5040fe43f4b4..e4d287afc2c9 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -17,7 +17,7 @@ config NFT_BRIDGE_META config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" - depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 + depends on NFT_REJECT help Add support to reject packets. diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index deae2c9a0f69..eba0efe64d05 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -39,30 +39,6 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, } } -static int nft_bridge_iphdr_validate(struct sk_buff *skb) -{ - struct iphdr *iph; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return 0; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return 0; - - len = ntohs(iph->tot_len); - if (skb->len < len) - return 0; - else if (len < (iph->ihl*4)) - return 0; - - if (!pskb_may_pull(skb, iph->ihl*4)) - return 0; - - return 1; -} - /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ @@ -72,29 +48,11 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, int hook) { struct sk_buff *nskb; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _oth; - if (!nft_bridge_iphdr_validate(oldskb)) - return; - - oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); - if (!oth) - return; - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv4.sysctl_ip_default_ttl); - nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); @@ -106,139 +64,32 @@ static void nft_reject_br_send_v4_unreach(struct net *net, int hook, u8 code) { struct sk_buff *nskb; - struct iphdr *niph; - struct icmphdr *icmph; - unsigned int len; - __wsum csum; - u8 proto; - - if (!nft_bridge_iphdr_validate(oldskb)) - return; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - /* RFC says return as much as we can without exceeding 576 bytes. */ - len = min_t(unsigned int, 536, oldskb->len); - - if (!pskb_may_pull(oldskb, len)) - return; - - if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) - return; - - proto = ip_hdr(oldskb)->protocol; - - if (!skb_csum_unnecessary(oldskb) && - nf_reject_verify_csum(proto) && - nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) - return; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) + - LL_MAX_HEADER + len, GFP_ATOMIC); + nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - net->ipv4.sysctl_ip_default_ttl); - - skb_reset_transport_header(nskb); - icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); - icmph->type = ICMP_DEST_UNREACH; - icmph->code = code; - - skb_put_data(nskb, skb_network_header(oldskb), len); - - csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0); - icmph->checksum = csum_fold(csum); - - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); } -static int nft_bridge_ip6hdr_validate(struct sk_buff *skb) -{ - struct ipv6hdr *hdr; - u32 pkt_len; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return 0; - - hdr = ipv6_hdr(skb); - if (hdr->version != 6) - return 0; - - pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return 0; - - return 1; -} - static void nft_reject_br_send_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, int hook) { struct sk_buff *nskb; - const struct tcphdr *oth; - struct tcphdr _oth; - unsigned int otcplen; - struct ipv6hdr *nip6h; - if (!nft_bridge_ip6hdr_validate(oldskb)) - return; - - oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook); - if (!oth) - return; - - nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv6.devconf_all->hop_limit); - nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); - nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); } -static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - int thoff; - __be16 fo; - u8 proto = ip6h->nexthdr; - - if (skb_csum_unnecessary(skb)) - return true; - - if (ip6h->payload_len && - pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) - return false; - - ip6h = ipv6_hdr(skb); - thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); - if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) - return false; - - if (!nf_reject_verify_csum(proto)) - return true; - - return nf_ip6_checksum(skb, hook, thoff, proto) == 0; -} static void nft_reject_br_send_v6_unreach(struct net *net, struct sk_buff *oldskb, @@ -246,49 +97,11 @@ static void nft_reject_br_send_v6_unreach(struct net *net, int hook, u8 code) { struct sk_buff *nskb; - struct ipv6hdr *nip6h; - struct icmp6hdr *icmp6h; - unsigned int len; - - if (!nft_bridge_ip6hdr_validate(oldskb)) - return; - /* Include "As much of invoking packet as possible without the ICMPv6 - * packet exceeding the minimum IPv6 MTU" in the ICMP payload. - */ - len = min_t(unsigned int, 1220, oldskb->len); - - if (!pskb_may_pull(oldskb, len)) - return; - - if (!reject6_br_csum_ok(oldskb, hook)) - return; - - nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + - LL_MAX_HEADER + len, GFP_ATOMIC); + nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, - net->ipv6.devconf_all->hop_limit); - - skb_reset_transport_header(nskb); - icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); - icmp6h->icmp6_type = ICMPV6_DEST_UNREACH; - icmp6h->icmp6_code = code; - - skb_put_data(nskb, skb_network_header(oldskb), len); - nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); - - icmp6h->icmp6_cksum = - csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, - nskb->len - sizeof(struct ipv6hdr), - IPPROTO_ICMPV6, - csum_partial(icmp6h, - nskb->len - sizeof(struct ipv6hdr), - 0)); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); @@ -364,69 +177,13 @@ static int nft_reject_bridge_validate(const struct nft_ctx *ctx, (1 << NF_BR_LOCAL_IN)); } -static int nft_reject_bridge_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - - icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - if (priv->type == NFT_REJECT_ICMPX_UNREACH && - icmp_code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; - - priv->icmp_code = icmp_code; - break; - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - return 0; -} - -static int nft_reject_bridge_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - default: - break; - } - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_reject_bridge_type; static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_bridge_eval, - .init = nft_reject_bridge_init, - .dump = nft_reject_bridge_dump, + .init = nft_reject_init, + .dump = nft_reject_dump, .validate = nft_reject_bridge_validate, }; diff --git a/net/can/Kconfig b/net/can/Kconfig index 224e5e0283a9..7c9958df91d3 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -62,8 +62,9 @@ config CAN_ISOTP communication between CAN nodes via two defined CAN Identifiers. As CAN frames can only transport a small amount of data bytes (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this - segmentation is needed to transport longer PDUs as needed e.g. for - vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. + segmentation is needed to transport longer Protocol Data Units (PDU) + as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN + traffic. This protocol driver implements data transfers according to ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. If you want to perform automotive vehicle diagnostic services (UDS), diff --git a/net/can/af_can.c b/net/can/af_can.c index 6373ab9c5507..837bb8af0ec3 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -541,10 +541,13 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. + * As this case may potentially happen when closing a socket while + * the notifier for removing the CAN netdev is running we just print + * a warning here. */ if (!rcv) { - WARN(1, "BUG: receive list entry not found for dev %s, id %03X, mask %03X\n", - DNAME(dev), can_id, mask); + pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); goto out; } @@ -677,16 +680,25 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", + dev->type, skb->len); + goto free_skb; + } + + /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ + if (unlikely(cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); - kfree_skb(skb); - return NET_RX_DROP; + goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; + +free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, @@ -694,16 +706,25 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", + dev->type, skb->len); + goto free_skb; + } + + /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ + if (unlikely(cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); - kfree_skb(skb); - return NET_RX_DROP; + goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; + +free_skb: + kfree_skb(skb); + return NET_RX_DROP; } /* af_can protocol functions */ @@ -870,7 +891,7 @@ static __init int can_init(void) int err; /* check for correct padding to be able to use the structs similarly */ - BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) != + BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); diff --git a/net/can/gw.c b/net/can/gw.c index 6b790b6ff8d2..8598d9da0e5f 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -199,6 +199,68 @@ static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } +/* retrieve valid CC DLC value and store it into 'len' */ +static void mod_retrieve_ccdlc(struct canfd_frame *cf) +{ + struct can_frame *ccf = (struct can_frame *)cf; + + /* len8_dlc is only valid if len == CAN_MAX_DLEN */ + if (ccf->len != CAN_MAX_DLEN) + return; + + /* do we have a valid len8_dlc value from 9 .. 15 ? */ + if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) + ccf->len = ccf->len8_dlc; +} + +/* convert valid CC DLC value in 'len' into struct can_frame elements */ +static void mod_store_ccdlc(struct canfd_frame *cf) +{ + struct can_frame *ccf = (struct can_frame *)cf; + + /* clear potential leftovers */ + ccf->len8_dlc = 0; + + /* plain data length 0 .. 8 - that was easy */ + if (ccf->len <= CAN_MAX_DLEN) + return; + + /* potentially broken values are catched in can_can_gw_rcv() */ + if (ccf->len > CAN_MAX_RAW_DLC) + return; + + /* we have a valid dlc value from 9 .. 15 in ccf->len */ + ccf->len8_dlc = ccf->len; + ccf->len = CAN_MAX_DLEN; +} + +static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_and_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_or_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_xor_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_set_len(cf, mod); + mod_store_ccdlc(cf); +} + static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized @@ -207,7 +269,7 @@ static void canframecpy(struct canfd_frame *dst, struct can_frame *src) */ dst->can_id = src->can_id; - dst->len = src->can_dlc; + dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } @@ -842,8 +904,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_and_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; @@ -858,8 +920,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_or_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; @@ -874,8 +936,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_xor_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; @@ -890,8 +952,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_set_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; diff --git a/net/can/isotp.c b/net/can/isotp.c index 4c2062875893..d78ab13bd8be 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -252,14 +252,16 @@ static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) static u8 padlen(u8 datalen) { - const u8 plen[] = {8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ - 12, 12, 12, 12, /* 9 - 12 */ - 16, 16, 16, 16, /* 13 - 16 */ - 20, 20, 20, 20, /* 17 - 20 */ - 24, 24, 24, 24, /* 21 - 24 */ - 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ - 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ - 48, 48, 48, 48, 48, 48, 48, 48}; /* 41 - 48 */ + static const u8 plen[] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ + 12, 12, 12, 12, /* 9 - 12 */ + 16, 16, 16, 16, /* 13 - 16 */ + 20, 20, 20, 20, /* 17 - 20 */ + 24, 24, 24, 24, /* 21 - 24 */ + 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ + 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ + 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ + }; if (datalen > 48) return 64; @@ -569,10 +571,6 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, return 0; } - /* no creation of flow control frames */ - if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) - return 0; - /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ @@ -581,6 +579,10 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, return 0; } + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 137054bff9ec..bb914d8b4216 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -62,7 +62,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ - skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8)); + skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); @@ -335,7 +335,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) canid |= skcb->addr.da << 8; cf->can_id = canid; - cf->can_dlc = dlc; + cf->len = dlc; return can_send(skb, 1); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 1be4c898b2fa..f23966526a88 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -475,6 +475,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out_release_sock; } + if (!(ndev->flags & IFF_UP)) { + dev_put(ndev); + ret = -ENETDOWN; + goto out_release_sock; + } + priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { diff --git a/net/can/proc.c b/net/can/proc.c index 550928b8b8a2..5ea8695f507e 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -462,6 +462,9 @@ void can_init_proc(struct net *net) */ void can_remove_proc(struct net *net) { + if (!net->can.proc_dir) + return; + if (net->can.pde_stats) remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir); @@ -486,6 +489,5 @@ void can_remove_proc(struct net *net) if (net->can.pde_rcvlist_sff) remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir); - if (net->can.proc_dir) - remove_proc_entry("can", net->proc_net); + remove_proc_entry("can", net->proc_net); } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..a32037daa933 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> @@ -15,20 +16,8 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - static struct bpf_local_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; @@ -41,11 +30,11 @@ sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; - sdata = sk_storage_lookup(sk, map, false); + sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; @@ -94,7 +83,7 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void sk_storage_map_free(struct bpf_map *map) +static void bpf_sk_storage_map_free(struct bpf_map *map) { struct bpf_local_storage_map *smap; @@ -103,7 +92,7 @@ static void sk_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap); } -static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; @@ -130,7 +119,7 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); + sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? sdata->data : NULL; } @@ -166,7 +155,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - err = sk_storage_delete(sock->sk, map); + err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } @@ -272,7 +261,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; - sdata = sk_storage_lookup(sk, map, true); + sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; @@ -305,7 +294,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; - err = sk_storage_delete(sk, map); + err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } @@ -313,14 +302,23 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } -static int sk_storage_charge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { - return omem_charge(owner, size); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; } -static void sk_storage_uncharge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { struct sock *sk = owner; @@ -328,7 +326,7 @@ static void sk_storage_uncharge(struct bpf_local_storage_map *smap, } static struct bpf_local_storage __rcu ** -sk_storage_ptr(void *owner) +bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; @@ -339,8 +337,8 @@ static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = sk_storage_map_alloc, - .map_free = sk_storage_map_free, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -348,9 +346,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, - .map_local_storage_charge = sk_storage_charge, - .map_local_storage_uncharge = sk_storage_uncharge, - .map_owner_storage_ptr = sk_storage_ptr, + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -381,6 +379,79 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (in_irq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (in_irq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; diff --git a/net/core/datagram.c b/net/core/datagram.c index 9fcaa544f11a..81809fa735a7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -709,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) EXPORT_SYMBOL(zerocopy_sg_from_iter); /** - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..ce8fea2e2788 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1069,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -3206,7 +3193,7 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } @@ -3495,6 +3482,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3867,6 +3859,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + qdisc_skb_cb(skb)->mru = 0; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4180,7 +4173,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4210,17 +4203,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines @@ -4954,6 +4943,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -6458,7 +6448,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6495,10 +6486,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,29 +6526,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,17 +6580,23 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6588,7 +6609,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6620,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6671,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6709,6 +6732,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6781,6 +6805,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. @@ -6919,7 +6956,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -8157,7 +8194,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * @@ -8902,7 +8939,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, @@ -9763,7 +9800,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } @@ -10366,6 +10403,21 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 205e92e604ef..db8a0ff86f36 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -230,7 +230,7 @@ static int dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; - int err = -EOPNOTSUPP; + int err; err = dsa_ndo_do_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) diff --git a/net/core/devlink.c b/net/core/devlink.c index a578634052a3..88c0ac8ed444 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -517,7 +517,7 @@ devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_l return test_bit(limit, &devlink->ops->reload_limits); } -static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_action action, +static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_limit limit, u32 value) { struct nlattr *reload_stats_entry; @@ -526,8 +526,7 @@ static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_acti if (!reload_stats_entry) return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, action) || - nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) goto nla_put_failure; nla_nest_end(msg, reload_stats_entry); @@ -540,7 +539,7 @@ nla_put_failure: static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { - struct nlattr *reload_stats_attr; + struct nlattr *reload_stats_attr, *act_info, *act_stats; int i, j, stat_idx; u32 value; @@ -552,17 +551,29 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink if (!reload_stats_attr) return -EMSGSIZE; - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. Stats - * of actions with unspecified limit are shown though drivers - * don't need to register unspecified limit. - */ - if (!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && + !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) continue; - for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC || + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. + */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || devlink_reload_combination_is_invalid(i, j)) continue; @@ -571,13 +582,19 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink value = devlink->stats.reload_stats[stat_idx]; else value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, i, j, value)) - goto nla_put_failure; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); } nla_nest_end(msg, reload_stats_attr); return 0; +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); nla_put_failure: nla_nest_cancel(msg, reload_stats_attr); return -EMSGSIZE; @@ -755,6 +772,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; + /* Hold rtnl lock while accessing port's netdev attributes. */ + rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -763,9 +782,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net *net = devlink_net(devlink_port->devlink); struct net_device *netdev = devlink_port->type_dev; - if (netdev && + if (netdev && net_eq(net, dev_net(netdev)) && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, @@ -781,6 +801,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -791,6 +812,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -1448,7 +1470,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, pool_index, &cur, &max); if (err && err != -EOPNOTSUPP) - return err; + goto sb_occ_get_failure; if (!err) { if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) goto nla_put_failure; @@ -1461,8 +1483,10 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, return 0; nla_put_failure: + err = -EMSGSIZE; +sb_occ_get_failure: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, @@ -3370,7 +3394,7 @@ out_free_msg: nlmsg_free(msg); } -void devlink_flash_update_begin_notify(struct devlink *devlink) +static void devlink_flash_update_begin_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3378,9 +3402,8 @@ void devlink_flash_update_begin_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); -void devlink_flash_update_end_notify(struct devlink *devlink) +static void devlink_flash_update_end_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3388,7 +3411,6 @@ void devlink_flash_update_end_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE_END, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, @@ -3429,10 +3451,12 @@ EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *nla_component, *nla_overwrite_mask; + struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; + const char *file_name; u32 supported_params; + int ret; if (!devlink->ops->flash_update) return -EOPNOTSUPP; @@ -3442,8 +3466,6 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, supported_params = devlink->ops->supported_flash_update_params; - params.file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); - nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; if (nla_component) { if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { @@ -3467,7 +3489,21 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, params.overwrite_mask = sections.value & sections.selector; } - return devlink->ops->flash_update(devlink, ¶ms, info->extack); + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, ¶ms, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; } static const struct devlink_param devlink_param_generic[] = { @@ -4213,10 +4249,12 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - if (region->port) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto nla_put_failure; + } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) @@ -4265,10 +4303,12 @@ devlink_nl_region_notify_build(struct devlink_region *region, if (err) goto out_cancel_msg; - if (region->port) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto out_cancel_msg; + } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); @@ -4915,8 +4955,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; + if (!port) { + err = -ENODEV; + goto out_unlock; + } } region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); @@ -4962,10 +5004,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, if (err) goto nla_put_failure; - if (region->port) - if (nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto nla_put_failure; + } err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) @@ -8246,8 +8290,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (WARN_ON(devlink_port->registered)) - return -EEXIST; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { @@ -8271,6 +8313,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; + if (WARN_ON(devlink_port->registered)) + return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -8293,6 +8337,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -8318,6 +8364,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) @@ -9464,6 +9512,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), + DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -10213,12 +10262,18 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name) goto out; } - params.file_name = file_name; + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) + goto out; mutex_lock(&devlink->lock); + devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, ¶ms, NULL); + devlink_flash_update_end_notify(devlink); mutex_unlock(&devlink->lock); + release_firmware(params.fw); + out: rtnl_lock(); dev_put(dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/filter.c b/net/core/filter.c index 2ca5eecebacf..77001a35768f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->notsent_lowat = val; sk->sk_write_space(sk); break; + case TCP_WINDOW_CLAMP: + ret = tcp_set_window_clamp(sk, val); + break; default: ret = -EINVAL; } @@ -6995,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_setsockopt_proto; @@ -7003,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_getsockopt_proto; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e21950a2c897..6f1adba6695f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -48,7 +48,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { - /* User should make sure that every key target offset is withing + /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index e095fb871d91..6eb2e5ec2c50 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -99,9 +99,14 @@ void gro_cells_destroy(struct gro_cells *gcells) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); - netif_napi_del(&cell->napi); + __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } + /* This barrier is needed because netpoll could access dev->napi_list + * under rcu protection. + */ + synchronize_net(); + free_percpu(gcells->cells); gcells->cells = NULL; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e39e28b0a8d..9500d28a43b0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -235,6 +235,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c310c7c1cef7..960948290001 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -29,6 +29,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> +#include <net/dsa.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> @@ -657,15 +658,15 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { - struct net_device *ndev = NULL; + struct net_device *ndev = NULL, *dev = NULL; + struct net *net = current->nsproxy->net_ns; struct in_device *in_dev; int err; rtnl_lock(); - if (np->dev_name[0]) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; @@ -673,6 +674,19 @@ int netpoll_setup(struct netpoll *np) } dev_hold(ndev); + /* bring up DSA management network devices up first */ + for_each_netdev(net, dev) { + if (!netdev_uses_dsa(dev)) + continue; + + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); + if (err < 0) { + np_err(np, "%s failed to open %s\n", + np->dev_name, dev->name); + goto put; + } + } + if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..f3c690b8c8e3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h> #include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -362,8 +364,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -379,15 +382,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -405,9 +405,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d7223691783..60917ff4a00b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1939,7 +1939,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2953,9 +2953,9 @@ static struct net_device *rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2983,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3264,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3296,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1ba8f0163744..bfa5c9969393 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -249,6 +249,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, fclones->skb2.fclone = SKB_FCLONE_CLONE; } + + skb_set_kcov_handle(skb, kcov_common_handle()); + out: return skb; nodata: @@ -282,6 +285,8 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb, memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); + skb_set_kcov_handle(skb, kcov_common_handle()); + return skb; } @@ -837,7 +842,7 @@ EXPORT_SYMBOL(consume_skb); #endif /** - * consume_stateless_skb - free an skbuff, assuming it is stateless + * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last @@ -897,6 +902,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } + lockdep_assert_in_softirq(); + if (!skb_unref(skb)) return; @@ -4549,7 +4556,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); if (icmp_next) - sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); @@ -5430,7 +5437,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) goto err_free; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; @@ -5786,6 +5794,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb) if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 654182ecf87b..25cdbb20f3a0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; - if (charge) - sk_mem_uncharge(sk, len); - if (!msg->skb) + /* When the skb owns the memory we free it from consume_skb path. */ + if (!msg->skb) { + if (charge) + sk_mem_uncharge(sk, len); put_page(sg_page(sge)); + } memset(sge, 0, sizeof(*sge)); return len; } @@ -397,28 +399,45 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) { - struct sock *sk = psock->sk; - int copied = 0, num_sge; struct sk_msg *msg; + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return NULL; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return NULL; + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); if (unlikely(!msg)) - return -EAGAIN; - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(msg); - return -EAGAIN; - } + return NULL; sk_msg_init(msg); + return msg; +} + +static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + struct sk_psock *psock, + struct sock *sk, + struct sk_msg *msg) +{ + int num_sge, copied; + + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + */ + if (skb_linearize(skb)) + return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); if (unlikely(num_sge < 0)) { kfree(msg); return num_sge; } - sk_mem_charge(sk, skb->len); copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; @@ -430,6 +449,48 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) return copied; } +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + struct sk_msg *msg; + + /* If we are receiving on the same sock skb->sk is already assigned, + * skip memory accounting and owner transition seeing it already set + * correctly. + */ + if (unlikely(skb->sk == sk)) + return sk_psock_skb_ingress_self(psock, skb); + msg = sk_psock_create_ingress_msg(sk, skb); + if (!msg) + return -EAGAIN; + + /* This will transition ownership of the data from the socket where + * the BPF program was run initiating the redirect to the socket + * we will eventually receive this data on. The data will be released + * from skb_consume found in __tcp_bpf_recvmsg() after its been copied + * into user buffers. + */ + skb_set_owner_r(skb, sk); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + +/* Puts an skb on the ingress queue of the socket already assigned to the + * skb. In this case we do not need to check memory limits or skb_set_owner_r + * because the skb is already accounted for here. + */ +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sock *sk = psock->sk; + + if (unlikely(!msg)) + return -EAGAIN; + sk_msg_init(msg); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { @@ -789,7 +850,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, * retrying later from workqueue. */ if (skb_queue_empty(&psock->ingress_skb)) { - err = sk_psock_skb_ingress(psock, skb); + err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { skb_queue_tail(&psock->ingress_skb, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..4fd7e785f177 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,22 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: @@ -2486,7 +2505,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -3078,7 +3097,7 @@ EXPORT_SYMBOL(release_sock); * * sk_lock.slock unlocked, owned = 1, BH enabled */ -bool lock_sock_fast(struct sock *sk) +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3096,6 +3115,7 @@ bool lock_sock_fast(struct sock *sk) * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); local_bh_enable(); return true; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ddc899e83313..64b5ec14ff50 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -27,8 +27,6 @@ struct bpf_stab { static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - /* Make sure page count doesn't overflow. */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&stab->map.memory, cost); - if (err) - goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; - bpf_map_charge_finish(&stab->map.memory); -free_stab: - kfree(stab); - return ERR_PTR(err); + if (!stab->sks) { + kfree(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) @@ -975,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, } } - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); @@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; - u64 cost; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -1131,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { - bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..17ffd33c6b18 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; @@ -380,6 +381,60 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. + */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { __xdp_return(xdp->data, &xdp->rxq->mem, true); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 16014ad19406..084e159a12ba 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1827,6 +1827,8 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex) /** * dcb_getapp - retrieve the DCBX application user priority + * @dev: network interface + * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to @@ -1849,6 +1851,8 @@ EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list + * @dev: network interface + * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is @@ -1890,6 +1894,8 @@ EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority + * @dev: network interface + * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was @@ -1912,6 +1918,8 @@ EXPORT_SYMBOL(dcb_ieee_getapp_mask); /** * dcb_ieee_setapp - add IEEE dcb application data to app list + * @dev: network interface + * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long @@ -1946,6 +1954,8 @@ EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list + * @dev: network interface + * @del: application data to delete * * This removes a matching APP data from the APP list */ @@ -1975,7 +1985,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); -/** +/* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for @@ -2004,7 +2014,7 @@ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); -/** +/* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a @@ -2031,7 +2041,7 @@ dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); -/** +/* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 8f3dd3b1d2d0..c4bbac99740d 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -242,6 +242,8 @@ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, /** * dccp_ackvec_input - Register incoming packet in the buffer + * @av: Ack Vector to register packet to + * @skb: Packet to register */ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) { @@ -273,6 +275,9 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) /** * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * @av: Ack Vector record to clean + * @ackno: last Ack Vector which has been acknowledged + * * This routine is called when the peer acknowledges the receipt of Ack Vectors * up to and including @ackno. While based on section A.3 of RFC 4340, here * are additional precautions to prevent corrupted buffer state. In particular, diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 1e9bb121ba72..6beac5d348e2 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -76,7 +76,7 @@ int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, return err; } -static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) +static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) { struct kmem_cache *slab; va_list args; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 3da1f77bd039..4d9823d6dced 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -181,6 +181,9 @@ MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); /** * ccid2_update_used_window - Track how much of cwnd is actually used + * @hc: socket to update window + * @new_wnd: new window values to add into the filter + * * This is done in addition to CWV. The sender needs to have an idea of how many * packets may be in flight, to set the local Sequence Window value accordingly * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the @@ -349,6 +352,8 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * @sk: socket to perform estimator on + * * This code is almost identical with TCP's tcp_rtt_estimator(), since * - it has a higher sampling frequency (recommended by RFC 1323), * - the RTO does not collapse into RTT due to RTTVAR going towards zero, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b9ee1a4a8955..ca8670f78ac6 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -79,6 +79,8 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) /** * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * @hc: socket to have the send interval updated + * * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) @@ -99,6 +101,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) /** * ccid3_hc_tx_update_x - Update allowed sending rate X + * @sk: socket to be updated * @stamp: most recent time if available - can be left NULL. * * This function tracks draft rfc3448bis, check there for latest details. @@ -151,6 +154,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) /** * ccid3_hc_tx_update_s - Track the mean packet size `s' + * @hc: socket to be updated * @len: DCCP packet payload size in bytes * * cf. RFC 4342, 5.3 and RFC 3448, 4.1 @@ -259,6 +263,7 @@ out: /** * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @sk: socket to send packet from * @skb: next packet candidate to send on @sk * * This function uses the convention of ccid_packet_dequeue_eval() and @@ -655,6 +660,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) /** * ccid3_first_li - Implements [RFC 5348, 6.3.1] + * @sk: socket to calculate loss interval for * * Determine the length of the first loss interval via inverse lookup. * Assume that X_recv can be computed by the throughput equation diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 67abad695e66..da95319842bb 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -79,6 +79,9 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 + * @lh: histogram to update + * @skb: received socket triggering loss interval update + * * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index af08e2df7108..0cdda3c66fb5 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -385,6 +385,9 @@ static inline struct tfrc_rx_hist_entry * /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal + * @h: receive histogram + * @skb: packet containing timestamp. + * * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able * to compute a sample with given data - calling function should check this. */ diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 788dd629c420..305f56804832 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -996,6 +996,8 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp) /** * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features + * @dreq: server socket to resolve + * * It is the server which resolves the dependencies once the CCID has been * fully negotiated. If no CCID has been negotiated, it uses the default CCID. */ @@ -1033,6 +1035,10 @@ static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) /** * dccp_feat_prefer - Move preferred entry to the start of array + * @preferred_value: entry to move to start of array + * @array: array of preferred entries + * @array_len: size of the array + * * Reorder the @array_len elements in @array so that @preferred_value comes * first. Returns >0 to indicate that @preferred_value does occur in @array. */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bb3d70664dde..b0b6e6a4784e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -427,7 +427,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); if (*own_req) ireq->ireq_opt = NULL; else diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ef4ab28cfde0..78ee1b5acf1f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -533,7 +533,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); diff --git a/net/dccp/output.c b/net/dccp/output.c index 50e6d5699bb2..b8a24734385e 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -143,6 +143,8 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) /** * dccp_determine_ccmps - Find out about CCID-specific packet-size limits + * @dp: socket to find packet size limits of + * * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), * since the RX CCID is restricted to feedback packets (Acks), which are small * in comparison with the data traffic. A value of 0 means "no current CCMPS". @@ -236,6 +238,8 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) /** * dccp_xmit_packet - Send data packet under control of CCID + * @sk: socket to send data packet on + * * Transmits next-queued payload and informs CCID to account for the packet. */ static void dccp_xmit_packet(struct sock *sk) @@ -296,6 +300,9 @@ static void dccp_xmit_packet(struct sock *sk) /** * dccp_flush_write_queue - Drain queue at end of connection + * @sk: socket to be drained + * @time_budget: time allowed to drain the queue + * * Since dccp_sendmsg queues packets without waiting for them to be sent, it may * happen that the TX queue is not empty at the end of a connection. We give the * HC-sender CCID a grace period of up to @time_budget jiffies. If this function @@ -367,6 +374,8 @@ void dccp_write_xmit(struct sock *sk) /** * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets + * @sk: socket to perform retransmit on + * * There are only four retransmittable packet types in DCCP: * - Request in client-REQUEST state (sec. 8.1.1), * - CloseReq in server-CLOSEREQ state (sec. 8.3), diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c index db2448c33a62..5ba204ec0aca 100644 --- a/net/dccp/qpolicy.c +++ b/net/dccp/qpolicy.c @@ -65,14 +65,16 @@ static bool qpolicy_prio_full(struct sock *sk) * @push: add a new @skb to the write queue * @full: indicates that no more packets will be admitted * @top: peeks at whatever the queueing policy defines as its `top' + * @params: parameter passed to policy operation */ -static struct dccp_qpolicy_operations { +struct dccp_qpolicy_operations { void (*push) (struct sock *sk, struct sk_buff *skb); bool (*full) (struct sock *sk); struct sk_buff* (*top) (struct sock *sk); __be32 params; +}; -} qpol_table[DCCPQ_POLICY_MAX] = { +static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { [DCCPQ_POLICY_SIMPLE] = { .push = qpolicy_simple_push, .full = qpolicy_simple_full, diff --git a/net/dccp/timer.c b/net/dccp/timer.c index a934d2932373..db768f223ef7 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -215,13 +215,14 @@ out: /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @data: Socket to act on + * @t: pointer to the tasklet associated with this handler * * See the comments above %ccid_dequeueing_decision for supported modes. */ -static void dccp_write_xmitlet(unsigned long data) +static void dccp_write_xmitlet(struct tasklet_struct *t) { - struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); + struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -235,16 +236,15 @@ static void dccp_write_xmitlet(unsigned long data) static void dccp_write_xmit_timer(struct timer_list *t) { struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - dccp_write_xmitlet((unsigned long)sk); + dccp_write_xmitlet(&dp->dccps_xmitlet); } void dccp_init_xmit_timers(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 15d42353f1a3..d1c50a48614b 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -658,7 +658,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_dev = dn_db; if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 1f9b9b11008c..dfecd7b22fd7 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -56,20 +56,31 @@ config NET_DSA_TAG_BRCM_PREPEND Broadcom switches which places the tag before the Ethernet header (prepended). +config NET_DSA_TAG_HELLCREEK + tristate "Tag driver for Hirschmann Hellcreek TSN switches" + help + Say Y or M if you want to enable support for tagging frames + for the Hirschmann Hellcreek TSN switches. + config NET_DSA_TAG_GSWIP tristate "Tag driver for Lantiq / Intel GSWIP switches" help Say Y or M if you want to enable support for tagging frames for the Lantiq / Intel GSWIP switches. +config NET_DSA_TAG_DSA_COMMON + tristate + config NET_DSA_TAG_DSA tristate "Tag driver for Marvell switches using DSA headers" + select NET_DSA_TAG_DSA_COMMON help Say Y or M if you want to enable support for tagging frames for the Marvell switches which use DSA headers. config NET_DSA_TAG_EDSA tristate "Tag driver for Marvell switches using EtherType DSA headers" + select NET_DSA_TAG_DSA_COMMON help Say Y or M if you want to enable support for tagging frames for the Marvell switches which use EtherType DSA headers. diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 4f47b2025ff5..0fb2b75a7ae3 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,9 +7,9 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o -obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o -obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o +obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 2131bf2b3a67..a1b1dc8a4d87 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -201,7 +201,6 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; - struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; if (unlikely(!cpu_dp)) { @@ -234,11 +233,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, skb = nskb; } - s = this_cpu_ptr(p->stats64); - u64_stats_update_begin(&s->syncp); - s->rx_packets++; - s->rx_bytes += skb->len; - u64_stats_update_end(&s->syncp); + dev_sw_netstats_rx_add(skb->dev, skb->len); if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12998bf04e55..7c96aae9062c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -78,8 +78,6 @@ struct dsa_slave_priv { struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); - struct pcpu_sw_netstats __percpu *stats64; - struct gro_cells gcells; /* DSA port data, such as switch, port index, etc. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3bc5ca40c9fb..7efc753e4d9d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -522,10 +522,10 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; - DSA_SKB_CB(skb)->clone = clone; - - if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) { + DSA_SKB_CB(skb)->clone = clone; return; + } kfree_skb(clone); } @@ -548,17 +548,36 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); +static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the master interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct pcpu_sw_netstats *s; struct sk_buff *nskb; - s = this_cpu_ptr(p->stats64); - u64_stats_update_begin(&s->syncp); - s->tx_packets++; - s->tx_bytes += skb->len; - u64_stats_update_end(&s->syncp); + dev_sw_netstats_tx_add(dev, 1, skb->len); DSA_SKB_CB(skb)->clone = NULL; @@ -567,6 +586,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ dsa_skb_tx_timestamp(p, skb); + if (dsa_realloc_skb(skb, dev)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + /* needed_tailroom should still be 'warm' in the cache line from + * dsa_realloc_skb(), which has also ensured that padding is safe. + */ + if (dev->needed_tailroom) + eth_skb_pad(skb); + /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ @@ -679,7 +709,6 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; @@ -688,7 +717,7 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; - s = per_cpu_ptr(p->stats64, i); + s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin_irq(&s->syncp); tx_packets = s->tx_packets; @@ -1217,15 +1246,6 @@ static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } -static void dsa_slave_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - netdev_stats_to_stats64(stats, &dev->stats); - dev_fetch_sw_netstats(stats, p->stats64); -} - static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { @@ -1601,7 +1621,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { #endif .ndo_get_phys_port_name = dsa_slave_get_phys_port_name, .ndo_setup_tc = dsa_slave_setup_tc, - .ndo_get_stats64 = dsa_slave_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_port_parent_id = dsa_slave_get_port_parent_id, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, @@ -1791,6 +1811,16 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->netdev_ops = &dsa_slave_netdev_ops; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); + if (cpu_dp->tag_ops->tail_tag) + slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead; + else + slave_dev->needed_headroom = cpu_dp->tag_ops->overhead; + /* Try to save one extra realloc later in the TX path (in the master) + * by also inheriting the master's needed headroom and tailroom. + * The 8021q driver also does this. + */ + slave_dev->needed_headroom += master->needed_headroom; + slave_dev->needed_tailroom += master->needed_tailroom; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, @@ -1801,8 +1831,8 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); - p->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!p->stats64) { + slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!slave_dev->tstats) { free_netdev(slave_dev); return -ENOMEM; } @@ -1864,7 +1894,7 @@ out_phy: out_gcells: gro_cells_destroy(&p->gcells); out_free: - free_percpu(p->stats64); + free_percpu(slave_dev->tstats); free_netdev(slave_dev); port->slave = NULL; return ret; @@ -1886,7 +1916,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); phylink_destroy(dp->pl); gro_cells_destroy(&p->gcells); - free_percpu(p->stats64); + free_percpu(slave_dev->tstats); free_netdev(slave_dev); } @@ -1987,10 +2017,22 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; + struct dsa_switch *ds; + struct dsa_port *dp; + int err; if (!dsa_slave_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, ptr); + dp = dsa_slave_to_port(dev); + ds = dp->ds; + + if (ds->ops->port_prechangeupper) { + err = ds->ops->port_prechangeupper(ds, dp->index, info); + if (err) + return notifier_from_errno(err); + } + if (is_vlan_dev(info->upper_dev)) return dsa_slave_check_8021q_upper(dev, ptr); break; diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 55b00694cdba..002cf7f952e2 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -31,9 +31,6 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, __le16 *phdr; u16 hdr; - if (skb_cow_head(skb, AR9331_HDR_LEN) < 0) - return NULL; - phdr = skb_push(skb, AR9331_HDR_LEN); hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index ad72dff8d524..e934dace3922 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -66,9 +66,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, u16 queue = skb_get_queue_mapping(skb); u8 *brcm_tag; - if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) - return NULL; - /* The Ethernet switch we are interfaced with needs packets to be at * least 64 bytes (including FCS) otherwise they will be discarded when * they enter the switch port logic. When Broadcom tags are enabled, we diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 0b756fae68a5..112c7c6dd568 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -1,7 +1,48 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging + * Regular and Ethertype DSA tagging * Copyright (c) 2008-2009 Marvell Semiconductor + * + * Regular DSA + * ----------- + + * For untagged (in 802.1Q terms) packets, the switch will splice in + * the tag between the SA and the ethertype of the original + * packet. Tagged frames will instead have their outermost .1Q tag + * converted to a DSA tag. It expects the same layout when receiving + * packets from the CPU. + * + * Example: + * + * .----.----.----.--------- + * Pu: | DA | SA | ET | Payload ... + * '----'----'----'--------- + * 6 6 2 N + * .----.----.--------.-----.----.--------- + * Pt: | DA | SA | 0x8100 | TCI | ET | Payload ... + * '----'----'--------'-----'----'--------- + * 6 6 2 2 2 N + * .----.----.-----.----.--------- + * Pd: | DA | SA | DSA | ET | Payload ... + * '----'----'-----'----'--------- + * 6 6 4 2 N + * + * No matter if a packet is received untagged (Pu) or tagged (Pt), + * they will both have the same layout (Pd) when they are sent to the + * CPU. This is done by ignoring 802.3, replacing the ethertype field + * with more metadata, among which is a bit to signal if the original + * packet was tagged or not. + * + * Ethertype DSA + * ------------- + * Uses the exact same tag format as regular DSA, but also includes a + * proper ethertype field (which the mv88e6xxx driver sets to + * ETH_P_EDSA/0xdada) followed by two zero bytes: + * + * .----.----.--------.--------.-----.----.--------- + * | DA | SA | 0xdada | 0x0000 | DSA | ET | Payload ... + * '----'----'--------'--------'-----'----'--------- + * 6 6 2 2 4 2 N */ #include <linux/etherdevice.h> @@ -12,46 +53,104 @@ #define DSA_HLEN 4 -static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) +/** + * enum dsa_cmd - DSA Command + * @DSA_CMD_TO_CPU: Set on packets that were trapped or mirrored to + * the CPU port. This is needed to implement control protocols, + * e.g. STP and LLDP, that must not allow those control packets to + * be switched according to the normal rules. + * @DSA_CMD_FROM_CPU: Used by the CPU to send a packet to a specific + * port, ignoring all the barriers that the switch normally + * enforces (VLANs, STP port states etc.). No source address + * learning takes place. "sudo send packet" + * @DSA_CMD_TO_SNIFFER: Set on the copies of packets that matched some + * user configured ingress or egress monitor criteria. These are + * forwarded by the switch tree to the user configured ingress or + * egress monitor port, which can be set to the CPU port or a + * regular port. If the destination is a regular port, the tag + * will be removed before egressing the port. If the destination + * is the CPU port, the tag will not be removed. + * @DSA_CMD_FORWARD: This tag is used on all bulk traffic passing + * through the switch tree, including the flows that are directed + * towards the CPU. Its device/port tuple encodes the original + * source port on which the packet ingressed. It can also be used + * on transmit by the CPU to defer the forwarding decision to the + * hardware, based on the current config of PVT/VTU/ATU + * etc. Source address learning takes places if enabled on the + * receiving DSA/CPU port. + */ +enum dsa_cmd { + DSA_CMD_TO_CPU = 0, + DSA_CMD_FROM_CPU = 1, + DSA_CMD_TO_SNIFFER = 2, + DSA_CMD_FORWARD = 3 +}; + +/** + * enum dsa_code - TO_CPU Code + * + * @DSA_CODE_MGMT_TRAP: DA was classified as a management + * address. Typical examples include STP BPDUs and LLDP. + * @DSA_CODE_FRAME2REG: Response to a "remote management" request. + * @DSA_CODE_IGMP_MLD_TRAP: IGMP/MLD signaling. + * @DSA_CODE_POLICY_TRAP: Frame matched some policy configuration on + * the device. Typical examples are matching on DA/SA/VID and DHCP + * snooping. + * @DSA_CODE_ARP_MIRROR: The name says it all really. + * @DSA_CODE_POLICY_MIRROR: Same as @DSA_CODE_POLICY_TRAP, but the + * particular policy was set to trigger a mirror instead of a + * trap. + * @DSA_CODE_RESERVED_6: Unused on all devices up to at least 6393X. + * @DSA_CODE_RESERVED_7: Unused on all devices up to at least 6393X. + * + * A 3-bit code is used to relay why a particular frame was sent to + * the CPU. We only use this to determine if the packet was mirrored + * or trapped, i.e. whether the packet has been forwarded by hardware + * or not. + * + * This is the superset of all possible codes. Any particular device + * may only implement a subset. + */ +enum dsa_code { + DSA_CODE_MGMT_TRAP = 0, + DSA_CODE_FRAME2REG = 1, + DSA_CODE_IGMP_MLD_TRAP = 2, + DSA_CODE_POLICY_TRAP = 3, + DSA_CODE_ARP_MIRROR = 4, + DSA_CODE_POLICY_MIRROR = 5, + DSA_CODE_RESERVED_6 = 6, + DSA_CODE_RESERVED_7 = 7 +}; + +static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, + u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); u8 *dsa_header; - /* - * Convert the outermost 802.1q tag to a DSA tag for tagged - * packets, or insert a DSA tag between the addresses and - * the ethertype field for untagged packets. - */ if (skb->protocol == htons(ETH_P_8021Q)) { - if (skb_cow_head(skb, 0) < 0) - return NULL; + if (extra) { + skb_push(skb, extra); + memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); + } - /* - * Construct tagged FROM_CPU DSA tag from 802.1q tag. - */ - dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60 | dp->ds->index; + /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */ + dsa_header = skb->data + 2 * ETH_ALEN + extra; + dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; dsa_header[1] = dp->index << 3; - /* - * Move CFI field from byte 2 to byte 1. - */ + /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { dsa_header[1] |= 0x01; dsa_header[2] &= ~0x10; } } else { - if (skb_cow_head(skb, DSA_HLEN) < 0) - return NULL; - skb_push(skb, DSA_HLEN); + skb_push(skb, DSA_HLEN + extra); + memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); - - /* - * Construct untagged FROM_CPU DSA tag. - */ - dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40 | dp->ds->index; + /* Construct untagged FROM_CPU DSA tag. */ + dsa_header = skb->data + 2 * ETH_ALEN + extra; + dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; dsa_header[1] = dp->index << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; @@ -60,30 +159,60 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, + u8 extra) { + int source_device, source_port; + enum dsa_code code; + enum dsa_cmd cmd; u8 *dsa_header; - int source_device; - int source_port; - if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) - return NULL; - - /* - * The ethertype field is part of the DSA header. - */ + /* The ethertype field is part of the DSA header. */ dsa_header = skb->data - 2; - /* - * Check that frame type is either TO_CPU or FORWARD. - */ - if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) + cmd = dsa_header[0] >> 6; + switch (cmd) { + case DSA_CMD_FORWARD: + skb->offload_fwd_mark = 1; + break; + + case DSA_CMD_TO_CPU: + code = (dsa_header[1] & 0x6) | ((dsa_header[2] >> 4) & 1); + + switch (code) { + case DSA_CODE_FRAME2REG: + /* Remote management is not implemented yet, + * drop. + */ + return NULL; + case DSA_CODE_ARP_MIRROR: + case DSA_CODE_POLICY_MIRROR: + /* Mark mirrored packets to notify any upper + * device (like a bridge) that forwarding has + * already been done by hardware. + */ + skb->offload_fwd_mark = 1; + break; + case DSA_CODE_MGMT_TRAP: + case DSA_CODE_IGMP_MLD_TRAP: + case DSA_CODE_POLICY_TRAP: + /* Traps have, by definition, not been + * forwarded by hardware, so don't mark them. + */ + break; + default: + /* Reserved code, this could be anything. Drop + * seems like the safest option. + */ + return NULL; + } + + break; + + default: return NULL; + } - /* - * Determine source device and port. - */ source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; @@ -91,16 +220,15 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - /* - * Convert the DSA header to an 802.1q header if the 'tagged' - * bit in the DSA header is set. If the 'tagged' bit is clear, - * delete the DSA header entirely. + /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q + * tag, and delete the ethertype (extra) if applicable. If the + * 'tagged' bit is cleared; delete the DSA tag, and ethertype + * if applicable. */ if (dsa_header[0] & 0x20) { u8 new_header[4]; - /* - * Insert 802.1q ethertype and copy the VLAN-related + /* Insert 802.1Q ethertype and copy the VLAN-related * fields, but clear the bit that will hold CFI (since * DSA uses that bit location for another purpose). */ @@ -109,16 +237,13 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, new_header[2] = dsa_header[2] & ~0x10; new_header[3] = dsa_header[3]; - /* - * Move CFI bit from its place in the DSA header to - * its 802.1q-designated place. + /* Move CFI bit from its place in the DSA header to + * its 802.1Q-designated place. */ if (dsa_header[1] & 0x01) new_header[2] |= 0x10; - /* - * Update packet checksum if skb is CHECKSUM_COMPLETE. - */ + /* Update packet checksum if skb is CHECKSUM_COMPLETE. */ if (skb->ip_summed == CHECKSUM_COMPLETE) { __wsum c = skb->csum; c = csum_add(c, csum_partial(new_header + 2, 2, 0)); @@ -127,30 +252,101 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, } memcpy(dsa_header, new_header, DSA_HLEN); + + if (extra) + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - extra, + 2 * ETH_ALEN); } else { - /* - * Remove DSA tag and update checksum. - */ skb_pull_rcsum(skb, DSA_HLEN); memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN, + skb->data - ETH_HLEN - DSA_HLEN - extra, 2 * ETH_ALEN); } - skb->offload_fwd_mark = 1; - return skb; } +#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA) + +static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + return dsa_xmit_ll(skb, dev, 0); +} + +static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) + return NULL; + + return dsa_rcv_ll(skb, dev, 0); +} + static const struct dsa_device_ops dsa_netdev_ops = { - .name = "dsa", - .proto = DSA_TAG_PROTO_DSA, - .xmit = dsa_xmit, - .rcv = dsa_rcv, + .name = "dsa", + .proto = DSA_TAG_PROTO_DSA, + .xmit = dsa_xmit, + .rcv = dsa_rcv, .overhead = DSA_HLEN, }; -MODULE_LICENSE("GPL"); +DSA_TAG_DRIVER(dsa_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA); +#endif /* CONFIG_NET_DSA_TAG_DSA */ + +#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA) + +#define EDSA_HLEN 8 + +static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + u8 *edsa_header; -module_dsa_tag_driver(dsa_netdev_ops); + skb = dsa_xmit_ll(skb, dev, EDSA_HLEN - DSA_HLEN); + if (!skb) + return NULL; + + edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; + edsa_header[1] = ETH_P_EDSA & 0xff; + edsa_header[2] = 0x00; + edsa_header[3] = 0x00; + return skb; +} + +static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) + return NULL; + + skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN); + + return dsa_rcv_ll(skb, dev, EDSA_HLEN - DSA_HLEN); +} + +static const struct dsa_device_ops edsa_netdev_ops = { + .name = "edsa", + .proto = DSA_TAG_PROTO_EDSA, + .xmit = edsa_xmit, + .rcv = edsa_rcv, + .overhead = EDSA_HLEN, +}; + +DSA_TAG_DRIVER(edsa_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); +#endif /* CONFIG_NET_DSA_TAG_EDSA */ + +static struct dsa_tag_driver *dsa_tag_drivers[] = { +#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA) + &DSA_TAG_DRIVER_NAME(dsa_netdev_ops), +#endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA) + &DSA_TAG_DRIVER_NAME(edsa_netdev_ops), +#endif +}; + +module_dsa_tag_drivers(dsa_tag_drivers); + +MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c deleted file mode 100644 index 120614240319..000000000000 --- a/net/dsa/tag_edsa.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * net/dsa/tag_edsa.c - Ethertype DSA tagging - * Copyright (c) 2008-2009 Marvell Semiconductor - */ - -#include <linux/etherdevice.h> -#include <linux/list.h> -#include <linux/slab.h> - -#include "dsa_priv.h" - -#define DSA_HLEN 4 -#define EDSA_HLEN 8 - -#define FRAME_TYPE_TO_CPU 0x00 -#define FRAME_TYPE_FORWARD 0x03 - -#define TO_CPU_CODE_MGMT_TRAP 0x00 -#define TO_CPU_CODE_FRAME2REG 0x01 -#define TO_CPU_CODE_IGMP_MLD_TRAP 0x02 -#define TO_CPU_CODE_POLICY_TRAP 0x03 -#define TO_CPU_CODE_ARP_MIRROR 0x04 -#define TO_CPU_CODE_POLICY_MIRROR 0x05 - -static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - u8 *edsa_header; - - /* - * Convert the outermost 802.1q tag to a DSA tag and prepend - * a DSA ethertype field is the packet is tagged, or insert - * a DSA ethertype plus DSA tag between the addresses and the - * current ethertype field if the packet is untagged. - */ - if (skb->protocol == htons(ETH_P_8021Q)) { - if (skb_cow_head(skb, DSA_HLEN) < 0) - return NULL; - skb_push(skb, DSA_HLEN); - - memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); - - /* - * Construct tagged FROM_CPU DSA tag from 802.1q tag. - */ - edsa_header = skb->data + 2 * ETH_ALEN; - edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; - edsa_header[1] = ETH_P_EDSA & 0xff; - edsa_header[2] = 0x00; - edsa_header[3] = 0x00; - edsa_header[4] = 0x60 | dp->ds->index; - edsa_header[5] = dp->index << 3; - - /* - * Move CFI field from byte 6 to byte 5. - */ - if (edsa_header[6] & 0x10) { - edsa_header[5] |= 0x01; - edsa_header[6] &= ~0x10; - } - } else { - if (skb_cow_head(skb, EDSA_HLEN) < 0) - return NULL; - skb_push(skb, EDSA_HLEN); - - memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN); - - /* - * Construct untagged FROM_CPU DSA tag. - */ - edsa_header = skb->data + 2 * ETH_ALEN; - edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; - edsa_header[1] = ETH_P_EDSA & 0xff; - edsa_header[2] = 0x00; - edsa_header[3] = 0x00; - edsa_header[4] = 0x40 | dp->ds->index; - edsa_header[5] = dp->index << 3; - edsa_header[6] = 0x00; - edsa_header[7] = 0x00; - } - - return skb; -} - -static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) -{ - u8 *edsa_header; - int frame_type; - int code; - int source_device; - int source_port; - - if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) - return NULL; - - /* - * Skip the two null bytes after the ethertype. - */ - edsa_header = skb->data + 2; - - /* - * Check that frame type is either TO_CPU or FORWARD. - */ - frame_type = edsa_header[0] >> 6; - - switch (frame_type) { - case FRAME_TYPE_TO_CPU: - code = (edsa_header[1] & 0x6) | ((edsa_header[2] >> 4) & 1); - - /* - * Mark the frame to never egress on any port of the same switch - * unless it's a trapped IGMP/MLD packet, in which case the - * bridge might want to forward it. - */ - if (code != TO_CPU_CODE_IGMP_MLD_TRAP) - skb->offload_fwd_mark = 1; - - break; - - case FRAME_TYPE_FORWARD: - skb->offload_fwd_mark = 1; - break; - - default: - return NULL; - } - - /* - * Determine source device and port. - */ - source_device = edsa_header[0] & 0x1f; - source_port = (edsa_header[1] >> 3) & 0x1f; - - skb->dev = dsa_master_find_slave(dev, source_device, source_port); - if (!skb->dev) - return NULL; - - /* - * If the 'tagged' bit is set, convert the DSA tag to a 802.1q - * tag and delete the ethertype part. If the 'tagged' bit is - * clear, delete the ethertype and the DSA tag parts. - */ - if (edsa_header[0] & 0x20) { - u8 new_header[4]; - - /* - * Insert 802.1q ethertype and copy the VLAN-related - * fields, but clear the bit that will hold CFI (since - * DSA uses that bit location for another purpose). - */ - new_header[0] = (ETH_P_8021Q >> 8) & 0xff; - new_header[1] = ETH_P_8021Q & 0xff; - new_header[2] = edsa_header[2] & ~0x10; - new_header[3] = edsa_header[3]; - - /* - * Move CFI bit from its place in the DSA header to - * its 802.1q-designated place. - */ - if (edsa_header[1] & 0x01) - new_header[2] |= 0x10; - - skb_pull_rcsum(skb, DSA_HLEN); - - /* - * Update packet checksum if skb is CHECKSUM_COMPLETE. - */ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __wsum c = skb->csum; - c = csum_add(c, csum_partial(new_header + 2, 2, 0)); - c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0)); - skb->csum = c; - } - - memcpy(edsa_header, new_header, DSA_HLEN); - - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN, - 2 * ETH_ALEN); - } else { - /* - * Remove DSA tag and update checksum. - */ - skb_pull_rcsum(skb, EDSA_HLEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - EDSA_HLEN, - 2 * ETH_ALEN); - } - - return skb; -} - -static const struct dsa_device_ops edsa_netdev_ops = { - .name = "edsa", - .proto = DSA_TAG_PROTO_EDSA, - .xmit = edsa_xmit, - .rcv = edsa_rcv, - .overhead = EDSA_HLEN, -}; - -MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); - -module_dsa_tag_driver(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 408d4af390a0..2f5bd5e338ab 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -60,13 +60,8 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - int err; u8 *gswip_tag; - err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN); - if (err) - return NULL; - skb_push(skb, GSWIP_TX_HEADER_LEN); gswip_tag = skb->data; diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c new file mode 100644 index 000000000000..a09805c8e1ab --- /dev/null +++ b/net/dsa/tag_hellcreek.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* + * net/dsa/tag_hellcreek.c - Hirschmann Hellcreek switch tag format handling + * + * Copyright (C) 2019,2020 Linutronix GmbH + * Author Kurt Kanzenbach <kurt@linutronix.de> + * + * Based on tag_ksz.c. + */ + +#include <linux/skbuff.h> +#include <net/dsa.h> + +#include "dsa_priv.h" + +#define HELLCREEK_TAG_LEN 1 + +static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *tag; + + /* Tag encoding */ + tag = skb_put(skb, HELLCREEK_TAG_LEN); + *tag = BIT(dp->index); + + return skb; +} + +static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + /* Tag decoding */ + u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN; + unsigned int port = tag[0] & 0x03; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + netdev_warn(dev, "Failed to get source port: %d\n", port); + return NULL; + } + + pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); + + skb->offload_fwd_mark = true; + + return skb; +} + +static const struct dsa_device_ops hellcreek_netdev_ops = { + .name = "hellcreek", + .proto = DSA_TAG_PROTO_HELLCREEK, + .xmit = hellcreek_xmit, + .rcv = hellcreek_rcv, + .overhead = HELLCREEK_TAG_LEN, + .tail_tag = true, +}; + +MODULE_LICENSE("Dual MIT/GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK); + +module_dsa_tag_driver(hellcreek_netdev_ops); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0a5aa982c60d..4820dbcedfa2 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -14,46 +14,6 @@ #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 -static struct sk_buff *ksz_common_xmit(struct sk_buff *skb, - struct net_device *dev, int len) -{ - struct sk_buff *nskb; - int padlen; - - padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; - - if (skb_tailroom(skb) >= padlen + len) { - /* Let dsa_slave_xmit() free skb */ - if (__skb_put_padto(skb, skb->len + padlen, false)) - return NULL; - - nskb = skb; - } else { - nskb = alloc_skb(NET_IP_ALIGN + skb->len + - padlen + len, GFP_ATOMIC); - if (!nskb) - return NULL; - skb_reserve(nskb, NET_IP_ALIGN); - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, - skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, - skb_transport_header(skb) - skb->head); - skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - - /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free - * skb - */ - if (skb_put_padto(nskb, nskb->len + padlen)) - return NULL; - - consume_skb(skb); - } - - return nskb; -} - static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int port, unsigned int len) @@ -90,23 +50,18 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; u8 *tag; u8 *addr; - nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); *tag = 1 << dp->index; if (is_link_local_ether_addr(addr)) *tag |= KSZ8795_TAIL_TAG_OVERRIDE; - return nskb; + return skb; } static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, @@ -156,18 +111,13 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; __be16 *tag; u8 *addr; u16 val; - nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); val = BIT(dp->index); @@ -176,7 +126,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, *tag = cpu_to_be16(val); - return nskb; + return skb; } static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, @@ -213,24 +163,19 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; u8 *addr; u8 *tag; - nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); *tag = BIT(dp->index); if (is_link_local_ether_addr(addr)) *tag |= KSZ9893_TAIL_TAG_OVERRIDE; - return nskb; + return skb; } static const struct dsa_device_ops ksz9893_netdev_ops = { diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index ccfb6f641bbf..aa1318dccaf0 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -58,15 +58,6 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) __be16 *lan9303_tag; u16 tag; - /* insert a special VLAN tag between the MAC addresses - * and the current ethertype field. - */ - if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) { - dev_dbg(&dev->dev, - "Cannot make room for the special tag. Dropping packet\n"); - return NULL; - } - /* provide 'LAN9303_TAG_LEN' bytes additional space */ skb_push(skb, LAN9303_TAG_LEN); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 4cdd9cf428fb..38dcdded74c0 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -34,9 +34,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, * table with VID. */ if (!skb_vlan_tagged(skb)) { - if (skb_cow_head(skb, MTK_HDR_LEN) < 0) - return NULL; - skb_push(skb, MTK_HDR_LEN); memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); is_vlan_skb = false; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 3b468aca5c53..16a1afd5b8e1 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -143,13 +143,6 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct ocelot_port *ocelot_port; u8 *prefix, *injection; u64 qos_class, rew_op; - int err; - - err = skb_cow_head(skb, OCELOT_TOTAL_TAG_LEN); - if (unlikely(err < 0)) { - netdev_err(netdev, "Cannot make room for tag.\n"); - return NULL; - } ocelot_port = ocelot->ports[dp->index]; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1b9e8507112b..88181b52f480 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -34,9 +34,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) __be16 *phdr; u16 hdr; - if (skb_cow_head(skb, QCA_HDR_LEN) < 0) - return NULL; - skb_push(skb, QCA_HDR_LEN); memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 3a1cc24a4f0a..5b97ede56a0f 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -13,42 +13,15 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; - int padlen; u8 *trailer; - /* - * We have to make sure that the trailer ends up as the very - * last 4 bytes of the packet. This means that we have to pad - * the packet to the minimum ethernet frame size, if necessary, - * before adding the trailer. - */ - padlen = 0; - if (skb->len < 60) - padlen = 60 - skb->len; - - nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); - if (!nskb) - return NULL; - skb_reserve(nskb, NET_IP_ALIGN); - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); - skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - consume_skb(skb); - - if (padlen) { - skb_put_zero(nskb, padlen); - } - - trailer = skb_put(nskb, 4); + trailer = skb_put(skb, 4); trailer[0] = 0x80; trailer[1] = 1 << dp->index; trailer[2] = 0x10; trailer[3] = 0x00; - return nskb; + return skb; } static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index dac65180c4ef..4106373180c6 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -272,7 +272,7 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** - * eth_header_parser_protocol - extract protocol from L2 header + * eth_header_parse_protocol - extract protocol from L2 header * @skb: packet to extract protocol from */ __be16 eth_header_parse_protocol(const struct sk_buff *skb) @@ -523,8 +523,8 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) EXPORT_SYMBOL(eth_platform_get_mac_address); /** - * Obtain the MAC address from an nvmem cell named 'mac-address' associated - * with given device. + * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named + * 'mac-address' associated with given device. * * @dev: Device with which the mac-address cell is associated. * @addrbuf: Buffer to which the MAC address will be copied on success. diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 8ee4cdbd6b82..1c9f4df273bd 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -280,7 +280,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) active_diff_mask, compact); } if (mod) - ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); + netdev_features_change(dev); out_rtnl: rtnl_unlock(); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index ec2cd7aab5ad..771688e1b0da 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2433,7 +2433,7 @@ static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; - }; + } } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 6d091e419d3e..9c640d670ffe 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -149,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; - nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b7260c8cef2e..b94fa8eb831b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); if (err) return err; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 687971d83b4e..922dd73e5740 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -125,6 +125,7 @@ static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); +static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, @@ -156,6 +157,7 @@ struct neigh_table arp_tbl = { .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, + .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, @@ -928,6 +930,10 @@ static void parp_redo(struct sk_buff *skb) arp_process(dev_net(skb->dev), NULL, skb); } +static int arp_is_multicast(const void *pkey) +{ + return ipv4_is_multicast(*((__be32 *)pkey)); +} /* * Receive an arp request from the device layer. diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 123a6d39438f..75f67994fc85 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -650,8 +650,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; - - int err = -EINVAL; + int err; ASSERT_RTNL(); @@ -881,7 +880,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86a23e4a6a50..b87140a1fa28 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -696,7 +696,7 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: -#ifdef CONFIG_IPV6 +#if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1f75dc686b6b..b5400cec4f69 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -973,7 +973,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) @@ -1641,9 +1641,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, break; } - *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); - if (nhc->nhc_flags & RTNH_F_OFFLOAD) - *flags |= RTNH_F_OFFLOAD; + *flags |= (nhc->nhc_flags & + (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ffc5332f1390..28117c05dc35 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2100,15 +2100,6 @@ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); - - /* call_fib_entry_notifiers will be removed when - * in-kernel notifier is implemented and supported - * for nexthop objects - */ - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); } } } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..f60869acbef0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -787,7 +787,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL); + inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 366a4507b5a3..93474b1bea4e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -479,8 +479,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8cbe74313f38..45fb450b4522 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,6 +20,9 @@ #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/inet6_hashtables.h> +#endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> @@ -508,10 +511,52 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * (The another one can be a SYN_RECV or TIMEWAIT +/* Searches for an exsiting socket in the ehash bucket list. + * Returns true if found, false otherwise. */ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) +{ + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(INET_MATCH(esk, net, acookie, + sk->sk_daddr, + sk->sk_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(INET6_MATCH(esk, net, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and sets found_dup_sk parameter to true. + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; @@ -530,16 +575,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -583,7 +635,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, NULL); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -679,7 +731,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -758,7 +810,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e70291748889..a68bf4c6fe9b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -920,7 +920,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; @@ -1275,7 +1275,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; @@ -1308,7 +1308,7 @@ static const struct net_device_ops erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 8b04d1dcfec4..ee65c9225178 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -608,9 +608,6 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - if (!df && skb->protocol == htons(ETH_P_IP)) - df = inner_iph->frag_off & htons(IP_DF); - headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (headroom > dev->needed_headroom) dev->needed_headroom = headroom; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 25f1caf5abf9..7ca338fbe8ba 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -263,7 +263,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); - if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || @@ -359,7 +359,7 @@ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) __be16 frag_off; int offset; - if (mtu <= IPV6_MIN_MTU) + if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || @@ -429,15 +429,6 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, } EXPORT_SYMBOL(skb_tunnel_check_pmtu); -/* Often modified stats are per cpu, other are shared (netdev->stats) */ -void ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - netdev_stats_to_stats64(tot, &dev->stats); - dev_fetch_sw_netstats(tot, dev->tstats); -} -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); - static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b957cbee2cf7..abc171e79d3e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -404,7 +404,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_start_xmit = vti_tunnel_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 561f15b5a944..3cd13e1bc6a7 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1441,7 +1441,7 @@ static int __init ip_auto_config(void) int retries = CONF_OPEN_RETRIES; #endif int err; - unsigned int i; + unsigned int i, count; /* Initialise all name servers and NTP servers to NONE (but only if the * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded, @@ -1575,7 +1575,7 @@ static int __init ip_auto_config(void) if (ic_dev_mtu) pr_cont(", mtu=%d", ic_dev_mtu); /* Name servers (if any): */ - for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { + for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) { if (i == 0) pr_info(" nameserver%u=%pI4", @@ -1583,12 +1583,14 @@ static int __init ip_auto_config(void) else pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); + + count++; } - if (i + 1 == CONF_NAMESERVERS_MAX) + if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0) pr_cont("\n"); } /* NTP servers (if any): */ - for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) { + for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) { if (ic_ntp_servers[i] != NONE) { if (i == 0) pr_info(" ntpserver%u=%pI4", @@ -1596,8 +1598,10 @@ static int __init ip_auto_config(void) else pr_cont(", ntpserver%u=%pI4", i, &ic_ntp_servers[i]); + + count++; } - if (i + 1 == CONF_NTP_SERVERS_MAX) + if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0) pr_cont("\n"); } #endif /* !SILENT */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 75d35e76bec2..d5bfa087c23a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, }; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 3205d5f7c8c9..25ea6ac44db9 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index a058213b77a7..7c841037c533 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,17 +17,19 @@ #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - const struct sock *sk = skb_to_full_sk(skb); - __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0; + __u8 flags; struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; + sk = sk_to_full_sk(sk); + flags = sk ? inet_sk_flowi_flags(sk) : 0; + if (addr_type == RTN_UNSPEC) addr_type = inet_addr_type_dev_table(net, dev, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index f703a717ab1d..833079589273 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -62,7 +62,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 9dcfa4e461b6..4109055cdea6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,128 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> +static int nf_reject_iphdr_validate(struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return 0; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return 0; + + len = ntohs(iph->tot_len); + if (skb->len < len) + return 0; + else if (len < (iph->ihl*4)) + return 0; + + if (!pskb_may_pull(skb, iph->ihl*4)) + return 0; + + return 1; +} + +struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + const struct tcphdr *oth; + struct sk_buff *nskb; + struct iphdr *niph; + struct tcphdr _oth; + + if (!nf_reject_iphdr_validate(oldskb)) + return NULL; + + oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); + if (!oth) + return NULL; + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + net->ipv4.sysctl_ip_default_ttl); + nf_reject_ip_tcphdr_put(nskb, oldskb, oth); + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); + +struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + struct iphdr *niph; + struct icmphdr *icmph; + unsigned int len; + __wsum csum; + u8 proto; + + if (!nf_reject_iphdr_validate(oldskb)) + return NULL; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return NULL; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + len = min_t(unsigned int, 536, oldskb->len); + + if (!pskb_may_pull(oldskb, len)) + return NULL; + + if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) + return NULL; + + proto = ip_hdr(oldskb)->protocol; + + if (!skb_csum_unnecessary(oldskb) && + nf_reject_verify_csum(proto) && + nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) + return NULL; + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) + + LL_MAX_HEADER + len, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, + net->ipv4.sysctl_ip_default_ttl); + + skb_reset_transport_header(nskb); + icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = code; + + skb_put_data(nskb, skb_network_header(oldskb), len); + + csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0); + icmph->checksum = csum_fold(csum); + + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); + const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook) { @@ -124,7 +246,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; - if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && + nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -145,7 +268,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; niph = ip_hdr(nskb); @@ -193,7 +316,8 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && + nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 0dc43ad28eb9..5e1b22d4f939 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -36,14 +36,145 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static bool nexthop_notifiers_is_empty(struct net *net) +{ + return !net->nexthop.notifier_chain.head; +} + +static void +__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, + const struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + nh_info->dev = nhi->fib_nhc.nhc_dev; + nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; + if (nh_info->gw_family == AF_INET) + nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; + else if (nh_info->gw_family == AF_INET6) + nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; + + nh_info->is_reject = nhi->reject_nh; + nh_info->is_fdb = nhi->fdb_nh; + nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; +} + +static int nh_notifier_single_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); + if (!info->nh) + return -ENOMEM; + + __nh_notifier_single_info_init(info->nh, nh); + + return 0; +} + +static void nh_notifier_single_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh); +} + +static int nh_notifier_grp_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + u16 num_nh = nhg->num_nh; + int i; + + info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), + GFP_KERNEL); + if (!info->nh_grp) + return -ENOMEM; + + info->nh_grp->num_nh = num_nh; + info->nh_grp->is_fdb = nhg->fdb_nh; + + for (i = 0; i < num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + info->nh_grp->nh_entries[i].id = nhge->nh->id; + info->nh_grp->nh_entries[i].weight = nhge->weight; + __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, + nhge->nh); + } + + return 0; +} + +static void nh_notifier_grp_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh_grp); +} + +static int nh_notifier_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + info->id = nh->id; + info->is_grp = nh->is_group; + + if (info->is_grp) + return nh_notifier_grp_info_init(info, nh); + else + return nh_notifier_single_info_init(info, nh); +} + +static void nh_notifier_info_fini(struct nh_notifier_info *info) +{ + if (info->is_grp) + nh_notifier_grp_info_fini(info); + else + nh_notifier_single_info_fini(info); +} + static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, - struct nexthop *nh) + struct nexthop *nh, + struct netlink_ext_ack *extack) { + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; int err; + ASSERT_RTNL(); + + if (nexthop_notifiers_is_empty(net)) + return 0; + + err = nh_notifier_info_init(&info, nh); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); + return err; + } + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + event_type, &info); + nh_notifier_info_fini(&info); + + return notifier_to_errno(err); +} + +static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; + int err; + + err = nh_notifier_info_init(&info, nh); + if (err) + return err; + + err = nb->notifier_call(nb, event_type, &info); + nh_notifier_info_fini(&info); + return notifier_to_errno(err); } @@ -782,9 +913,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; + struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; - int i, j; + int i, j, err; WARN_ON(!nh); @@ -832,6 +964,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, list_del(&nhge->nh_list); nexthop_put(nhge->nh); + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); + if (err) + pr_err("%s\n", extack._msg); + if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } @@ -907,7 +1043,7 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -940,13 +1076,17 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct netlink_ext_ack *extack) { struct nh_group *oldg, *newg; - int i; + int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); + if (err) + return err; + oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); @@ -985,31 +1125,54 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { + u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; + struct nh_grp_entry *nhge; + int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); + if (err) + return err; + + /* Hardware flags were set on 'old' as 'new' is not in the red-black + * tree. Therefore, inherit the flags from 'old' to 'new'. + */ + new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); + oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; + old_protocol = old->protocol; + old_nh_flags = old->nh_flags; + old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); + /* Send a replace notification for all the groups using the nexthop. */ + list_for_each_entry(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, + extack); + if (err) + goto err_notify; + } + /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { - struct nh_grp_entry *nhge; - list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; @@ -1020,6 +1183,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, } return 0; + +err_notify: + rcu_assign_pointer(new->nh_info, newi); + rcu_assign_pointer(old->nh_info, oldi); + old->nh_flags = old_nh_flags; + old->protocol = old_protocol; + oldi->nh_parent = old; + newi->nh_parent = new; + list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + + call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack); + } + call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); + return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, @@ -1168,7 +1346,11 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); - rc = 0; + + rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); + if (rc) + rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); + out: if (!rc) { nh_base_seq_inc(net); @@ -1957,10 +2139,40 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; -int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +static int nexthops_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + int err = 0; + + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + nh = rb_entry(node, struct nexthop, rb_node); + err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh, + extack); + if (err) + break; + } + + return err; +} + +int register_nexthop_notifier(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return blocking_notifier_chain_register(&net->nexthop.notifier_chain, - nb); + int err; + + rtnl_lock(); + err = nexthops_dump(net, nb, extack); + if (err) + goto unlock; + err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); +unlock: + rtnl_unlock(); + return err; } EXPORT_SYMBOL(register_nexthop_notifier); @@ -1971,6 +2183,27 @@ int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) } EXPORT_SYMBOL(unregister_nexthop_notifier); +void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) +{ + struct nexthop *nexthop; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop) + goto out; + + nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + nexthop->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + nexthop->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_set_hw_flags); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8d5e1695b9aa..63cd370ea29d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -167,6 +167,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), + SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dc2a399cd9f4..e26652ff7059 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1741,7 +1741,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) return -ENOBUFS; @@ -1857,8 +1857,8 @@ static int __mkroute_input(struct sk_buff *skb, } rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM)); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -2227,7 +2227,7 @@ local_input: rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, flags | RTCF_LOCAL, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; @@ -2450,8 +2450,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, add: rth = rt_dst_alloc(dev_out, flags, type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM)); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); @@ -2872,6 +2872,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; + if (rt->dst.lwtstate && + lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) @@ -3222,7 +3225,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -3246,8 +3249,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; - err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, - dev, &res); + err = ip_route_input_rcu(skb, dst, src, + rtm->rtm_tos & IPTOS_RT_MASK, dev, + &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30..00dc3f943c80 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -331,7 +331,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -427,8 +427,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bae4284bf542..75a28b8f4470 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,6 +485,8 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, return true; if (tcp_rmem_pressure(sk)) return true; + if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) + return true; } if (sk->sk_prot->stream_memory_read) return sk->sk_prot->stream_memory_read(sk); @@ -952,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) * importantly be able to generate EPOLLOUT for Edge Trigger epoll() * users. */ -static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) { if (skb && !skb->len) { tcp_unlink_write_queue(skb, sk); @@ -962,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) } } +struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool can_coalesce; + int copy, i; + + if (!skb || (copy = size_goal - skb->len) <= 0 || + !tcp_skb_can_collapse_to(skb)) { +new_segment: + if (!sk_stream_memory_free(sk)) + return NULL; + + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + tcp_rtx_and_write_queues_empty(sk)); + if (!skb) + return NULL; + +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif + skb_entail(sk, skb); + copy = size_goal; + } + + if (copy > *size) + copy = *size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (!sk_wmem_schedule(sk, copy)) + return NULL; + + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); + skb->ip_summed = CHECKSUM_PARTIAL; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + *size = copy; + return skb; +} + ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -997,60 +1061,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; while (size > 0) { - struct sk_buff *skb = tcp_write_queue_tail(sk); - int copy, i; - bool can_coalesce; + struct sk_buff *skb; + size_t copy = size; - if (!skb || (copy = size_goal - skb->len) <= 0 || - !tcp_skb_can_collapse_to(skb)) { -new_segment: - if (!sk_stream_memory_free(sk)) - goto wait_for_space; - - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - tcp_rtx_and_write_queues_empty(sk)); - if (!skb) - goto wait_for_space; - -#ifdef CONFIG_TLS_DEVICE - skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); -#endif - skb_entail(sk, skb); - copy = size_goal; - } - - if (copy > size) - copy = size; - - i = skb_shinfo(skb)->nr_frags; - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { - tcp_mark_push(tp, skb); - goto new_segment; - } - if (!sk_wmem_schedule(sk, copy)) + skb = tcp_build_frag(sk, size_goal, flags, page, offset, ©); + if (!skb) goto wait_for_space; - if (can_coalesce) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); - } - - if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk_wmem_queued_add(sk, copy); - sk_mem_charge(sk, copy); - skb->ip_summed = CHECKSUM_PARTIAL; - WRITE_ONCE(tp->write_seq, tp->write_seq + copy); - TCP_SKB_CB(skb)->end_seq += copy; - tcp_skb_pcount_set(skb, 0); - if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -2403,13 +2420,12 @@ bool tcp_check_oom(struct sock *sk, int shift) return too_many_orphans || out_of_socket_memory; } -void tcp_close(struct sock *sk, long timeout) +void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TCP_LISTEN) { @@ -2573,6 +2589,12 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); +} + +void tcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } @@ -3020,6 +3042,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepcnt); +int tcp_set_window_clamp(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!val) { + if (sk->sk_state != TCP_CLOSE) + return -EINVAL; + tp->window_clamp = 0; + } else { + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? + SOCK_MIN_RCVBUF / 2 : val; + } + return 0; +} + /* * Socket option code for TCP. */ @@ -3233,15 +3270,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_WINDOW_CLAMP: - if (!val) { - if (sk->sk_state != TCP_CLOSE) { - err = -EINVAL; - break; - } - tp->window_clamp = 0; - } else - tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6c4d79baff26..6ea3dc2e4219 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -945,7 +945,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || + (rs->rtt_us < bbr->min_rtt_us || (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 37f4cb2bba5c..bc7d2a586e18 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -15,8 +15,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; - int i, ret, copied = 0; struct sk_msg *msg_rx; + int i, copied = 0; msg_rx = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); @@ -37,17 +37,16 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - ret = copy_page_to_iter(page, sge->offset, copy, iter); - if (ret != copy) { - msg_rx->sg.start = i; - return -EFAULT; - } + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - sk_mem_uncharge(sk, copy); + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); msg_rx->sg.size -= copy; if (!sge->length) { @@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, put_page(page); } } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + return copied; sk_msg_iter_var_next(i); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db47ac24d057..563d016e7478 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -198,6 +198,11 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fc445833b5e5..9e8a6c1aa019 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2546,7 +2546,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. - * But when the retransmits are acked without further losses, PRR + * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) @@ -2563,7 +2563,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2577,8 +2577,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == - FLAG_RETRANS_DATA_ACKED) { + } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -3419,7 +3418,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ - tcp_cwnd_reduction(sk, acked_sacked, flag); + tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); @@ -4908,7 +4907,8 @@ void tcp_data_ready(struct sock *sk) int avail = tp->rcv_nxt - tp->copied_seq; if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && - !sock_flag(sk, SOCK_DONE)) + !sock_flag(sk, SOCK_DONE) && + tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) return; sk->sk_data_ready(sk); @@ -6799,18 +6799,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7352c097ae48..af2338294598 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -980,17 +980,22 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); - tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; - if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : + inet_sk(sk)->tos; + + if (!INET_ECN_is_capable(tos) && + tcp_bpf_ca_needs_ecn((struct sock *)req)) + tos |= INET_ECN_ECT_0; + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), - tos & ~INET_ECN_MASK); + tos); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1439,9 +1444,15 @@ static void tcp_v4_init_req(struct request_sock *req, } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v4_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet_csk_route_req(sk, &fl->u.ip4, req); } @@ -1461,7 +1472,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif - .init_req = tcp_v4_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif @@ -1498,6 +1508,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bool *own_req) { struct inet_request_sock *ireq; + bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1575,12 +1586,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { - newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } else { + newinet->inet_opt = NULL; + } } return newsk; @@ -2740,6 +2761,20 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). + */ +bool tcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); + + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); +} +EXPORT_SYMBOL(tcp_stream_memory_free); + struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 8c643a4ffad1..e6459537d4d2 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -89,6 +89,7 @@ struct lp { /** * tcp_lp_init + * @sk: socket to initialize congestion control algorithm for * * Init all required variables. * Clone the handling from Vegas module implementation. @@ -111,6 +112,7 @@ static void tcp_lp_init(struct sock *sk) /** * tcp_lp_cong_avoid + * @sk: socket to avoid congesting * * Implementation of cong_avoid. * Will only call newReno CA when away from inference. @@ -126,6 +128,7 @@ static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) /** * tcp_lp_remote_hz_estimator + * @sk: socket which needs an estimate for the remote HZs * * Estimate remote HZ. * We keep on updating the estimated value, where original TCP-LP @@ -176,6 +179,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) /** * tcp_lp_owd_calculator + * @sk: socket to calculate one way delay for * * Calculate one way delay (in relative format). * Original implement OWD as minus of remote time difference to local time @@ -210,6 +214,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) /** * tcp_lp_rtt_sample + * @sk: socket to add a rtt sample to + * @rtt: round trip time, which is ignored! * * Implementation or rtt_sample. * Will take the following action, @@ -254,6 +260,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) /** * tcp_lp_pkts_acked + * @sk: socket requiring congestion avoidance calculations * * Implementation of pkts_acked. * Deal with active drop under Early Congestion Indication. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e967..41880d3521ed 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -445,11 +445,12 @@ struct tcp_out_options { struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp, + struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, &opts->mptcp); + mptcp_write_options(ptr, tp, &opts->mptcp); #endif } @@ -701,7 +702,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, opts); + mptcp_options_write(ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -1038,9 +1039,9 @@ static void tcp_tsq_handler(struct sock *sk) * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ -static void tcp_tasklet_func(unsigned long data) +static void tcp_tasklet_func(struct tasklet_struct *t) { - struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; @@ -1125,9 +1126,7 @@ void __init tcp_tasklet_init(void) struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); - tasklet_init(&tsq->tasklet, - tcp_tasklet_func, - (unsigned long)tsq); + tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } @@ -1348,7 +1347,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); @@ -1359,6 +1357,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ th->window = htons(min(tp->rcv_wnd, 65535U)); } + + tcp_options_write((__be32 *)(th + 1), tp, &opts); + #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { @@ -1569,6 +1570,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2123,6 +2125,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2393,6 +2396,7 @@ static int tcp_mtu_probe(struct sock *sk) skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); + mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index f65a3ddd0d58..177307a3081f 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -153,6 +153,7 @@ void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout, prior_inflight; + u32 lost = tp->lost; prior_inflight = tcp_packets_in_flight(tp); tcp_rack_detect_loss(sk, &timeout); @@ -160,7 +161,7 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); if (!inet_csk(sk)->icsk_ca_ops->cong_control) - tcp_cwnd_reduction(sk, 1, 0); + tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0); } tcp_xmit_retransmit_queue(sk); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..a3f105227ccc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -541,7 +541,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, inet_sdif(skb), udptable, skb); } -struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); @@ -550,7 +550,6 @@ struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), &udp_table, NULL); } -EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. @@ -702,7 +701,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); - if (!sk) { + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udp_encap_needed_key)) { @@ -874,7 +873,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; - int err = 0; + int err; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; @@ -2038,6 +2037,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rc == -ENOMEM) UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); + else + UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, + is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); trace_udp_fail_queue_rcv_skb(rc, sk); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 1dbece34496e..b2cee9a307d4 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -30,7 +30,7 @@ static int udp_dump_one(struct udp_table *tbl, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; - int err = -EINVAL; + int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e67a66fbf27b..ff39e94781bf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -49,6 +49,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb_set_transport_header(skb, skb_inner_transport_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; @@ -67,6 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); features &= skb->dev->hw_enc_features; + /* CRC checksum can't be handled by HW when it's a UDP tunneling packet. */ + features &= ~NETIF_F_SCTP_CRC; /* The only checksum offload we care about from here on out is the * outer one so strip the existing checksum feature flags and @@ -366,7 +369,7 @@ out: static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { - struct udphdr *uh = udp_hdr(skb); + struct udphdr *uh = udp_gro_udphdr(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; @@ -500,12 +503,22 @@ out: } EXPORT_SYMBOL(udp_gro_receive); +static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -523,7 +536,10 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; rcu_read_lock(); - sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udp_encap_needed_key)) + sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; @@ -551,8 +567,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, { __be16 newlen = htons(skb->len - nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - int err = -ENOSYS; struct sock *sk; + int err; uh->len = newlen; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index dc19aff7c2e0..fb0648e7fb32 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -64,14 +64,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, - .priority = 3, + .priority = 4, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, - .priority = 2, + .priority = 3, }; #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 01146b66d666..eff2cacd5209 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1997,6 +1997,7 @@ EXPORT_SYMBOL(ipv6_chk_prefix); * ipv6_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address + * @dev: used to find the L3 domain of interest * * The caller should be protected by RCU, or RTNL. */ @@ -5022,8 +5023,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, return -EMSGSIZE; if (args->netnsid >= 0 && - nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || @@ -5054,8 +5057,10 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return -EMSGSIZE; if (args->netnsid >= 0 && - nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 642fc6ac13d2..8a22486cf270 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -306,7 +306,9 @@ static int ip6addrlbl_del(struct net *net, /* add default label */ static int __net_init ip6addrlbl_net_init(struct net *net) { - int err = 0; + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *n; + int err; int i; ADDRLABEL(KERN_DEBUG "%s\n", __func__); @@ -315,14 +317,20 @@ static int __net_init ip6addrlbl_net_init(struct net *net) INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { - int ret = ip6addrlbl_add(net, - ip6addrlbl_init_table[i].prefix, - ip6addrlbl_init_table[i].prefixlen, - 0, - ip6addrlbl_init_table[i].label, 0); - /* XXX: should we free all rules when we catch an error? */ - if (ret && (!err || err != -ENOMEM)) - err = ret; + err = ip6addrlbl_add(net, + ip6addrlbl_init_table[i].prefix, + ip6addrlbl_init_table[i].prefixlen, + 0, + ip6addrlbl_init_table[i].label, 0); + if (err) + goto err_ip6addrlbl_add; + } + return 0; + +err_ip6addrlbl_add: + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + kfree_rcu(p, rcu); } return err; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..a7e3d170af51 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); if (err) return err; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index d88d97617f7e..440080da805b 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -588,7 +588,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) + err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN); + if (err) goto out_free; ip6h->priority = 0; diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 78f766019b7e..51184a70ac7e 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -423,7 +423,7 @@ static void calipso_doi_free_rcu(struct rcu_head *entry) /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will @@ -1226,7 +1226,7 @@ static int calipso_req_setattr(struct request_sock *req, /** * calipso_req_delattr - Delete the CALIPSO option from a request socket - * @reg: the request socket + * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 374105e4394f..6126f8bf94b3 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -906,11 +906,6 @@ void ipv6_exthdrs_exit(void) /* * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ -static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) -{ - return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); -} - static inline struct net *ipv6_skb_net(struct sk_buff *skb) { return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ec448b71bf9a..8956144ea65e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -158,7 +158,13 @@ static bool is_ineligible(const struct sk_buff *skb) tp = skb_header_pointer(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), sizeof(_type), &_type); - if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) + + /* Based on RFC 8200, Section 4.5 Fragment Header, return + * false if this is a fragment packet with no icmp header info. + */ + if (!tp && frag_off != 0) + return false; + else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) return true; } return false; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 931b186d2e48..c3bc89b6b1a1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1133,8 +1133,13 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, return; if (rt->dst.dev) { - dev->needed_headroom = rt->dst.dev->hard_header_len + - t_hlen; + unsigned short dst_len = rt->dst.dev->hard_header_len + + t_hlen; + + if (t->dev->header_ops) + dev->hard_header_len = dst_len; + else + dev->needed_headroom = dst_len; if (set_mtu) { dev->mtu = rt->dst.dev->mtu - t_hlen; @@ -1159,7 +1164,12 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; + + if (tunnel->dev->header_ops) + tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + else + tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; + return t_hlen; } @@ -1391,7 +1401,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_do_ioctl = ip6gre_tunnel_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1828,7 +1838,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1896,7 +1906,7 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a0217e5bf3bc..a7950baa05e5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -94,36 +94,6 @@ static inline int ip6_tnl_mpls_supported(void) return IS_ENABLED(CONFIG_MPLS); } -static struct net_device_stats *ip6_get_stats(struct net_device *dev) -{ - struct pcpu_sw_netstats tmp, sum = { 0 }; - int i; - - for_each_possible_cpu(i) { - unsigned int start; - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - tmp.rx_packets = tstats->rx_packets; - tmp.rx_bytes = tstats->rx_bytes; - tmp.tx_packets = tstats->tx_packets; - tmp.tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - sum.rx_packets += tmp.rx_packets; - sum.rx_bytes += tmp.rx_bytes; - sum.tx_packets += tmp.tx_packets; - sum.tx_bytes += tmp.tx_bytes; - } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; -} - #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -204,6 +174,7 @@ ip6_tnl_lookup(struct net *net, int link, /** * ip6_tnl_bucket - get head of list matching given tunnel parameters + * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: @@ -230,6 +201,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) /** * ip6_tnl_link - add tunnel to hash table + * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ @@ -246,6 +218,7 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) /** * ip6_tnl_unlink - remove tunnel from hash table + * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ @@ -417,6 +390,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) /** * parse_tvl_tnl_enc_lim - handle encapsulation limit option * @skb: received socket buffer + * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, @@ -485,14 +459,9 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); -/** - * ip6_tnl_err - tunnel error handler - * - * Description: - * ip6_tnl_err() should handle errors in the tunnel according - * to the specifications in RFC 2473. - **/ - +/* ip6_tnl_err() should handle errors in the tunnel according to the + * specifications in RFC 2473. + */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) @@ -1271,6 +1240,8 @@ route_lookup: if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; + skb_set_inner_ipproto(skb, proto); + err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; @@ -1280,8 +1251,6 @@ route_lookup: ipv6_push_frag_opts(skb, &opt.ops, &proto); } - skb_set_inner_ipproto(skb, proto); - skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1835,7 +1804,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats = ip6_get_stats, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5f9c4fdc120d..0225fd694192 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -125,6 +125,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, /** * vti6_tnl_bucket - get head of list matching given tunnel parameters + * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: @@ -889,7 +890,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_do_ioctl = vti6_ioctl, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 43a894bf9a1b..a6804a7e34c1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1148,7 +1148,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = optval; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; msg.msg_control_is_user = true; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 8cd2782a31e4..6c8604390266 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -548,7 +548,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage *p) + struct sockaddr_storage __user *p) { int err, i, count, copycount; const struct in6_addr *group; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 27f29b957ee7..76717478f173 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -81,6 +81,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); +static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, @@ -115,6 +116,7 @@ struct neigh_table nd_tbl = { .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, + .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { @@ -1706,6 +1708,11 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static int ndisc_is_multicast(const void *pkey) +{ + return ipv6_addr_is_multicast((struct in6_addr *)pkey); +} + static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 6d0e942d082d..ab9a279dd6d4 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -20,10 +20,10 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include "../bridge/br_private.h" -int ip6_route_me_harder(struct net *net, struct sk_buff *skb) +int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk = sk_to_full_sk(skb->sk); + struct sock *sk = sk_to_full_sk(sk_partial); unsigned int hh_len; struct dst_entry *dst; int strict = (ipv6_addr_type(&iph->daddr) & @@ -84,7 +84,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(entry->state.net, skb); + return ip6_route_me_harder(entry->state.net, entry->state.sk, skb); } return 0; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 1a2748611e00..cee74803d7a1 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -57,7 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(state->net, skb); + err = ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 054d287eb13d..c129ad334eb3 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -440,6 +440,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; + u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; @@ -455,6 +456,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; + /* Discard the first fragment if it does not include all headers + * RFC 8200, Section 4.5 + */ + if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { + pr_debug("Drop incomplete fragment\n"); + return 0; + } + if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 4aef6baaa55e..aa35e6e37c1f 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,140 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> +static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int thoff; + __be16 fo; + u8 proto = ip6h->nexthdr; + + if (skb_csum_unnecessary(skb)) + return true; + + if (ip6h->payload_len && + pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) + return false; + + ip6h = ipv6_hdr(skb); + thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) + return false; + + if (!nf_reject_verify_csum(proto)) + return true; + + return nf_ip6_checksum(skb, hook, thoff, proto) == 0; +} + +static int nf_reject_ip6hdr_validate(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return 0; + + pkt_len = ntohs(hdr->payload_len); + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return 0; + + return 1; +} + +struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + const struct tcphdr *oth; + struct tcphdr _oth; + unsigned int otcplen; + struct ipv6hdr *nip6h; + + if (!nf_reject_ip6hdr_validate(oldskb)) + return NULL; + + oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook); + if (!oth) + return NULL; + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, + net->ipv6.devconf_all->hop_limit); + nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); + nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); + +struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + struct ipv6hdr *nip6h; + struct icmp6hdr *icmp6h; + unsigned int len; + + if (!nf_reject_ip6hdr_validate(oldskb)) + return NULL; + + /* Include "As much of invoking packet as possible without the ICMPv6 + * packet exceeding the minimum IPv6 MTU" in the ICMP payload. + */ + len = min_t(unsigned int, 1220, oldskb->len); + + if (!pskb_may_pull(oldskb, len)) + return NULL; + + if (!nf_reject_v6_csum_ok(oldskb, hook)) + return NULL; + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + + LL_MAX_HEADER + len, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, + net->ipv6.devconf_all->hop_limit); + + skb_reset_transport_header(nskb); + icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); + icmp6h->icmp6_type = ICMPV6_DEST_UNREACH; + icmp6h->icmp6_code = code; + + skb_put_data(nskb, skb_network_header(oldskb), len); + nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); + + icmp6h->icmp6_cksum = + csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, + nskb->len - sizeof(struct ipv6hdr), + IPPROTO_ICMPV6, + csum_partial(icmp6h, + nskb->len - sizeof(struct ipv6hdr), + 0)); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); + const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook) @@ -170,7 +304,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING) { + if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -268,7 +402,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if (hooknum == NF_INET_PRE_ROUTING && nf_reject6_fill_skb_dst(skb_in)) + if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && + nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index bbff3e02e302..d6306aa46bb1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -126,6 +126,7 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), + SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; @@ -137,6 +138,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 1f5d4d196dcc..47a0dc46cbdb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -42,6 +42,8 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/tcp.h> +#include <linux/udp.h> #include <net/sock.h> #include <net/snmp.h> @@ -322,6 +324,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + u8 nexthdr; int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) @@ -351,6 +354,20 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } + /* RFC 8200, Section 4.5 Fragment Header: + * If the first fragment does not include all headers through an + * Upper-Layer header, then that fragment should be discarded and + * an ICMP Parameter Problem, Code 3, message should be sent to + * the source of the fragment, with the Pointer field set to zero. + */ + nexthdr = hdr->nexthdr; + if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); + return -1; + } + iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7e0ce7af8234..188e114b29b4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5558,6 +5558,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) goto nla_put_failure; + + if (dst->lwtstate && + lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + goto nla_put_failure; } else if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -6039,11 +6043,6 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, struct sk_buff *skb; int err = -ENOBUFS; - /* call_fib6_entry_notifiers will be removed when in-kernel notifier - * is implemented and supported for nexthop objects - */ - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 307f336b5353..488aec9e1a74 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 5fdf3ebb953f..233da43dd8d7 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5e2c34c0ac97..2da0ee703779 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1128,7 +1128,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1396,7 +1395,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; @@ -1426,7 +1425,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308..9b6cae1e49d9 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -136,7 +136,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct dst_entry *dst; __u8 rcv_wscale; u32 tsoff = 0; @@ -241,7 +241,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8db59f4e5f13..1a1510513739 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -527,15 +527,20 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); + tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : + np->tclass; + + if (!INET_ECN_is_capable(tclass) && + tcp_bpf_ca_needs_ecn((struct sock *)req)) + tclass |= INET_ECN_ECT_0; + rcu_read_lock(); opt = ireq->ipv6_opt; - tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos : np->tclass; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, - tclass & ~INET_ECN_MASK, - sk->sk_priority); + tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -823,9 +828,15 @@ static void tcp_v6_init_req(struct request_sock *req, } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v6_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } @@ -846,7 +857,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif - .init_req = tcp_v6_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif @@ -1193,6 +1203,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; + bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1368,7 +1379,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); @@ -1383,6 +1395,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * skb_set_owner_r(newnp->pktoptions, newsk); } } + } else { + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 29d9691359b9..9008f5796ad4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -276,7 +276,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, inet6_sdif(skb), udptable, skb); } -struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -285,7 +285,6 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, &iph->daddr, dport, inet6_iif(skb), inet6_sdif(skb), &udp_table, NULL); } -EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. @@ -560,7 +559,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); - if (!sk) { + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udpv6_encap_needed_key)) { @@ -637,6 +636,9 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rc == -ENOMEM) UDP6_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); + else + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_MEMERRORS, is_udplite); UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 584157a07759..c7bd7b1a04c1 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -28,10 +28,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int tnl_hlen; int err; - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); @@ -48,6 +44,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) return __udp_gso_segment(skb, features); + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot * do checksum of UDP packets sent as multiple IP fragments. */ @@ -111,12 +111,22 @@ out: return segs; } +static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + inet6_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -135,7 +145,10 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; rcu_read_lock(); - sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udpv6_encap_needed_key)) + sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 25b7ebda2fab..f696d46e6910 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -303,13 +303,13 @@ static const struct xfrm_type xfrm6_tunnel_type = { static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, - .priority = 2, + .priority = 3, }; static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, - .priority = 2, + .priority = 3, }; static int __net_init xfrm6_tunnel_net_init(struct net *net) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index d80572074667..db7d888914fa 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1434,7 +1434,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) break; } - if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) && + sk->sk_state == IUCV_CONNECTED) { if (iucv->transport == AF_IUCV_TRANS_IUCV) { txmsg.class = 0; txmsg.tag = 0; @@ -1644,7 +1645,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1850,7 +1851,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2..ad7730b68772 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -241,6 +241,7 @@ EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); * L3 master device * @net: network namespace for device index lookup * @fl: flow struct + * @arg: store the table the rule matched with here */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 3c03f6512c5f..213ea7abc9ab 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -418,14 +418,94 @@ int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) return used; } +/* Handle device status changes. */ +static int lapb_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct lapb_cb *lapb; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_X25) + return NOTIFY_DONE; + + lapb = lapb_devtostruct(dev); + if (!lapb) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); + + if (netif_carrier_ok(dev)) { + lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, + dev->name); + if (lapb->mode & LAPB_DCE) { + lapb_start_t1timer(lapb); + } else { + if (lapb->state == LAPB_STATE_0) { + lapb->state = LAPB_STATE_1; + lapb_establish_data_link(lapb); + } + } + } + break; + case NETDEV_GOING_DOWN: + if (netif_carrier_ok(dev)) + lapb_disconnect_request(dev); + break; + case NETDEV_DOWN: + lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); + lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb->n2count = 0; + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + break; + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) { + lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, + dev->name); + if (lapb->mode & LAPB_DCE) { + lapb_start_t1timer(lapb); + } else { + if (lapb->state == LAPB_STATE_0) { + lapb->state = LAPB_STATE_1; + lapb_establish_data_link(lapb); + } + } + } else { + lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); + lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb->n2count = 0; + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + } + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block lapb_dev_notifier = { + .notifier_call = lapb_device_event, +}; + static int __init lapb_init(void) { - return 0; + return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); + + unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 8f5b17001a07..baa247fe4ed0 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -85,11 +85,18 @@ static void lapb_t1timer_expiry(struct timer_list *t) switch (lapb->state) { /* - * If we are a DCE, keep going DM .. DM .. DM + * If we are a DCE, send DM up to N2 times, then switch to + * STATE_1 and send SABM(E). */ case LAPB_STATE_0: - if (lapb->mode & LAPB_DCE) + if (lapb->mode & LAPB_DCE && + lapb->n2count != lapb->n2) { + lapb->n2count++; lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); + } else { + lapb->state = LAPB_STATE_1; + lapb_establish_data_link(lapb); + } break; /* diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 1144cda2a0fc..912aa9bd5e29 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -909,6 +909,8 @@ static void llc_sk_init(struct sock *sk) * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? * * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7276e66ae435..454432ced0c9 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2708,16 +2708,6 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, return 0; } -static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev, - const u8 *addr) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN); - - return 0; -} - static void ieee80211_rfkill_poll(struct wiphy *wiphy) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -4138,7 +4128,6 @@ const struct cfg80211_ops mac80211_config_ops = { .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, - .set_wds_peer = ieee80211_set_wds_peer, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 8f48aff74c7b..b6c80a45b9f5 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -275,11 +275,11 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: width = vif->bss_conf.chandef.width; break; + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_MONITOR: @@ -743,7 +743,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, continue; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: break; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index fe8a7a87e513..9fc8ce214322 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -574,9 +574,6 @@ static ssize_t ieee80211_if_parse_tsf( IEEE80211_IF_FILE_RW(tsf); -/* WDS attributes */ -IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); - #ifdef CONFIG_MAC80211_MESH IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC); @@ -701,11 +698,6 @@ static void add_ibss_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD_MODE(tsf, 0600); } -static void add_wds_files(struct ieee80211_sub_if_data *sdata) -{ - DEBUGFS_ADD(peer); -} - #ifdef CONFIG_MAC80211_MESH static void add_mesh_files(struct ieee80211_sub_if_data *sdata) @@ -805,9 +797,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata) case NL80211_IFTYPE_AP_VLAN: add_vlan_files(sdata); break; - case NL80211_IFTYPE_WDS: - add_wds_files(sdata); - break; default: break; } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 829dcad69c2c..6a51b8b58f9e 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -274,7 +274,7 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf, "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", q_depth[0], q_depth[1], q_depth[2], q_depth[3], q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], - q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), + q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2a21226fb518..cde2e3f4fbcd 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -311,11 +311,6 @@ struct ieee80211_if_ap { bool multicast_to_unicast; }; -struct ieee80211_if_wds { - struct sta_info *sta; - u8 remote_addr[ETH_ALEN]; -}; - struct ieee80211_if_vlan { struct list_head list; /* write-protected with RTNL and local->mtx */ @@ -985,7 +980,6 @@ struct ieee80211_sub_if_data { union { struct ieee80211_if_ap ap; - struct ieee80211_if_wds wds; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; @@ -1795,7 +1789,7 @@ static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); -void ieee80211_tx_pending(unsigned long data); +void ieee80211_tx_pending(struct tasklet_struct *t); netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, @@ -2146,7 +2140,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); -void ieee80211_wake_txqs(unsigned long data); +void ieee80211_wake_txqs(struct tasklet_struct *t); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1be775979132..f5d4ceb72882 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -230,10 +230,6 @@ static inline int identical_mac_addr_allowed(int type1, int type2) type2 == NL80211_IFTYPE_MONITOR || type1 == NL80211_IFTYPE_P2P_DEVICE || type2 == NL80211_IFTYPE_P2P_DEVICE || - (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) || - (type1 == NL80211_IFTYPE_WDS && - (type2 == NL80211_IFTYPE_WDS || - type2 == NL80211_IFTYPE_AP)) || (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) || (type1 == NL80211_IFTYPE_AP_VLAN && (type2 == NL80211_IFTYPE_AP || @@ -417,15 +413,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * In WDS mode a station must exist here and be flushed, for - * AP_VLANs stations may exist since there's nothing else that + * For AP_VLANs stations may exist since there's nothing else that * would have removed them, but in other modes there shouldn't * be any stations. */ flushed = sta_info_flush(sdata); - WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || - (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1))); + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0); /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) @@ -552,8 +545,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * When we get here, the interface is marked down. * Free the remaining keys, if there are any * (which can happen in AP mode if userspace sets - * keys before the interface is operating, and maybe - * also in WDS mode) + * keys before the interface is operating) * * Force the key freeing to always synchronize_net() * to wait for the RX path in case it is using this @@ -1020,16 +1012,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct net_device *dev = wdev->netdev; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; u32 changed = 0; int res; u32 hw_reconf_flags = 0; switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) - return -ENOLINK; - break; case NL80211_IFTYPE_AP_VLAN: { struct ieee80211_sub_if_data *master; @@ -1078,6 +1065,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_WDS: /* cannot happen */ WARN_ON(1); break; @@ -1196,7 +1184,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_OCB: netif_carrier_off(dev); break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: break; @@ -1218,28 +1205,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) set_bit(SDATA_STATE_RUNNING, &sdata->state); switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - /* Create STA entry for the WDS peer */ - sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, - GFP_KERNEL); - if (!sta) { - res = -ENOMEM; - goto err_del_interface; - } - - sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - - res = sta_info_insert(sta); - if (res) { - /* STA has been freed */ - goto err_del_interface; - } - - rate_control_rate_init(sta); - netif_carrier_on(dev); - break; case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); break; @@ -1356,6 +1321,7 @@ static void ieee80211_iface_work(struct work_struct *work) while ((skb = skb_dequeue(&sdata->skb_queue))) { struct ieee80211_mgmt *mgmt = (void *)skb->data; + kcov_remote_start_common(skb_get_kcov_handle(skb)); if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { int len = skb->len; @@ -1465,6 +1431,7 @@ static void ieee80211_iface_work(struct work_struct *work) } kfree_skb(skb); + kcov_remote_stop(); } /* then other type-dependent work */ @@ -1574,9 +1541,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->u.mntr.flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; - case NL80211_IFTYPE_WDS: - sdata->vif.bss_conf.bssid = NULL; - break; case NL80211_IFTYPE_NAN: idr_init(&sdata->u.nan.function_inst_ids); spin_lock_init(&sdata->u.nan.func_lock); @@ -1587,6 +1551,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); break; @@ -1631,9 +1596,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_OCB: /* * Could probably support everything - * but WDS here (WDS do_open can fail - * under memory pressure, which this - * code isn't prepared to handle). + * but here. */ break; case NL80211_IFTYPE_P2P_CLIENT: @@ -1726,7 +1689,6 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, case NL80211_IFTYPE_MONITOR: /* doesn't matter */ break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: /* match up with an AP interface */ list_for_each_entry(sdata, &local->interfaces, list) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 523380aed92e..dee88ec566ad 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -220,9 +220,9 @@ u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_ERP_SLOT; } -static void ieee80211_tasklet_handler(unsigned long data) +static void ieee80211_tasklet_handler(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *) data; + struct ieee80211_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue)) || @@ -733,16 +733,12 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->pending[i]); atomic_set(&local->agg_queue_stop[i], 0); } - tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, - (unsigned long)local); + tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending); if (ops->wake_tx_queue) - tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs, - (unsigned long)local); + tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); - tasklet_init(&local->tasklet, - ieee80211_tasklet_handler, - (unsigned long) local); + tasklet_setup(&local->tasklet, ieee80211_tasklet_handler); skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); @@ -935,14 +931,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; } } else { - /* - * WDS is currently prohibited when channel contexts are used - * because there's no clear definition of which channel WDS - * type interfaces use - */ - if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)) - return -EINVAL; - /* DFS is not supported with multi-channel combinations yet */ for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *comb; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index ce5825d6f1d1..97095b7c9c64 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -667,6 +667,35 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) } } +static void +ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, + u8 *ie, u8 ie_len) +{ + struct ieee80211_supported_band *sband; + const u8 *cap; + const struct ieee80211_he_operation *he_oper = NULL; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + if (!ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT) || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return; + + sdata->vif.bss_conf.he_support = true; + + cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); + if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3])) + he_oper = (void *)(cap + 3); + + if (he_oper) + sdata->vif.bss_conf.he_oper.params = + __le32_to_cpu(he_oper->he_oper_params); +} + /** * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame * @hdr: 802.11 frame header @@ -943,6 +972,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) bcn->tail_len = skb->len; memcpy(bcn->tail, skb->data, bcn->tail_len); + ieee80211_mesh_update_bss_params(sdata, bcn->tail, bcn->tail_len); bcn->meshconf = (struct ieee80211_meshconf_ie *) (bcn->tail + ifmsh->meshconf_offset); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f400240a556f..6adfcb9c06dc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5464,6 +5464,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ; + bool is_5ghz = req->bss->channel->band == NL80211_BAND_5GHZ; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; @@ -5616,7 +5617,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, sizeof(struct ieee80211_vht_cap)); - else if (!is_6ghz) + else if (is_5ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | IEEE80211_STA_DISABLE_HE; rcu_read_unlock(); diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 38c45e1dafd8..ae378a41c927 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -150,21 +150,6 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) case NL80211_IFTYPE_STATION: ieee80211_mgd_quiesce(sdata); break; - case NL80211_IFTYPE_WDS: - /* tear down aggregation sessions and remove STAs */ - mutex_lock(&local->sta_mtx); - sta = sdata->u.wds.sta; - if (sta && sta->uploaded) { - enum ieee80211_sta_state state; - - state = sta->sta_state; - for (; state > IEEE80211_STA_NOTEXIST; state--) - WARN_ON(drv_sta_state(local, sta->sdata, - sta, state, - state - 1)); - } - mutex_unlock(&local->sta_mtx); - break; default: break; } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 86bc469a28bc..b13b1da19386 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -274,7 +274,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, success = !!(info->flags & IEEE80211_TX_STAT_ACK); for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { - if (ar[i].idx < 0) + if (ar[i].idx < 0 || !ar[i].count) break; ndx = rix_to_ndx(mi, ar[i].idx); @@ -287,12 +287,6 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->r[ndx].stats.success += success; } - if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) - mi->sample_packets++; - - if (mi->sample_deferred > 0) - mi->sample_deferred--; - if (time_after(jiffies, mi->last_stats_update + mp->update_interval / (mp->new_avg ? 2 : 1))) minstrel_update_stats(mp, mi); @@ -367,7 +361,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, return; delta = (mi->total_packets * sampling_ratio / 100) - - (mi->sample_packets + mi->sample_deferred / 2); + mi->sample_packets; /* delta < 0: no sampling required */ prev_sample = mi->prev_sample; @@ -376,7 +370,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, return; if (mi->total_packets >= 10000) { - mi->sample_deferred = 0; mi->sample_packets = 0; mi->total_packets = 0; } else if (delta > mi->n_rates * 2) { @@ -401,19 +394,8 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * rate sampling method should be used. * Respect such rates that are not sampled for 20 interations. */ - if (mrr_capable && - msr->perfect_tx_time > mr->perfect_tx_time && - msr->stats.sample_skipped < 20) { - /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark - * packets that have the sampling rate deferred to the - * second MRR stage. Increase the sample counter only - * if the deferred sample rate was actually used. - * Use the sample_deferred counter to make sure that - * the sampling is not done in large bursts */ - info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; - rate++; - mi->sample_deferred++; - } else { + if (msr->perfect_tx_time < mr->perfect_tx_time || + msr->stats.sample_skipped >= 20) { if (!msr->sample_limit) return; @@ -433,6 +415,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, rate->idx = mi->r[ndx].rix; rate->count = minstrel_get_retry_count(&mi->r[ndx], info); + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; } diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index dbb43bcd3c45..86cd80b3ffde 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -126,7 +126,6 @@ struct minstrel_sta_info { u8 max_prob_rate; unsigned int total_packets; unsigned int sample_packets; - int sample_deferred; unsigned int sample_row; unsigned int sample_column; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1e2e5a406d58..062c2b45584e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1477,7 +1477,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) if (unlikely((ieee80211_is_data(hdr->frame_control) || ieee80211_is_pspoll(hdr->frame_control)) && rx->sdata->vif.type != NL80211_IFTYPE_ADHOC && - rx->sdata->vif.type != NL80211_IFTYPE_WDS && rx->sdata->vif.type != NL80211_IFTYPE_OCB && (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) { /* @@ -4080,10 +4079,6 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; return true; - case NL80211_IFTYPE_WDS: - if (bssid || !ieee80211_is_data(hdr->frame_control)) - return false; - return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2); case NL80211_IFTYPE_P2P_DEVICE: return ieee80211_is_public_action(hdr, skb->len) || ieee80211_is_probe_req(hdr->frame_control) || @@ -4742,6 +4737,8 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rx_flags = 0; + kcov_remote_start_common(skb_get_kcov_handle(skb)); + /* * Frames with failed FCS/PLCP checksum are not returned, * all other frames are returned without radiotap header @@ -4749,15 +4746,15 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, * Also, frames with less than 16 bytes are dropped. */ skb = ieee80211_rx_monitor(local, skb, rate); - if (!skb) - return; - - ieee80211_tpt_led_trig_rx(local, - ((struct ieee80211_hdr *)skb->data)->frame_control, - skb->len); + if (skb) { + ieee80211_tpt_led_trig_rx(local, + ((struct ieee80211_hdr *)skb->data)->frame_control, + skb->len); - __ieee80211_rx_handle_packet(hw, pubsta, skb, list); + __ieee80211_rx_handle_packet(hw, pubsta, skb, list); + } + kcov_remote_stop(); return; drop: kfree_skb(skb); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index fb4f2b9b294f..ec6973ee88ef 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -258,6 +258,24 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { + /* + * If we had used sta_info_pre_move_state() then we might not + * have gone through the state transitions down again, so do + * it here now (and warn if it's inserted). + * + * This will clear state such as fast TX/RX that may have been + * allocated during state transitions. + */ + while (sta->sta_state > IEEE80211_STA_NONE) { + int ret; + + WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); + + ret = sta_info_move_state(sta, sta->sta_state - 1); + if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) + break; + } + if (sta->rate_ctrl) rate_control_free_sta(sta); @@ -687,7 +705,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) out_drop_sta: local->num_sta--; synchronize_net(); - __cleanup_single_sta(sta); + cleanup_single_sta(sta); out_err: mutex_unlock(&local->sta_mtx); kfree(sinfo); @@ -706,19 +724,13 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) err = sta_info_insert_check(sta); if (err) { + sta_info_free(local, sta); mutex_unlock(&local->sta_mtx); rcu_read_lock(); - goto out_free; + return err; } - err = sta_info_insert_finish(sta); - if (err) - goto out_free; - - return 0; - out_free: - sta_info_free(local, sta); - return err; + return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 00ae81e9e1a1..7afd07636b81 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -785,7 +785,7 @@ int sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); /** - * sta_info_flush - flush matching STA entries from the STA table + * __sta_info_flush - flush matching STA entries from the STA table * * Returns the number of removed STA entries. * @@ -794,6 +794,13 @@ void sta_info_stop(struct ieee80211_local *local); */ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans); +/** + * sta_info_flush - flush matching STA entries from the STA table + * + * Returns the number of removed STA entries. + * + * @sdata: sdata to remove all stations from + */ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) { return __sta_info_flush(sdata, false); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 6feb45135020..3485610755ef 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -49,7 +49,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | - IEEE80211_TX_CTL_AMPDU)) { + IEEE80211_TX_CTL_AMPDU | + IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } @@ -915,15 +916,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); - if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) { - /* - * The STA is in power save mode, so assume - * that this TX packet failed because of that. - */ - ieee80211_handle_filtered_frame(local, sta, skb); - return; - } - if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) @@ -1150,6 +1142,12 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + /* + * The STA is in power save mode, so assume + * that this TX packet failed because of that. + */ + if (skb) + ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8ba10a48ded4..01eb08527817 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -319,9 +319,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_OCB) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; @@ -1942,19 +1939,24 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* device xmit handlers */ +enum ieee80211_encrypt { + ENCRYPT_NO, + ENCRYPT_MGMT, + ENCRYPT_DATA, +}; + static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - int head_need, bool may_encrypt) + int head_need, + enum ieee80211_encrypt encrypt) { struct ieee80211_local *local = sdata->local; - struct ieee80211_hdr *hdr; bool enc_tailroom; int tail_need = 0; - hdr = (struct ieee80211_hdr *) skb->data; - enc_tailroom = may_encrypt && - (sdata->crypto_tx_tailroom_needed_cnt || - ieee80211_is_mgmt(hdr->frame_control)); + enc_tailroom = encrypt == ENCRYPT_MGMT || + (encrypt == ENCRYPT_DATA && + sdata->crypto_tx_tailroom_needed_cnt); if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; @@ -1985,23 +1987,29 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; int headroom; - bool may_encrypt; + enum ieee80211_encrypt encrypt; - may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT); + if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT) + encrypt = ENCRYPT_NO; + else if (ieee80211_is_mgmt(hdr->frame_control)) + encrypt = ENCRYPT_MGMT; + else + encrypt = ENCRYPT_DATA; headroom = local->tx_headroom; - if (may_encrypt) + if (encrypt != ENCRYPT_NO) headroom += sdata->encrypt_headroom; headroom -= skb_headroom(skb); headroom = max_t(int, 0, headroom); - if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) { + if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) { ieee80211_free_txskb(&local->hw, skb); return; } + /* reload after potential resize */ hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; @@ -2102,6 +2110,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, info->flags |= IEEE80211_TX_CTL_NO_ACK; if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; + if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER) + info->control.flags |= + IEEE80211_TX_CTRL_DONT_REORDER; break; case IEEE80211_RADIOTAP_RATE: @@ -2268,11 +2279,13 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, payload[7]); } - /* - * Initialize skb->priority for QoS frames. This is put in the TID field - * of the frame before passing it to the driver. + /* Initialize skb->priority for QoS frames. If the DONT_REORDER flag + * is set, stick to the default value for skb->priority to assure + * frames injected with this flag are not reordered relative to each + * other. */ - if (ieee80211_is_data_qos(hdr->frame_control)) { + if (ieee80211_is_data_qos(hdr->frame_control) && + !(info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { u8 *p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; } @@ -2284,8 +2297,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * we handle as though they are non-injected frames. * This code here isn't entirely correct, the local MAC address * isn't always enough to find the interface to use; for proper - * VLAN/WDS support we will need a different mechanism (which - * likely isn't going to be monitor interfaces). + * VLAN support we have an nl80211-based mechanism. * * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. @@ -2296,8 +2308,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (!ieee80211_sdata_running(tmp_sdata)) continue; if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR || - tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - tmp_sdata->vif.type == NL80211_IFTYPE_WDS) + tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) { sdata = tmp_sdata; @@ -2391,9 +2402,6 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, } sta = sta_info_get_bss(sdata, skb->data); break; - case NL80211_IFTYPE_WDS: - sta = sta_info_get(sdata, sdata->u.wds.remote_addr); - break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: /* determined much later */ @@ -2569,20 +2577,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, hdrlen = 24; band = chanctx_conf->def.chan->band; break; - case NL80211_IFTYPE_WDS: - fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); - /* RA TA DA SA */ - memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN); - memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); - memcpy(hdr.addr3, skb->data, ETH_ALEN); - memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); - hdrlen = 30; - /* - * This is the exception! WDS style interfaces are prohibited - * when channel contexts are in used so this must be valid - */ - band = local->hw.conf.chandef.chan->band; - break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: if (!is_multicast_ether_addr(skb->data)) { @@ -2828,7 +2822,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, head_need += sdata->encrypt_headroom; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); - if (ieee80211_skb_resize(sdata, skb, head_need, true)) { + if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) { ieee80211_free_txskb(&local->hw, skb); skb = NULL; return ERR_PTR(-ENOMEM); @@ -3502,7 +3496,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), - false))) { + ENCRYPT_NO))) { kfree_skb(skb); return true; } @@ -3619,13 +3613,14 @@ begin: tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); - if (txq->sta && !(info->flags & IEEE80211_TX_CTL_INJECTED)) { + if (txq->sta) { tx.sta = container_of(txq->sta, struct sta_info, sta); /* * Drop unicast frames to unauthorised stations unless they are - * EAPOL frames from the local station. + * injected frames or EAPOL frames from the local station. */ - if (unlikely(ieee80211_is_data(hdr->frame_control) && + if (unlikely(!(info->flags & IEEE80211_TX_CTL_INJECTED) && + ieee80211_is_data(hdr->frame_control) && !ieee80211_vif_is_mesh(&tx.sdata->vif) && tx.sdata->vif.type != NL80211_IFTYPE_OCB && !is_multicast_ether_addr(hdr->addr1) && @@ -4406,9 +4401,10 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, /* * Transmit all pending packets. Called from tasklet. */ -void ieee80211_tx_pending(unsigned long data) +void ieee80211_tx_pending(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_local *local = from_tasklet(local, t, + tx_pending_tasklet); unsigned long flags; int i; bool txok; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 49342060490f..8c3c01a1b923 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -386,9 +386,10 @@ _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) rcu_read_unlock(); } -void ieee80211_wake_txqs(unsigned long data) +void ieee80211_wake_txqs(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_local *local = from_tasklet(local, t, + wake_txqs_tasklet); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); @@ -2513,7 +2514,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) return res; } break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: @@ -2523,6 +2523,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_WDS: WARN_ON(1); break; } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 2fb99325135a..9ea6004abe1b 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -118,9 +118,11 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; - if (local->hw.queues < IEEE80211_NUM_ACS) + if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || + local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { @@ -141,6 +143,7 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct mac80211_qos_map *qos_map; bool qos; @@ -153,7 +156,7 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, else qos = false; - if (!qos) { + if (!qos || (info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } @@ -202,9 +205,6 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP: ra = skb->data; break; - case NL80211_IFTYPE_WDS: - ra = sdata->u.wds.remote_addr; - break; case NL80211_IFTYPE_STATION: /* might be a TDLS station */ sta = sta_info_get(sdata, skb->data); @@ -249,6 +249,14 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, p = ieee80211_get_qos_ctl(hdr); + /* don't overwrite the QoS field of injected frames */ + if (info->flags & IEEE80211_TX_CTL_INJECTED) { + /* do take into account Ack policy of injected frames */ + if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) + info->flags |= IEEE80211_TX_CTL_NO_ACK; + return; + } + /* set up the first byte */ /* diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 06ea0f8bfd5c..520cedc594e1 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -20,9 +20,9 @@ #include "ieee802154_i.h" #include "cfg.h" -static void ieee802154_tasklet_handler(unsigned long data) +static void ieee802154_tasklet_handler(struct tasklet_struct *t) { - struct ieee802154_local *local = (struct ieee802154_local *)data; + struct ieee802154_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue))) { @@ -91,9 +91,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) INIT_LIST_HEAD(&local->interfaces); mutex_init(&local->iflist_mtx); - tasklet_init(&local->tasklet, - ieee802154_tasklet_handler, - (unsigned long)local); + tasklet_setup(&local->tasklet, ieee802154_tasklet_handler); skb_queue_head_init(&local->skb_queue); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index f2868a8a50c3..47bab701555f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -377,6 +377,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (!pskb_may_pull(skb, sizeof(*hdr))) goto err; + skb_dst_drop(skb); + /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 54b888f94009..96ba616f59bf 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -18,6 +18,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; int mptcp_enabled; + unsigned int add_addr_timeout; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net) return mptcp_get_pernet(net)->mptcp_enabled; } +unsigned int mptcp_get_add_addr_timeout(struct net *net) +{ + return mptcp_get_pernet(net)->add_addr_timeout; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = { */ .proc_handler = proc_dointvec, }, + { + .procname = "add_addr_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, {} }; static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; } static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) } table[0].data = &pernet->mptcp_enabled; + table[1].data = &pernet->add_addr_timeout; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5f390a97f556..b70ae4ba3000 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_flags = flags; info->mptcpi_token = READ_ONCE(msk->token); info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..6b7b4b67f18c 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -242,7 +242,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->add_addr = 1; mp_opt->addr_id = *ptr++; - pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); + pr_debug("ADD_ADDR%s: id=%d, echo=%d", + (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", + mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -492,7 +494,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); + snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -528,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -573,17 +576,27 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, } #endif -static bool mptcp_established_options_add_addr(struct sock *sk, +static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + bool drop_other_suboptions = false; + unsigned int opt_size = *size; struct mptcp_addr_info saddr; bool echo; int len; + if (mptcp_pm_should_add_signal_ipv6(msk) && + skb && skb_is_tcp_pure_ack(skb)) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + remaining += opt_size; + drop_other_suboptions = true; + } + if (!mptcp_pm_should_add_signal(msk) || !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) return false; @@ -593,6 +606,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, return false; *size = len; + if (drop_other_suboptions) + *size -= opt_size; opts->addr_id = saddr.id; if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; @@ -678,7 +693,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; - if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) { + if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; @@ -759,6 +774,11 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto fully_established; } + if (mp_opt->add_addr) { + WRITE_ONCE(msk->fully_established, true); + return true; + } + /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. @@ -809,31 +829,39 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) return cur_ack; } -static void update_una(struct mptcp_sock *msk, - struct mptcp_options_received *mp_opt) +static void ack_update_msk(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_options_received *mp_opt) { - u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 write_seq = READ_ONCE(msk->write_seq); + u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); + struct sock *sk = (struct sock *)msk; + u64 old_snd_una; + + mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ + old_snd_una = msk->snd_una; new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, write_seq)) + if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; - while (after64(new_snd_una, old_snd_una)) { - snd_una = old_snd_una; - old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, - new_snd_una); - if (old_snd_una == snd_una) { - mptcp_data_acked((struct sock *)msk); - break; - } + new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; + + if (after64(new_wnd_end, msk->wnd_end)) { + msk->wnd_end = new_wnd_end; + __mptcp_wnd_updated(sk, ssk); } + + if (after64(new_snd_una, old_snd_una)) { + msk->snd_una = new_snd_una; + __mptcp_data_acked(sk); + } + mptcp_data_unlock(sk); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -886,8 +914,19 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; - if (__mptcp_check_fallback(msk)) + if (__mptcp_check_fallback(msk)) { + /* Keep it simple and unconditionally trigger send data cleanup and + * pending queue spooling. We will need to acquire the data lock + * for more accurate checks, and once the lock is acquired, such + * helpers are cheap. + */ + mptcp_data_lock(subflow->conn); + if (mptcp_send_head(subflow->conn)) + __mptcp_wnd_updated(subflow->conn, sk); + __mptcp_data_acked(subflow->conn); + mptcp_data_unlock(subflow->conn); return; + } mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) @@ -930,7 +969,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * monodirectional flows will stuck */ if (mp_opt.use_ack) - update_una(msk, &mp_opt); + ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -975,7 +1014,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } } -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +static void mptcp_set_rwin(const struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + const struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + + ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + + if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); +} + +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, + struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { @@ -1132,4 +1188,7 @@ mp_capable_done: TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } + + if (tp) + mptcp_set_rwin(tp); } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e19e1525ecbb..75c5040e8d5d 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,11 +16,17 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo) { + u8 add_addr = READ_ONCE(msk->pm.add_addr_signal); + pr_debug("msk=%p, local_id=%d", msk, addr->id); msk->pm.local = *addr; - WRITE_ONCE(msk->pm.add_addr_echo, echo); - WRITE_ONCE(msk->pm.add_addr_signal, true); + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + if (echo) + add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); + if (addr->family == AF_INET6) + add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); + WRITE_ONCE(msk->pm.add_addr_signal, add_addr); return 0; } @@ -89,8 +95,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + mptcp_schedule_work((struct sock *)msk); return true; } @@ -150,14 +155,24 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) + if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); - else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + mptcp_pm_add_addr_send_ack(msk); + } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) +{ + if (!mptcp_pm_should_add_signal_ipv6(msk)) + return; + + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) { struct mptcp_pm_data *pm = &msk->pm; @@ -183,13 +198,13 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; - *echo = READ_ONCE(msk->pm.add_addr_echo); + *echo = mptcp_pm_should_add_signal_echo(msk); if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo)) goto out_unlock; *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.add_addr_signal, 0); ret = true; out_unlock: @@ -233,11 +248,10 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.subflows = 0; msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.add_addr_signal, 0); WRITE_ONCE(msk->pm.rm_addr_signal, false); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.add_addr_echo, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 0d6f3d912891..03f2c28f11f5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -228,11 +228,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); @@ -264,6 +266,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); if (lookup_anno_list_by_saddr(msk, &entry->addr)) return false; @@ -279,7 +282,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); return true; } @@ -325,6 +329,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_add_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ @@ -395,6 +400,33 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) spin_lock_bh(&msk->pm.lock); mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_nl_add_addr_send_ack(msk); +} + +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!mptcp_pm_should_add_signal_ipv6(msk)) + return; + + __mptcp_flush_join_list(msk); + subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); + if (subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 add_addr; + + spin_unlock_bh(&msk->pm.lock); + pr_debug("send ack for add_addr6"); + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + spin_lock_bh(&msk->pm.lock); + + add_addr = READ_ONCE(msk->pm.add_addr_signal); + add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + WRITE_ONCE(msk->pm.add_addr_signal, add_addr); + } } void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) @@ -413,14 +445,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (msk->pm.rm_id != subflow->remote_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_accepted--; @@ -449,14 +480,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (rm_id != subflow->local_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.local_addr_used--; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 185dacb39781..57213ff60f78 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/xfrm.h> #include "protocol.h" #include "mib.h" @@ -41,6 +42,9 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; +static void __mptcp_destroy_sock(struct sock *sk); +static void __mptcp_check_send_data_fin(struct sock *sk); + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -53,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } +/* Returns end sequence number of the receiver's advertised window */ +static u64 mptcp_wnd_end(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->wnd_end); +} + static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -102,6 +112,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssock->sk); subflow->request_mptcp = 1; /* accept() will wait on first subflow sk_wq, and we always wakes up @@ -157,18 +168,19 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; - int space; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - space = tcp_space(sk); - max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq; + max_seq = READ_ONCE(msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); - if (after64(seq, max_seq)) { + if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); + pr_debug("oow by %lld, rcv_wnd_sent %llu\n", + (unsigned long long)end_seq - (unsigned long)max_seq, + (unsigned long long)msk->rcv_wnd_sent); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -274,6 +286,15 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_ext_reset(skb); skb_orphan(skb); + /* try to fetch required memory from subflow */ + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + if (ssk->sk_forward_alloc < skb->truesize) + goto drop; + __sk_mem_reclaim(ssk, skb->truesize); + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -301,6 +322,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); +drop: mptcp_drop(sk, skb); return false; } @@ -313,17 +335,35 @@ static void mptcp_stop_timer(struct sock *sk) mptcp_sk(sk)->timer_ival = 0; } -static void mptcp_check_data_fin_ack(struct sock *sk) +static void mptcp_close_wake_up(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +} + +static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (__mptcp_check_fallback(msk)) - return; + return !__mptcp_check_fallback(msk) && + ((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == READ_ONCE(msk->snd_una); +} + +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ - if (((1 << sk->sk_state) & - (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && - msk->write_seq == atomic64_read(&msk->snd_una)) { + if (mptcp_pending_data_fin_ack(sk)) { mptcp_stop_timer(sk); WRITE_ONCE(msk->snd_data_fin_enable, 0); @@ -331,20 +371,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk) switch (sk->sk_state) { case TCP_FIN_WAIT1: inet_sk_state_store(sk, TCP_FIN_WAIT2); - sk->sk_state_change(sk); break; case TCP_CLOSING: case TCP_LAST_ACK: inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_state_change(sk); break; } - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -378,13 +412,79 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_check_data_fin(struct sock *sk) +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + +static bool tcp_can_send_ack(const struct sock *ssk) +{ + return !((1 << inet_sk_state_load(ssk)) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE)); +} + +static void mptcp_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + release_sock(ssk); + } +} + +static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +{ + int ret; + + lock_sock(ssk); + ret = tcp_can_send_ack(ssk); + if (ret) + tcp_cleanup_rbuf(ssk, 1); + release_sock(ssk); + return ret; +} + +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +{ + struct sock *ack_hint = READ_ONCE(msk->ack_hint); + struct mptcp_subflow_context *subflow; + + /* if the hinted ssk is still active, try to use it */ + if (likely(ack_hint)) { + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) + return; + } + } + + /* otherwise pick the first active subflow */ + mptcp_for_each_subflow(msk, subflow) + if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) + return; +} + +static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; + bool ret = false; if (__mptcp_check_fallback(msk) || !msk->first) - return; + return ret; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -400,8 +500,6 @@ static void mptcp_check_data_fin(struct sock *sk) */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { - struct mptcp_subflow_context *subflow; - WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); @@ -418,7 +516,6 @@ static void mptcp_check_data_fin(struct sock *sk) break; case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); - // @@ Close subflows now? break; default: /* Other states not expected */ @@ -426,23 +523,12 @@ static void mptcp_check_data_fin(struct sock *sk) break; } + ret = true; mptcp_set_timeout(sk, NULL); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); - } - - sk->sk_state_change(sk); - - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_send_ack(msk); + mptcp_close_wake_up(sk); } + return ret; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, @@ -454,12 +540,22 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - u32 old_copied_seq; bool done = false; + int sk_rbuf; + + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + + if (unlikely(ssk_rbuf > sk_rbuf)) { + WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); + sk_rbuf = ssk_rbuf; + } + } pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); - old_copied_seq = tp->copied_seq; do { u32 map_remaining, offset; u32 seq = tp->copied_seq; @@ -518,20 +614,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) { + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); + WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; - if (tp->copied_seq != old_copied_seq) - tcp_cleanup_rbuf(ssk, 1); - return done; } -static bool mptcp_ofo_queue(struct mptcp_sock *msk) +static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; @@ -577,41 +671,34 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (READ_ONCE(sk->sk_lock.owned)) - return false; - - if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock))) - return false; - - /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) { - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_ofo_queue(msk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); - } + mptcp_data_lock(sk); - spin_unlock_bh(&sk->sk_lock.slock); + __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + __mptcp_ofo_queue(msk); - return moved > 0; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); + mptcp_data_unlock(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + int sk_rbuf, ssk_rbuf; bool wake; /* move_skbs_to_msk below can legitly clear the data_avail flag, @@ -622,30 +709,23 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (wake) set_bit(MPTCP_DATA_READY, &msk->flags); - if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && - move_skbs_to_msk(msk, ssk)) - goto wake; + ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + if (unlikely(ssk_rbuf > sk_rbuf)) + sk_rbuf = ssk_rbuf; - /* don't schedule if mptcp sk is (still) over limit */ - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) + /* over limit? can't append more skbs to msk */ + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - /* mptcp socket is owned, release_cb should retry */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, - &sk->sk_tsq_flags)) { - sock_hold(sk); + move_skbs_to_msk(msk, ssk); - /* need to try again, its possible release_cb() has already - * been called after the test_and_set_bit() above. - */ - move_skbs_to_msk(msk, ssk); - } wake: if (wake) sk->sk_data_ready(sk); } -static void __mptcp_flush_join_list(struct mptcp_sock *msk) +void __mptcp_flush_join_list(struct mptcp_sock *msk) { if (likely(list_empty(&msk->join_list))) return; @@ -665,6 +745,10 @@ static void mptcp_reset_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; + /* prevent rescheduling on close */ + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + return; + /* should never be called with mptcp level timer cleared */ tout = READ_ONCE(mptcp_sk(sk)->timer_ival); if (WARN_ON_ONCE(!tout)) @@ -672,23 +756,23 @@ static void mptcp_reset_timer(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } -void mptcp_data_acked(struct sock *sk) +bool mptcp_schedule_work(struct sock *sk) { - mptcp_reset_timer(sk); - - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && - schedule_work(&mptcp_sk(sk)->work)) + if (inet_sk_state_load(sk) != TCP_CLOSE && + schedule_work(&mptcp_sk(sk)->work)) { + /* each subflow already holds a reference to the sk, and the + * workqueue is invoked by a subflow, so sk can't go away here. + */ sock_hold(sk); + return true; + } + return false; } void mptcp_subflow_eof(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && - schedule_work(&msk->work)) - sock_hold(sk); + if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); } static void mptcp_check_for_eof(struct mptcp_sock *msk) @@ -699,8 +783,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) receivers += !subflow->rx_eof; + if (receivers) + return; - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ @@ -710,16 +796,21 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } -} - -static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) -{ - const struct sock *sk = (const struct sock *)msk; - - if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); - return !!msk->cached_ext; + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + break; + default: + return; + } + mptcp_close_wake_up(sk); } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) @@ -744,8 +835,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, if (!tcp_skb_can_collapse_to(skb)) return false; - /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == write_seq; + /* can collapse only if MPTCP level sequence is in order and this + * mapping has not been xmitted yet + */ + return mpext && mpext->data_seq + mpext->data_len == write_seq && + !mpext->frozen; } static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, @@ -753,9 +847,125 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && + pfrag->size - pfrag->offset > 0 && df->data_seq + df->data_len == msk->write_seq; } +static int mptcp_wmem_with_overhead(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int ret, skbs; + + ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); + skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; + if (skbs < msk->skb_tx_cache.qlen) + return ret; + + return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); +} + +static void __mptcp_wmem_reserve(struct sock *sk, int size) +{ + int amount = mptcp_wmem_with_overhead(sk, size); + struct mptcp_sock *msk = mptcp_sk(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + if (amount <= sk->sk_forward_alloc) + goto reserve; + + /* under memory pressure try to reserve at most a single page + * otherwise try to reserve the full estimate and fallback + * to a single page before entering the error path + */ + if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || + !sk_wmem_schedule(sk, amount)) { + if (amount <= PAGE_SIZE) + goto nomem; + + amount = PAGE_SIZE; + if (!sk_wmem_schedule(sk, amount)) + goto nomem; + } + +reserve: + msk->wmem_reserved = amount; + sk->sk_forward_alloc -= amount; + return; + +nomem: + /* we will wait for memory on next allocation */ + msk->wmem_reserved = -1; +} + +static void __mptcp_update_wmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->wmem_reserved) + return; + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + if (msk->wmem_reserved > 0) { + sk->sk_forward_alloc += msk->wmem_reserved; + msk->wmem_reserved = 0; + } +} + +static bool mptcp_wmem_alloc(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* check for pre-existing error condition */ + if (msk->wmem_reserved < 0) + return false; + + if (msk->wmem_reserved >= size) + goto account; + + mptcp_data_lock(sk); + if (!sk_wmem_schedule(sk, size)) { + mptcp_data_unlock(sk); + return false; + } + + sk->sk_forward_alloc -= size; + msk->wmem_reserved += size; + mptcp_data_unlock(sk); + +account: + msk->wmem_reserved -= size; + return true; +} + +static void mptcp_wmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + msk->wmem_reserved += size; +} + +static void mptcp_mem_reclaim_partial(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* if we are experiencing a transint allocation error, + * the forward allocation memory has been already + * released + */ + if (msk->wmem_reserved < 0) + return; + + mptcp_data_lock(sk); + sk->sk_forward_alloc += msk->wmem_reserved; + sk_mem_reclaim_partial(sk); + msk->wmem_reserved = sk->sk_forward_alloc; + sk->sk_forward_alloc = 0; + mptcp_data_unlock(sk); +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -771,21 +981,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static bool mptcp_is_writeable(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - if (!sk_stream_is_writeable((struct sock *)msk)) - return false; - - mptcp_for_each_subflow(msk, subflow) { - if (sk_stream_is_writeable(subflow->tcp_sock)) - return true; - } - return false; -} - -static void mptcp_clean_una(struct sock *sk) +static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; @@ -796,13 +992,15 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->write_seq); - snd_una = atomic64_read(&msk->snd_una); + msk->snd_una = READ_ONCE(msk->snd_nxt); + snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; + if (WARN_ON_ONCE(dfrag == msk->first_pending)) + break; dfrag_clear(sk, dfrag); cleaned = true; } @@ -811,12 +1009,13 @@ static void mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->data_len)) + if (WARN_ON_ONCE(delta > dfrag->already_sent)) goto out; dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; + dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); cleaned = true; @@ -824,19 +1023,42 @@ static void mptcp_clean_una(struct sock *sk) out: if (cleaned) { - sk_mem_reclaim_partial(sk); - - /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); - smp_mb__after_atomic(); + if (tcp_under_memory_pressure(sk)) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ - sk_stream_write_space(sk); + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags)) + sk_stream_write_space(sk); } } + + if (snd_una == READ_ONCE(msk->snd_nxt)) { + if (msk->timer_ival) + mptcp_stop_timer(sk); + } else { + mptcp_reset_timer(sk); + } +} + +static void mptcp_enter_memory_pressure(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + bool first = true; + + sk_stream_moderate_sndbuf(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (first) + tcp_enter_memory_pressure(ssk); + sk_stream_moderate_sndbuf(ssk); + first = false; + } } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of @@ -848,8 +1070,7 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag, sk->sk_allocation))) return true; - sk->sk_prot->enter_memory_pressure(sk); - sk_stream_moderate_sndbuf(sk); + mptcp_enter_memory_pressure(sk); return false; } @@ -865,149 +1086,237 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } +struct mptcp_sendmsg_info { + int mss_now; + int size_goal; + u16 limit; + u16 sent; + unsigned int flags; +}; + +static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, + int avail_size) +{ + u64 window_end = mptcp_wnd_end(msk); + + if (__mptcp_check_fallback(msk)) + return avail_size; + + if (!before64(data_seq + avail_size, window_end)) { + u64 allowed_size = window_end - data_seq; + + return min_t(unsigned int, allowed_size, avail_size); + } + + return avail_size; +} + +static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) +{ + struct skb_ext *mpext = __skb_ext_alloc(gfp); + + if (!mpext) + return false; + __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); + return true; +} + +static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) +{ + struct sk_buff *skb; + + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); + if (likely(skb)) { + if (likely(__mptcp_add_ext(skb, gfp))) { + skb_reserve(skb, MAX_TCP_HEADER); + skb->reserved_tailroom = skb->end - skb->tail; + return skb; + } + __kfree_skb(skb); + } else { + mptcp_enter_memory_pressure(sk); + } + return NULL; +} + +static bool mptcp_tx_cache_refill(struct sock *sk, int size, + struct sk_buff_head *skbs, int *total_ts) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + int space_needed; + + if (unlikely(tcp_under_memory_pressure(sk))) { + mptcp_mem_reclaim_partial(sk); + + /* under pressure pre-allocate at most a single skb */ + if (msk->skb_tx_cache.qlen) + return true; + space_needed = msk->size_goal_cache; + } else { + space_needed = msk->tx_pending_data + size - + msk->skb_tx_cache.qlen * msk->size_goal_cache; + } + + while (space_needed > 0) { + skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); + if (unlikely(!skb)) { + /* under memory pressure, try to pass the caller a + * single skb to allow forward progress + */ + while (skbs->qlen > 1) { + skb = __skb_dequeue_tail(skbs); + __kfree_skb(skb); + } + return skbs->qlen > 0; + } + + *total_ts += skb->truesize; + __skb_queue_tail(skbs, skb); + space_needed -= msk->size_goal_cache; + } + return true; +} + +static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + + if (ssk->sk_tx_skb_cache) { + skb = ssk->sk_tx_skb_cache; + if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && + !__mptcp_add_ext(skb, gfp))) + return false; + return true; + } + + skb = skb_peek(&msk->skb_tx_cache); + if (skb) { + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + skb = __skb_dequeue(&msk->skb_tx_cache); + if (WARN_ON_ONCE(!skb)) + return false; + + mptcp_wmem_uncharge(sk, skb->truesize); + ssk->sk_tx_skb_cache = skb; + return true; + } + + /* over memory limit, no point to try to allocate a new skb */ + return false; + } + + skb = __mptcp_do_alloc_tx_skb(sk, gfp); + if (!skb) + return false; + + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + ssk->sk_tx_skb_cache = skb; + return true; + } + kfree_skb(skb); + return false; +} + +static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +{ + return !ssk->sk_tx_skb_cache && + !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && + tcp_under_memory_pressure(sk); +} + +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +{ + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) + mptcp_mem_reclaim_partial(sk); + return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, struct mptcp_data_frag *dfrag, - long *timeo, int *pmss_now, - int *ps_goal) + struct mptcp_data_frag *dfrag, + struct mptcp_sendmsg_info *info) { - int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; - bool dfrag_collapsed, can_collapse = false; + u64 data_seq = dfrag->data_seq + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); + bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; - bool retransmission = !!dfrag; struct sk_buff *skb, *tail; - struct page_frag *pfrag; - struct page *page; - u64 *write_seq; - size_t psize; - - /* use the mptcp page cache so that we can easily move the data - * from one substream to another, but do per subflow memory accounting - * Note: pfrag is used only !retransmission, but the compiler if - * fooled into a warning if we don't init here - */ - pfrag = sk_page_frag(sk); - if (!retransmission) { - write_seq = &msk->write_seq; - page = pfrag->page; - } else { - write_seq = &dfrag->data_seq; - page = dfrag->page; - } + bool can_collapse = false; + int avail_size; + size_t ret = 0; + + pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); - /* compute copy limit */ - mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - *pmss_now = mss_now; - *ps_goal = size_goal; - avail_size = size_goal; + /* compute send limit */ + info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); + avail_size = info->size_goal; + msk->size_goal_cache = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { - mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ - can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(*write_seq, skb, mpext); + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + can_collapse = (info->size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(data_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else - avail_size = size_goal - skb->len; + avail_size = info->size_goal - skb->len; } - if (!retransmission) { - /* reuse tail pfrag, if possible, or carve a new one from the - * page allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); - offset = dfrag->offset; - frag_truesize = dfrag->overhead; - } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), - psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; + /* Zero window and all data acked? Probe. */ + avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); + if (avail_size == 0) { + u64 snd_una = READ_ONCE(msk->snd_una); - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { - iov_iter_revert(&msg->msg_iter, psize); - return -ENOMEM; - } - } else { - offset = dfrag->offset; - psize = min_t(size_t, dfrag->data_len, avail_size); + if (skb || snd_una != msk->snd_nxt) + return 0; + zero_window_probe = true; + data_seq = snd_una - 1; + avail_size = 1; } - /* tell the TCP stack to delay the push so that we can safely - * access the skb after the sendpages call - */ - ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) { - if (!retransmission) - iov_iter_revert(&msg->msg_iter, psize); - return ret; - } - - frag_truesize += ret; - if (!retransmission) { - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); - - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - sk_wmem_queued_add(sk, frag_truesize); - } else { - sk_wmem_queued_add(sk, ret); - } + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; + ret = info->limit - info->sent; + tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, + dfrag->offset + info->sent, &ret); + if (!tail) { + tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + return -ENOMEM; } - /* if the tail skb extension is still the cached one, collapsing - * really happened. Note: we can't check for 'same skb' as the sk_buff - * hdr on tail can be transmitted, freed and re-allocated by the - * do_tcp_sendpages() call + /* if the tail skb is still the cached one, collapsing really happened. */ - tail = tcp_write_queue_tail(ssk); - if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + if (skb == tail) { WARN_ON_ONCE(!can_collapse); mpext->data_len += ret; + WARN_ON_ONCE(zero_window_probe); goto out; } - skb = tcp_write_queue_tail(ssk); - mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); - msk->cached_ext = NULL; + mpext = skb_ext_find(tail, SKB_EXT_MPTCP); + if (WARN_ON_ONCE(!mpext)) { + /* should never reach here, stream corrupted */ + return -EINVAL; + } memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = *write_seq; + mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -1017,44 +1326,17 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); + if (zero_window_probe) { + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mpext->frozen = 1; + ret = 0; + tcp_push_pending_frames(ssk); + } out: - if (!retransmission) - pfrag->offset += frag_truesize; - WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; } -static void mptcp_nospace(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct socket *sock = READ_ONCE(ssk->sk_socket); - - /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); - } -} - -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ @@ -1079,9 +1361,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, sock_owned_by_me((struct sock *)msk); *sndbuf = 0; - if (!mptcp_ext_cache_refill(msk)) - return NULL; - if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; @@ -1144,27 +1423,160 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) +static void mptcp_push_release(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) +{ + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); + release_sock(ssk); +} + +static void mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + u32 sndbuf; + + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + prev_ssk = ssk; + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + + /* try to keep the subflow socket lock across + * consecutive xmit on the same socket + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(sk, prev_ssk, &info); + if (!ssk) + goto out; + + if (ssk != prev_ssk || !prev_ssk) + lock_sock(ssk); + + /* keep it simple and always provide a new skb for the + * subflow, even if we will not use it when collapsing + * on the pending one + */ + if (!mptcp_alloc_tx_skb(sk, ssk)) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + + /* at this point we held the socket lock for the last subflow we used */ + if (ssk) + mptcp_push_release(sk, ssk, &info); + +out: + if (copied) { + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + __mptcp_check_send_data_fin(sk); + } +} + +static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + + info.flags = 0; + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } + if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) + goto out; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + goto out; + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + +out: + /* __mptcp_alloc_tx_skb could have released some wmem and we are + * not going to flush it via release_sock() + */ + __mptcp_update_wmem(sk); + if (copied) { + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + if (msk->snd_data_fin_enable && + msk->snd_nxt + 1 == msk->write_seq) + mptcp_schedule_work(sk); + } } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; - struct sock *ssk; - u32 sndbuf; - bool tx_ok; + int ret = 0; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1175,130 +1587,97 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); -restart: - mptcp_clean_una(sk); - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } + while (msg_data_left(msg)) { + int total_ts, frag_truesize = 0; + struct mptcp_data_frag *dfrag; + struct sk_buff_head skbs; + bool dfrag_collapsed; + size_t psize, offset; - __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - while (!sk_stream_memory_free(sk) || - !ssk || - !mptcp_page_frag_refill(ssk, pfrag)) { - if (ssk) { - /* make sure retransmit timer is - * running before we wait for memory. - * - * The retransmit timer might be needed - * to make the peer send an up-to-date - * MPTCP Ack. - */ - mptcp_set_timeout(sk, ssk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); - } - - mptcp_nospace(msk); - ret = sk_stream_wait_memory(sk, &timeo); - if (ret) - goto out; - - mptcp_clean_una(sk); - - ssk = mptcp_subflow_get_send(msk, &sndbuf); - if (list_empty(&msk->conn_list)) { - ret = -ENOTCONN; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; goto out; } - } - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_pending_tail(sk); + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + if (!sk_stream_memory_free(sk)) + goto wait_for_memory; - pr_debug("conn_list->subflow=%p", ssk); + if (!mptcp_page_frag_refill(sk, pfrag)) + goto wait_for_memory; - lock_sock(ssk); - tx_ok = msg_data_left(msg); - while (tx_ok) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, - &size_goal); - if (ret < 0) { - if (ret == -EAGAIN && timeo > 0) { - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } - break; + dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); + frag_truesize = dfrag->overhead; } - /* burst can be negative, we will try move to the next subflow - * at selection time, if possible. + /* we do not bound vs wspace, to allow a single packet. + * memory accounting will prevent execessive memory usage + * anyway */ - msk->snd_burst -= ret; - copied += ret; - - tx_ok = msg_data_left(msg); - if (!tx_ok) - break; + offset = dfrag->offset + dfrag->data_len; + psize = pfrag->size - offset; + psize = min_t(size_t, psize, msg_data_left(msg)); + total_ts = psize + frag_truesize; + __skb_queue_head_init(&skbs); + if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) + goto wait_for_memory; + + if (!mptcp_wmem_alloc(sk, total_ts)) { + __skb_queue_purge(&skbs); + goto wait_for_memory; + } - if (!sk_stream_memory_free(ssk) || - !mptcp_page_frag_refill(ssk, pfrag) || - !mptcp_ext_cache_refill(msk)) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; + skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); + if (copy_page_from_iter(dfrag->page, offset, psize, + &msg->msg_iter) != psize) { + mptcp_wmem_uncharge(sk, psize + frag_truesize); + ret = -EFAULT; + goto out; } - /* memory is charged to mptcp level socket as well, i.e. - * if msg is very large, mptcp socket may run out of buffer - * space. mptcp_clean_una() will release data that has - * been acked at mptcp level in the mean time, so there is - * a good chance we can continue sending data right away. - * - * Normally, when the tcp subflow can accept more data, then - * so can the MPTCP socket. However, we need to cope with - * peers that might lag behind in their MPTCP-level - * acknowledgements, i.e. data might have been acked at - * tcp level only. So, we must also check the MPTCP socket - * limits before we send more data. + /* data successfully copied into the write queue */ + copied += psize; + dfrag->data_len += psize; + frag_truesize += psize; + pfrag->offset += frag_truesize; + WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + + /* charge data on mptcp pending queue to the msk socket + * Note: we charge such data both to sk and ssk */ - if (unlikely(!sk_stream_memory_free(sk))) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_clean_una(sk); - if (!sk_stream_memory_free(sk)) { - /* can't send more for now, need to wait for - * MPTCP-level ACKs from peer. - * - * Wakeup will happen via mptcp_clean_una(). - */ - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } + sk_wmem_queued_add(sk, frag_truesize); + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + if (!msk->first_pending) + WRITE_ONCE(msk->first_pending, dfrag); } + pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + dfrag->data_seq, dfrag->data_len, dfrag->already_sent, + !dfrag_collapsed); + + continue; + +wait_for_memory: + set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_push_pending(sk, msg->msg_flags); + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; } - mptcp_set_timeout(sk, ssk); if (copied) { - tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); - - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + msk->tx_pending_data += copied; + mptcp_push_pending(sk, msg->msg_flags); } - release_sock(ssk); out: - ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1322,11 +1701,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len) { - struct sock *sk = (struct sock *)msk; struct sk_buff *skb; int copied = 0; - while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + while ((skb = skb_peek(&msk->receive_queue)) != NULL) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1346,7 +1724,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } - __skb_unlink(skb, &sk->sk_receive_queue); + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); if (copied >= len) @@ -1454,32 +1835,68 @@ new_measure: msk->rcvq_space.time = mstamp; } -static bool __mptcp_move_skbs(struct mptcp_sock *msk) +static void __mptcp_update_rmem(struct sock *sk) { - unsigned int moved = 0; - bool done; + struct mptcp_sock *msk = mptcp_sk(sk); - /* avoid looping forever below on racing close */ - if (((struct sock *)msk)->sk_state == TCP_CLOSE) - return false; + if (!msk->rmem_released) + return; + + atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, msk->rmem_released); + msk->rmem_released = 0; +} + +static void __mptcp_splice_receive_queue(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); +} + +static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) +{ + struct sock *sk = (struct sock *)msk; + unsigned int moved = 0; + bool ret, done; __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); + bool slowpath; - if (!ssk) + /* we can have data pending in the subflows only if the msk + * receive buffer was full at subflow_data_ready() time, + * that is an unlikely slow path. + */ + if (likely(!ssk)) break; - lock_sock(ssk); + slowpath = lock_sock_fast(ssk); + mptcp_data_lock(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - release_sock(ssk); + mptcp_data_unlock(sk); + if (moved && rcv) { + WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); + tcp_cleanup_rbuf(ssk, 1); + WRITE_ONCE(msk->rmem_pending, 0); + } + unlock_sock_fast(ssk, slowpath); } while (!done); - if (mptcp_ofo_queue(msk) || moved > 0) { - mptcp_check_data_fin((struct sock *)msk); - return true; + /* acquire the data lock only if some input data is pending */ + ret = moved > 0; + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || + !skb_queue_empty_lockless(&sk->sk_receive_queue)) { + mptcp_data_lock(sk); + __mptcp_update_rmem(sk); + ret |= __mptcp_ofo_queue(msk); + __mptcp_splice_receive_queue(sk); + mptcp_data_unlock(sk); } - return false; + if (ret) + mptcp_check_data_fin((struct sock *)msk); + return !skb_queue_empty(&msk->receive_queue); } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1493,15 +1910,19 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); + if (unlikely(sk->sk_state == TCP_LISTEN)) { + copied = -ENOTCONN; + goto out_err; + } + timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - __mptcp_flush_join_list(msk); - while (len > (size_t)copied) { - int bytes_read; + while (copied < len) { + int bytes_read, old_space; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@ -1512,10 +1933,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && - __mptcp_move_skbs(msk)) + if (skb_queue_empty(&msk->receive_queue) && + __mptcp_move_skbs(msk, len - copied)) continue; + /* be sure to advertise window change */ + old_space = READ_ONCE(msk->old_wspace); + if ((tcp_space(sk) - old_space) >= old_space) + mptcp_cleanup_rbuf(msk); + /* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ @@ -1538,8 +1964,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* race breaker: the shutdown could be after the + * previous receive queue check + */ + if (__mptcp_move_skbs(msk, len - copied)) + continue; break; + } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -1561,14 +1993,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, mptcp_wait_data(sk, &timeo); } - if (skb_queue_empty(&sk->sk_receive_queue)) { + if (skb_queue_empty_lockless(&sk->sk_receive_queue) && + skb_queue_empty(&msk->receive_queue)) { /* entire backlog drained, clear DATA_READY. */ clear_bit(MPTCP_DATA_READY, &msk->flags); /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. */ - if (unlikely(__mptcp_move_skbs(msk))) + if (unlikely(__mptcp_move_skbs(msk, 0))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@ -1577,7 +2010,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty(&sk->sk_receive_queue), copied); + skb_queue_empty_lockless(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -1588,13 +2021,8 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { - mptcp_stop_timer(sk); - } else { - set_bit(MPTCP_WORK_RTX, &msk->flags); - if (schedule_work(&msk->work)) - sock_hold(sk); - } + set_bit(MPTCP_WORK_RTX, &msk->flags); + mptcp_schedule_work(sk); } static void mptcp_retransmit_timer(struct timer_list *t) @@ -1616,6 +2044,14 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static void mptcp_timeout_timer(struct timer_list *t) +{ + struct sock *sk = from_timer(sk, t, sk_timer); + + mptcp_schedule_work(sk); + sock_put(sk); +} + /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * @@ -1629,7 +2065,7 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); if (__mptcp_check_fallback(msk)) - return msk->first; + return NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1638,8 +2074,11 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) + if (!tcp_write_queue_empty(ssk)) { + if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) + continue; return NULL; + } if (subflow->backup) { if (!backup) @@ -1662,20 +2101,43 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) * parent socket. */ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout) + struct mptcp_subflow_context *subflow) { - struct socket *sock = READ_ONCE(ssk->sk_socket); + bool dispose_socket = false; + struct socket *sock; list_del(&subflow->node); - if (sock && sock != sk->sk_socket) { - /* outgoing subflow */ - sock_release(sock); + lock_sock(ssk); + + /* if we are invoked by the msk cleanup code, the subflow is + * already orphaned + */ + sock = ssk->sk_socket; + if (sock) { + dispose_socket = sock != sk->sk_socket; + sock_orphan(ssk); + } + + /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops + * the ssk has been already destroyed, we just need to release the + * reference owned by msk; + */ + if (!inet_csk(ssk)->icsk_ulp_ops) { + kfree_rcu(subflow, rcu); } else { - /* incoming subflow */ - tcp_close(ssk, timeout); + /* otherwise ask tcp do dispose of ssk and subflow ctx */ + subflow->disposable = 1; + __tcp_close(ssk, 0); + + /* close acquired an extra ref */ + __sock_put(ssk); } + release_sock(ssk); + if (dispose_socket) + iput(SOCK_INODE(sock)); + + sock_put(ssk); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -1694,6 +2156,10 @@ static void pm_work(struct mptcp_sock *msk) pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_nl_add_addr_send_ack(msk); + } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); @@ -1720,40 +2186,69 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) if (inet_sk_state_load(ssk) != TCP_CLOSE) continue; - __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + __mptcp_close_ssk((struct sock *)msk, ssk, subflow); + } +} + +static bool mptcp_check_close_timeout(const struct sock *sk) +{ + s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; + struct mptcp_subflow_context *subflow; + + if (delta >= TCP_TIMEWAIT_LEN) + return true; + + /* if all subflows are in closed status don't bother with additional + * timeout + */ + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != + TCP_CLOSE) + return false; } + return true; } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, mss_now = 0, size_goal = 0; + struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - u64 orig_write_seq; size_t copied = 0; - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - long timeo = 0; + int state, ret; lock_sock(sk); - mptcp_clean_una(sk); + state = sk->sk_state; + if (unlikely(state == TCP_CLOSE)) + goto unlock; + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); - __mptcp_move_skbs(msk); - if (msk->pm.status) pm_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); + /* if the msk data is completely acked, or the socket timedout, + * there is no point in keeping around an orphaned sk + */ + if (sock_flag(sk, SOCK_DEAD) && + (mptcp_check_close_timeout(sk) || + (state != sk->sk_state && + ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { + inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_destroy_sock(sk); + goto unlock; + } + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1761,39 +2256,30 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; - if (!mptcp_ext_cache_refill(msk)) - goto reset_unlock; - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; lock_sock(ssk); - orig_len = dfrag->data_len; - orig_offset = dfrag->offset; - orig_write_seq = dfrag->data_seq; - while (dfrag->data_len > 0) { - int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, - &mss_now, &size_goal); - if (ret < 0) + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = dfrag->already_sent; + while (info.sent < dfrag->already_sent) { + if (!mptcp_alloc_tx_skb(sk, ssk)) + break; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; - dfrag->data_len -= ret; - dfrag->offset += ret; - - if (!mptcp_ext_cache_refill(msk)) - break; + info.sent += ret; } if (copied) - tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); - - dfrag->data_seq = orig_write_seq; - dfrag->offset = orig_offset; - dfrag->data_len = orig_len; + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -1816,10 +2302,17 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); - __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); + __skb_queue_head_init(&msk->receive_queue); + __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; + msk->first_pending = NULL; + msk->wmem_reserved = 0; + msk->rmem_released = 0; + msk->tx_pending_data = 0; + msk->size_goal_cache = TCP_BASE_MSS; + msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -1827,7 +2320,7 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - + timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); return 0; } @@ -1861,11 +2354,15 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; + struct sk_buff *skb; - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); - + WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); + while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { + sk->sk_forward_alloc += skb->truesize; + kfree_skb(skb); + } } static void mptcp_cancel_work(struct sock *sk) @@ -1873,7 +2370,7 @@ static void mptcp_cancel_work(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) - sock_put(sk); + __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) @@ -1931,42 +2428,67 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void mptcp_close(struct sock *sk, long timeout) +static void __mptcp_check_send_data_fin(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), + msk->snd_nxt, msk->write_seq); - if (sk->sk_state == TCP_LISTEN) { - inet_sk_state_store(sk, TCP_CLOSE); - goto cleanup; - } else if (sk->sk_state == TCP_CLOSE) { - goto cleanup; - } + /* we still need to enqueue subflows or not really shutting down, + * skip this + */ + if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || + mptcp_send_head(sk)) + return; + + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + /* fallback socket will not get data_fin/ack, can move to the next + * state now + */ if (__mptcp_check_fallback(msk)) { - goto update_state; - } else if (mptcp_close_state(sk)) { - pr_debug("Sending DATA_FIN sk=%p", sk); - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); + if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_close_wake_up(sk); + } else if (sk->sk_state == TCP_FIN_WAIT1) { + inet_sk_state_store(sk, TCP_FIN_WAIT2); + } + } - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); - } + mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } +} - sk_stream_wait_close(sk, timeout); +static void __mptcp_wr_shutdown(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); -update_state: - inet_sk_state_store(sk, TCP_CLOSE); + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, + !!mptcp_send_head(sk)); + + /* will be ignored by fallback sockets */ + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + __mptcp_check_send_data_fin(sk); +} + +static void __mptcp_destroy_sock(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); + + pr_debug("msk=%p", msk); -cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -1975,20 +2497,77 @@ cleanup: spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - __mptcp_clear_xmit(sk); - - release_sock(sk); + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + sk_stop_timer(sk, &sk->sk_timer); + msk->pm.status = 0; list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); } - mptcp_cancel_work(sk); + sk->sk_prot->destroy(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_released); + sk_stream_kill_queues(sk); + xfrm_sk_free_policy(sk); + sk_refcnt_debug_release(sk); + sock_put(sk); +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow; + bool do_cancel_work = false; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } - __skb_queue_purge(&sk->sk_receive_queue); + if (mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); - sk_common_release(sk); + sk_stream_wait_close(sk, timeout); + +cleanup: + /* orphan all the subflows */ + inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; + list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow, dispose_socket; + struct socket *sock; + + slow = lock_sock_fast(ssk); + sock = ssk->sk_socket; + dispose_socket = sock && sock != sk->sk_socket; + sock_orphan(ssk); + unlock_sock_fast(ssk, slow); + + /* for the outgoing subflows we additionally need to free + * the associated socket + */ + if (dispose_socket) + iput(SOCK_INODE(sock)); + } + sock_orphan(sk); + + sock_hold(sk); + pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + do_cancel_work = true; + } else { + sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + } + release_sock(sk); + if (do_cancel_work) + mptcp_cancel_work(sk); + sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) @@ -2059,13 +2638,17 @@ struct sock *mptcp_sk_clone(const struct sock *sk, WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; - atomic64_set(&msk->snd_una, msk->write_seq); + msk->snd_nxt = msk->write_seq; + msk->snd_una = msk->write_seq; + msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + if (mp_opt->mp_capable) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -2092,6 +2675,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; + + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, @@ -2116,7 +2701,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; - struct sock *ssk = newsk; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; @@ -2131,21 +2715,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, /* acquire the 2nd reference for the owning socket */ sock_hold(new_mptcp_sock); - - local_bh_disable(); - bh_lock_sock(new_mptcp_sock); - msk = mptcp_sk(new_mptcp_sock); - msk->first = newsk; - newsk = new_mptcp_sock; - mptcp_copy_inaddrs(newsk, ssk); - list_add(&subflow->node, &msk->conn_list); - - mptcp_rcv_space_init(msk, ssk); - bh_unlock_sock(new_mptcp_sock); - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - local_bh_enable(); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); @@ -2156,6 +2727,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, void mptcp_destroy_common(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; + + __mptcp_clear_xmit(sk); + + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -2165,9 +2743,6 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->cached_ext) - __skb_ext_put(msk->cached_ext); - mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } @@ -2282,16 +2857,58 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED) +void __mptcp_data_acked(struct sock *sk) +{ + if (!sock_owned_by_user(sk)) + __mptcp_clean_una(sk); + else + set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); -/* this is very alike tcp_release_cb() but we must handle differently a - * different set of events - */ + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); +} + +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk) +{ + if (!mptcp_send_head(sk)) + return; + + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + +#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) + +/* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; + /* push_pending may touch wmem_reserved, do it before the later + * cleanup + */ + if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + __mptcp_clean_una(sk); + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) { + /* mptcp_push_pending() acquires the subflow socket lock + * + * 1) can't be invoked in atomic scope + * 2) must avoid ABBA deadlock with msk socket spinlock: the RX + * datapath acquires the msk socket spinlock while helding + * the subflow socket lock + */ + + spin_unlock_bh(&sk->sk_lock.slock); + mptcp_push_pending(sk, 0); + spin_lock_bh(&sk->sk_lock.slock); + } + + /* clear any wmem reservation and errors */ + __mptcp_update_wmem(sk); + __mptcp_update_rmem(sk); + do { flags = sk->sk_tsq_flags; if (!(flags & MPTCP_DEFERRED_ALL)) @@ -2301,15 +2918,6 @@ static void mptcp_release_cb(struct sock *sk) sock_release_ownership(sk); - if (flags & TCPF_DELACK_TIMER_DEFERRED) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - ssk = mptcp_subflow_recv_lookup(msk); - if (!ssk || !schedule_work(&msk->work)) - __sock_put(sk); - } - if (flags & TCPF_WRITE_TIMER_DEFERRED) { mptcp_retransmit_handler(sk); __sock_put(sk); @@ -2367,9 +2975,11 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); - atomic64_set(&msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); @@ -2385,9 +2995,9 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -bool mptcp_finish_join(struct sock *sk) +bool mptcp_finish_join(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; struct socket *parent_sock; @@ -2408,12 +3018,14 @@ bool mptcp_finish_join(struct sock *sk) /* active connections are already on conn_list, and we can't acquire * msk lock here. * use the join list lock as synchronization point and double-check - * msk status to avoid racing with mptcp_close() + * msk status to avoid racing with __mptcp_destroy_sock() */ spin_lock_bh(&msk->join_list_lock); ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) + if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { list_add_tail(&subflow->node, &msk->join_list); + sock_hold(ssk); + } spin_unlock_bh(&msk->join_list_lock); if (!ret) return false; @@ -2422,19 +3034,12 @@ bool mptcp_finish_join(struct sock *sk) * at close time */ parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !sk->sk_socket) - mptcp_sock_graft(sk, parent_sock); + if (parent_sock && !ssk->sk_socket) + mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); return true; } -static bool mptcp_memory_free(const struct sock *sk, int wake) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; -} - static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -2455,8 +3060,8 @@ static struct proto mptcp_prot = { .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2599,6 +3204,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; + struct sock *newsk = newsock->sk; + bool slowpath; + + slowpath = lock_sock_fast(newsk); + mptcp_copy_inaddrs(newsk, msk->first); + mptcp_rcv_space_init(msk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. @@ -2610,6 +3221,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + unlock_sock_fast(newsk, slowpath); } if (inet_csk_listen_poll(ssock->sk)) @@ -2628,6 +3240,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + + if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) + return 0; + + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + set_bit(MPTCP_NOSPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + return 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -2646,8 +3276,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + mask |= mptcp_check_writeable(msk); } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; @@ -2658,12 +3287,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; + struct sock *sk = sock->sk; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); - lock_sock(sock->sk); + lock_sock(sk); how++; if ((how & ~SHUTDOWN_MASK) || !how) { @@ -2672,45 +3301,22 @@ static int mptcp_shutdown(struct socket *sock, int how) } if (sock->state == SS_CONNECTING) { - if ((1 << sock->sk->sk_state) & + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } - /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (__mptcp_check_fallback(msk)) { - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } else if ((how & SEND_SHUTDOWN) && - ((1 << sock->sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && - mptcp_close_state(sock->sk)) { - __mptcp_flush_join_list(msk); - - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } + sk->sk_shutdown |= how; + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); /* Wake up anyone sleeping in poll. */ - sock->sk->sk_state_change(sock->sk); + sk->sk_state_change(sk); out_unlock: - release_sock(sock->sk); + release_sock(sk); return ret; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 13ab89dc1914..fc56e730fb35 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -86,11 +86,20 @@ /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 -#define MPTCP_SEND_SPACE 1 +#define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_PUSH_PENDING 6 +#define MPTCP_CLEAN_UNA 7 + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; @@ -153,11 +162,18 @@ struct mptcp_addr_info { enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, }; +enum mptcp_add_addr_status { + MPTCP_ADD_ADDR_SIGNAL, + MPTCP_ADD_ADDR_ECHO, + MPTCP_ADD_ADDR_IPV6, +}; + struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; @@ -165,13 +181,12 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ - bool add_addr_signal; + u8 add_addr_signal; bool rm_addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; - bool add_addr_echo; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; @@ -187,9 +202,10 @@ struct mptcp_pm_data { struct mptcp_data_frag { struct list_head list; u64 data_seq; - int data_len; - int offset; - int overhead; + u16 data_len; + u16 offset; + u16 overhead; + u16 already_sent; struct page *page; }; @@ -200,13 +216,20 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 snd_nxt; u64 ack_seq; + u64 rcv_wnd_sent; u64 rcv_data_fin_seq; + int wmem_reserved; struct sock *last_snd; int snd_burst; - atomic64_t snd_una; + int old_wspace; + u64 snd_una; + u64 wnd_end; unsigned long timer_ival; u32 token; + int rmem_pending; + int rmem_released; unsigned long flags; bool can_ack; bool fully_established; @@ -214,13 +237,18 @@ struct mptcp_sock { bool snd_data_fin_enable; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; + struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; + struct sk_buff_head receive_queue; + struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ + int tx_pending_data; + int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; + struct mptcp_data_frag *first_pending; struct list_head join_list; - struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; @@ -232,6 +260,22 @@ struct mptcp_sock { } rcvq_space; }; +#define mptcp_lock_sock(___sk, cb) do { \ + struct sock *__sk = (___sk); /* silence macro reuse warning */ \ + might_sleep(); \ + spin_lock_bh(&__sk->sk_lock.slock); \ + if (__sk->sk_lock.owned) \ + __lock_sock(__sk); \ + cb; \ + __sk->sk_lock.owned = 1; \ + spin_unlock(&__sk->sk_lock.slock); \ + mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ + local_bh_enable(); \ +} while (0) + +#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) +#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) @@ -240,11 +284,46 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline int __mptcp_space(const struct sock *sk) +{ + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + + return READ_ONCE(msk->first_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *cur; + + cur = msk->first_pending; + return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : + list_next_entry(cur, list); +} + +static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->first_pending) + return NULL; + + if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) + return NULL; + + return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (list_empty(&msk->rtx_queue)) + if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -312,7 +391,8 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, rx_eof : 1, - can_ack : 1; /* only after processing the remote a key */ + can_ack : 1, /* only after processing the remote a key */ + disposable : 1; /* ctx can be free at ulp release time */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -361,15 +441,24 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } +static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + sock_hold(mptcp_subflow_tcp_sock(subflow)); + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); +} + int mptcp_is_enabled(struct net *net); +unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout); + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ @@ -407,9 +496,18 @@ static inline bool mptcp_is_fully_established(struct sock *sk) void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); -void mptcp_data_acked(struct sock *sk); +bool mptcp_schedule_work(struct sock *sk); +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk); +void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); +void __mptcp_flush_join_list(struct mptcp_sock *msk); +static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->snd_data_fin_enable) && + READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); +} + void mptcp_destroy_common(struct mptcp_sock *msk); void __init mptcp_token_init(void); @@ -444,6 +542,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk, void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); struct mptcp_pm_add_entry * @@ -458,7 +557,17 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal); + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); +} + +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); +} + +static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) @@ -485,6 +594,7 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); @@ -494,13 +604,6 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } -static inline bool before64(__u64 seq1, __u64 seq2) -{ - return (__s64)(seq1 - seq2) < 0; -} - -#define after64(seq2, seq1) before64(seq1, seq2) - void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ac4a1fe3550b..5f5815a1665f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li return 0; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +/* Init mptcp request socket. + * + * Returns an error code if a JOIN has failed and a TCP reset + * should be sent. + */ +static int subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req, ret = __subflow_init_req(req, sk_listener); if (ret) - return; + return 0; mptcp_get_options(skb, &mp_opt); @@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req, SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (mp_opt.mp_join) - return; + return 0; } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } @@ -157,7 +162,7 @@ again: } else { subflow_req->mp_capable = 1; } - return; + return 0; } err = mptcp_token_new_request(req); @@ -175,7 +180,11 @@ again: subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); - if (unlikely(req->syncookie) && subflow_req->msk) { + /* Can't fall back to TCP in this case. */ + if (!subflow_req->msk) + return -EPERM; + + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); } @@ -183,6 +192,8 @@ again: pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } + + return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, @@ -228,27 +239,53 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); -static void subflow_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v4_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; + + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; - subflow_init_req(req, sk_listener, skb); + dst_release(dst); + if (!req->syncookie) + tcp_request_sock_ops.send_reset(sk, skb); + return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static void subflow_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v6_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; + + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; - subflow_init_req(req, sk_listener, skb); + dst_release(dst); + if (!req->syncookie) + tcp6_request_sock_ops.send_reset(sk, skb); + return NULL; } #endif @@ -543,9 +580,8 @@ create_msk: fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!mp_opt.mp_join || - !mptcp_can_accept_new_subflow(subflow_req->msk) || - !subflow_hmac_valid(req, &mp_opt)) { + if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || + !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; } @@ -578,6 +614,10 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* link the newly created socket to the msk */ + mptcp_add_pending_subflow(mptcp_sk(new_msk), ctx); + WRITE_ONCE(mptcp_sk(new_msk)->first, child); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ @@ -846,8 +886,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; - if (incr) - tcp_cleanup_rbuf(ssk, incr); } static bool subflow_check_data_avail(struct sock *ssk) @@ -969,7 +1007,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; - *space = tcp_space(sk); + *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } @@ -994,20 +1032,9 @@ static void subflow_data_ready(struct sock *sk) mptcp_data_ready(parent, sk); } -static void subflow_write_space(struct sock *sk) +static void subflow_write_space(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; - - if (!sk_stream_is_writeable(sk)) - return; - - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); - } + /* we take action in __mptcp_clean_una() */ } static struct inet_connection_sock_af_ops * @@ -1125,13 +1152,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (err && err != -EINPROGRESS) goto failed; - spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); - spin_unlock_bh(&msk->join_list_lock); - + mptcp_add_pending_subflow(msk, subflow); return err; failed: + subflow->disposable = 1; sock_release(sf); return err; } @@ -1254,7 +1279,6 @@ static void subflow_state_change(struct sock *sk) mptcp_data_ready(parent, sk); if (__mptcp_check_fallback(mptcp_sk(parent)) && - !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1297,17 +1321,26 @@ out: return err; } -static void subflow_ulp_release(struct sock *sk) +static void subflow_ulp_release(struct sock *ssk) { - struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + bool release = true; + struct sock *sk; if (!ctx) return; - if (ctx->conn) - sock_put(ctx->conn); + sk = ctx->conn; + if (sk) { + /* if the msk has been orphaned, keep the ctx + * alive, will be freed by mptcp_done() + */ + release = ctx->disposable; + sock_put(sk); + } - kfree_rcu(ctx, rcu); + if (release) + kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, @@ -1392,7 +1425,7 @@ void __init mptcp_subflow_init(void) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1401,7 +1434,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 8b47c4bb1c6b..feb4b9ffd462 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -291,7 +291,7 @@ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, { struct mptcp_sock *ret = NULL; struct hlist_nulls_node *pos; - int slot, num; + int slot, num = 0; for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { struct token_bucket *bucket = &token_hash[slot]; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index f1be3e3f6425..a9cb355324d1 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1726,9 +1726,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); - /* Set up generic netlink interface */ - ncsi_init_netlink(dev); - pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; @@ -1892,8 +1889,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) list_del_rcu(&ndp->node); spin_unlock_irqrestore(&ncsi_dev_lock, flags); - ncsi_unregister_netlink(nd->dev); - kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index adddc7707aa4..bb5f1650f11c 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -766,24 +766,8 @@ static struct genl_family ncsi_genl_family __ro_after_init = { .n_small_ops = ARRAY_SIZE(ncsi_ops), }; -int ncsi_init_netlink(struct net_device *dev) +static int __init ncsi_init_netlink(void) { - int rc; - - rc = genl_register_family(&ncsi_genl_family); - if (rc) - netdev_err(dev, "ncsi: failed to register netlink family\n"); - - return rc; -} - -int ncsi_unregister_netlink(struct net_device *dev) -{ - int rc; - - rc = genl_unregister_family(&ncsi_genl_family); - if (rc) - netdev_err(dev, "ncsi: failed to unregister netlink family\n"); - - return rc; + return genl_register_family(&ncsi_genl_family); } +subsys_initcall(ncsi_init_netlink); diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 7502723fba83..39a1a9d7bf77 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -22,7 +22,4 @@ int ncsi_send_netlink_err(struct net_device *dev, struct nlmsghdr *nlhdr, int err); -int ncsi_init_netlink(struct net_device *dev); -int ncsi_unregister_netlink(struct net_device *dev); - #endif /* __NCSI_NETLINK_H__ */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 52370211e46b..49fbef0d99be 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -682,6 +682,16 @@ config NFT_FIB_NETDEV The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_REJECT_NETDEV + depends on NFT_REJECT_IPV4 + depends on NFT_REJECT_IPV6 + tristate "Netfilter nf_tables netdev REJECT support" + help + This option enables the REJECT support from the netdev table. + The return packet generation will be delegated to the IPv4 + or IPv6 ICMP or TCP RST implementation depending on the + protocol of the packet. + endif # NF_TABLES_NETDEV endif # NF_TABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0e0ded87e27b..33da7bf1b68e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 6f35832f0de3..89009c82a6b2 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -271,8 +271,7 @@ flag_nested(const struct nlattr *nla) static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, - [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int @@ -637,13 +636,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, if (SET_WITH_COUNTER(set)) { struct ip_set_counter *counter = ext_counter(data, set); + ip_set_update_counter(counter, ext, flags); + if (flags & IPSET_FLAG_MATCH_COUNTERS && !(ip_set_match_counter(ip_set_get_packets(counter), mext->packets, mext->packets_op) && ip_set_match_counter(ip_set_get_bytes(counter), mext->bytes, mext->bytes_op))) return false; - ip_set_update_counter(counter, ext, flags); } if (SET_WITH_SKBINFO(set)) ip_set_get_skbinfo(ext_skbinfo(data, set), @@ -1109,6 +1109,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, ret = -IPSET_ERR_PROTOCOL; goto put_out; } + /* Set create flags depending on the type revision */ + set->flags |= set->type->create_flags[revision]; ret = set->type->create(net, set, tb, flags); if (ret != 0) @@ -1239,10 +1241,12 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { + u32 flags = flag_exist(nlh); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { - ret = -ENOENT; + if (!(flags & IPSET_FLAG_EXIST)) + ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 521e970be402..5f1208ad049e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -37,18 +37,18 @@ */ /* Number of elements to store in an initial array block */ -#define AHASH_INIT_SIZE 4 +#define AHASH_INIT_SIZE 2 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 +#define AHASH_MAX(h) ((h)->bucketsize) + /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI -#define AHASH_MAX(h) ((h)->ahash_max) - static u8 -tune_ahash_max(u8 curr, u32 multi) +tune_bucketsize(u8 curr, u32 multi) { u32 n; @@ -61,12 +61,10 @@ tune_ahash_max(u8 curr, u32 multi) */ return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } - -#define TUNE_AHASH_MAX(h, multi) \ - ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#define TUNE_BUCKETSIZE(h, multi) \ + ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi)) #else -#define AHASH_MAX(h) AHASH_MAX_SIZE -#define TUNE_AHASH_MAX(h, multi) +#define TUNE_BUCKETSIZE(h, multi) #endif /* A hash bucket */ @@ -321,9 +319,7 @@ struct htype { #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; /* markmask value for mark mask to store */ #endif -#ifdef IP_SET_HASH_WITH_MULTI - u8 ahash_max; /* max elements in an array block */ -#endif + u8 bucketsize; /* max elements in an array block */ #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif @@ -950,7 +946,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto set_full; /* Create a new slot */ if (n->pos >= n->size) { - TUNE_AHASH_MAX(h, multi); + TUNE_BUCKETSIZE(h, multi); if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); @@ -1305,6 +1301,11 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif + if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) { + if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) || + nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval))) + goto nla_put_failure; + } if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) @@ -1547,8 +1548,20 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifdef IP_SET_HASH_WITH_MARKMASK h->markmask = markmask; #endif - get_random_bytes(&h->initval, sizeof(h->initval)); - + if (tb[IPSET_ATTR_INITVAL]) + h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL])); + else + get_random_bytes(&h->initval, sizeof(h->initval)); + h->bucketsize = AHASH_MAX_SIZE; + if (tb[IPSET_ATTR_BUCKETSIZE]) { + h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]); + if (h->bucketsize < AHASH_INIT_SIZE) + h->bucketsize = AHASH_INIT_SIZE; + else if (h->bucketsize > AHASH_MAX_SIZE) + h->bucketsize = AHASH_MAX_SIZE; + else if (h->bucketsize % 2) + h->bucketsize += 1; + } t->htable_bits = hbits; t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 5d6d68eaf6a9..d1bef23fd4f5 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -23,7 +23,8 @@ /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ -#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ +/* 4 skbinfo support */ +#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -277,11 +278,13 @@ static struct ip_set_type hash_ip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index eceb7bc4a93a..467c59a83c0a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -23,7 +23,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>"); @@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index aba1df617d6e..18346d18aa16 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -21,7 +21,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ +/* 2 skbinfo support */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); @@ -274,12 +275,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 1ff228717e29..e1ca11196515 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -341,11 +342,13 @@ static struct ip_set_type hash_ipport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fa88afd812fa..ab179e064597 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -356,11 +357,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index eef6ecfcb409..8f075b44cf64 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -27,7 +27,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -513,11 +514,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 0b61593165ef..718814730acf 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -16,7 +16,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 136cf0781d3a..c1a11f041ac6 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -24,7 +24,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -354,11 +355,13 @@ static struct ip_set_type hash_net_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index be5e95a0d876..ddd51c2e1cb3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -26,7 +26,8 @@ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ +/* 7 interface wildcard support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -442,7 +443,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -470,11 +471,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index da4ef910b12d..6532f0505e66 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -459,11 +460,13 @@ static struct ip_set_type hash_netnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34448df80fb9..ec1564a1cb5a 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -26,7 +26,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -460,11 +461,13 @@ static struct ip_set_type hash_netport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 934c1712cba8..0e91d1e82f1c 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -558,11 +559,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index cc3c275934f4..c0b8215ab3d4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -742,12 +742,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(ipvs->net, skb) != 0) + ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e279ded4e306..d45dbcba8b49 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -4167,12 +4167,18 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) spin_lock_init(&ipvs->tot_stats.lock); - proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); - proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, - ip_vs_stats_show, NULL); - proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, - ip_vs_stats_percpu_show, NULL); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, + &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) + goto err_vs; + if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, + ip_vs_stats_show, NULL)) + goto err_stats; + if (!proc_create_net_single("ip_vs_stats_percpu", 0, + ipvs->net->proc_net, + ip_vs_stats_percpu_show, NULL)) + goto err_percpu; +#endif if (ip_vs_control_net_init_sysctl(ipvs)) goto err; @@ -4180,6 +4186,17 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) return 0; err: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + +err_percpu: + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + +err_stats: + remove_proc_entry("ip_vs", ipvs->net->proc_net); + +err_vs: +#endif free_percpu(ipvs->tot_stats.cpustats); return -ENOMEM; } @@ -4188,9 +4205,11 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); +#endif free_percpu(ipvs->tot_stats.cpustats); } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index c8fb2187ad4b..811c6c9b59e1 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -834,12 +834,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static bool nf_conntrack_tcp_established(const struct nf_conn *ct) -{ - return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && - test_bit(IPS_ASSURED_BIT, &ct->status); -} - /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 59151dc07fdc..e87b6bd6b3cd 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -715,7 +715,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -953,7 +953,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9cca35d22927..d7d34a62d3bf 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -446,7 +446,7 @@ synproxy_send_tcp(struct net *net, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 65cb8e3c13d9..a11bc8dcaa82 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -581,7 +581,8 @@ struct nft_module_request { }; #ifdef CONFIG_MODULES -static int nft_request_module(struct net *net, const char *fmt, ...) +static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, + ...) { char module_name[MODULE_NAME_LEN]; struct nft_module_request *req; @@ -619,7 +620,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...) static void lockdep_nfnl_nft_mutex_not_held(void) { #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + if (debug_locks) + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); #endif } @@ -1281,7 +1283,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); @@ -1721,7 +1723,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_alloc; } - nla_strlcpy(ifname, attr, IFNAMSIZ); + nla_strscpy(ifname, attr, IFNAMSIZ); dev = __dev_get_by_name(net, ifname); if (!dev) { err = -ENOENT; @@ -5734,7 +5736,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, struct rhlist_head *tmp, *list; struct nft_object *obj; - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && @@ -7137,7 +7139,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, GFP_KERNEL); kfree(buf); - if (ctx->report && + if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; @@ -7259,7 +7261,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq, AUDIT_NFT_OP_GEN_REGISTER, GFP_KERNEL); - if (nlmsg_report(nlh) && + if (!nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; @@ -8053,12 +8055,16 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net, bool autoload) +static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; + if (action == NFNL_ABORT_VALIDATE && + nf_tables_validate(net) < 0) + return -EAGAIN; + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -8190,7 +8196,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) nf_tables_abort_release(trans); } - if (autoload) + if (action == NFNL_ABORT_AUTOLOAD) nf_tables_module_autoload(net); else nf_tables_module_autoload_cleanup(net); @@ -8203,9 +8209,10 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action) { - int ret = __nf_tables_abort(net, autoload); + int ret = __nf_tables_abort(net, action); mutex_unlock(&net->nft.commit_mutex); @@ -8836,7 +8843,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, false); + __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9f625724a20f..9ae14270c543 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -28,6 +28,23 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) return flow; } +void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, + enum flow_dissector_key_id addr_type) +{ + struct nft_flow_match *match = &flow->match; + struct nft_flow_key *mask = &match->mask; + struct nft_flow_key *key = &match->key; + + if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + return; + + key->control.addr_type = addr_type; + mask->control.addr_type = 0xffff; + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = + offsetof(struct nft_flow_key, control); +} + struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 2daa1f6ae344..d3df66a39b5e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -333,7 +333,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; - +replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); @@ -499,7 +499,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb, true); + ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -510,11 +510,25 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb, false); + ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb, false); + enum nfnl_abort_action abort_action; + + if (status & NFNL_BATCH_FAILURE) + abort_action = NFNL_ABORT_NONE; + else + abort_action = NFNL_ABORT_VALIDATE; + + err = ss->abort(net, oskb, abort_action); + if (err == -EAGAIN) { + nfnl_err_reset(&err_list); + kfree_skb(skb); + module_put(ss->owner); + status |= NFNL_BATCH_FAILURE; + goto replay_abort; + } } if (ss->cleanup) ss->cleanup(net); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..5e511df8d709 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -112,7 +112,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); + nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 5b0d0a77379c..0f94fce1d3ed 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -146,7 +146,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - nla_strlcpy(expect_policy->name, + nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); @@ -233,7 +233,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - nla_strlcpy(helper->name, + nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index 8826bbe71136..edd02cda57fc 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv, skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u32 *)ipv6_hdr(skb)))) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index bc079d68a536..00e563a72d3d 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -123,11 +123,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; - if (priv->op != NFT_CMP_EQ || reg->len != priv->len) + if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; - memcpy(key + reg->offset, &priv->data, priv->len); - memcpy(mask + reg->offset, ®->mask, priv->len); + memcpy(key + reg->offset, &priv->data, reg->len); + memcpy(mask + reg->offset, ®->mask, reg->len); flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; @@ -137,7 +137,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, nft_reg_load16(priv->data.data) != ARPHRD_ETHER) return -EOPNOTSUPP; - nft_offload_update_dependency(ctx, &priv->data, priv->len); + nft_offload_update_dependency(ctx, &priv->data, reg->len); return 0; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 322bd674963e..a8c4d442231c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -990,7 +990,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (!priv->l4proto) return -ENOENT; - nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 57899454a530..a06a46b039c5 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -152,7 +152,7 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); if (priv->prefix == NULL) return -ENOMEM; - nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index b37bd02448d8..bf4b3ad5314c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -724,22 +724,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, switch (priv->key) { case NFT_META_PROTOCOL: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, - sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case NFT_META_L4PROTO: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, - sizeof(__u8), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; case NFT_META_IIF: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_ifindex, sizeof(__u32), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); break; case NFT_META_IIFTYPE: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_iftype, sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); break; default: return -EOPNOTSUPP; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index dcd3c7b8a367..47d4e0e21651 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -165,6 +165,34 @@ nla_put_failure: return -1; } +static bool nft_payload_offload_mask(struct nft_offload_reg *reg, + u32 priv_len, u32 field_len) +{ + unsigned int remainder, delta, k; + struct nft_data mask = {}; + __be32 remainder_mask; + + if (priv_len == field_len) { + memset(®->mask, 0xff, priv_len); + return true; + } else if (priv_len > field_len) { + return false; + } + + memset(&mask, 0xff, field_len); + remainder = priv_len % sizeof(u32); + if (remainder) { + k = priv_len / sizeof(u32); + delta = field_len - priv_len; + remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1)); + mask.data[k] = (__force u32)remainder_mask; + } + + memcpy(®->mask, &mask, field_len); + + return true; +} + static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) @@ -173,21 +201,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, @@ -195,14 +223,14 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, vlan_tci, sizeof(__be16), reg); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, @@ -210,7 +238,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, @@ -218,7 +246,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, @@ -239,21 +267,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, daddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, protocol): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -275,21 +307,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, daddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, nexthdr): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -331,14 +367,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, @@ -359,14 +395,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 61fb7e8afbf0..927ff8459bd9 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -40,6 +40,7 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -47,9 +48,17 @@ int nft_reject_init(const struct nft_ctx *ctx, priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; case NFT_REJECT_TCP_RST: break; default: @@ -69,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index cf8f2646e93c..32f3ea398ddf 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -58,60 +58,16 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; } -static int nft_reject_inet_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_reject_inet_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { - struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - - icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - if (priv->type == NFT_REJECT_ICMPX_UNREACH && - icmp_code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; - - priv->icmp_code = icmp_code; - break; - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - return 0; -} - -static int nft_reject_inet_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - default: - break; - } - - return 0; - -nla_put_failure: - return -1; + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_INGRESS)); } static struct nft_expr_type nft_reject_inet_type; @@ -119,9 +75,9 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_inet_init, - .dump = nft_reject_inet_dump, - .validate = nft_reject_validate, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_inet_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c new file mode 100644 index 000000000000..d89f68754f42 --- /dev/null +++ b/net/netfilter/nft_reject_netdev.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com> + * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb) +{ + dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol), + eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest, + nskb->len); + dev_queue_xmit(nskb); +} + +static void nft_reject_netdev_send_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + + +static void nft_reject_netdev_send_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct ethhdr *eth = eth_hdr(pkt->skb); + struct nft_reject *priv = nft_expr_priv(expr); + const unsigned char *dest = eth->h_dest; + + if (is_broadcast_ether_addr(dest) || + is_multicast_ether_addr(dest)) + goto out; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; + case htons(ETH_P_IPV6): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmpv6_code(priv->icmp_code)); + break; + } + break; + default: + /* No explicit way to reject this protocol, drop it. */ + break; + } +out: + regs->verdict.code = NF_DROP; +} + +static int nft_reject_netdev_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + +static struct nft_expr_type nft_reject_netdev_type; +static const struct nft_expr_ops nft_reject_netdev_ops = { + .type = &nft_reject_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_netdev_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_netdev_validate, +}; + +static struct nft_expr_type nft_reject_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "reject", + .ops = &nft_reject_netdev_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_netdev_module_init(void) +{ + return nft_register_expr(&nft_reject_netdev_type); +} + +static void __exit nft_reject_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_reject_netdev_type); +} + +module_init(nft_reject_netdev_module_init); +module_exit(nft_reject_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>"); +MODULE_AUTHOR("Jose M. Guisado <guigom@riseup.net>"); +MODULE_DESCRIPTION("Reject packets from netdev via nftables"); +MODULE_ALIAS_NFT_AF_EXPR(5, "reject"); diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index cedf47ab3c6f..2182d361e273 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -191,8 +191,8 @@ static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); + return ip_route_me_harder(entry->state.net, entry->state.sk, + skb, RTN_UNSPEC); } #endif return 0; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 4e62f2ad3575..f28c8947c730 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -366,6 +366,7 @@ static const struct netlbl_calipso_ops *calipso_ops; /** * netlbl_calipso_ops_register - Register the CALIPSO operations + * @ops: ops to register * * Description: * Register the CALIPSO packet engine operations. diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index eb1d66d20afb..df1b41ed73fd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -95,7 +95,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_entry; } - nla_strlcpy(entry->domain, + nla_strscpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 2e8e3f7b2111..ccb491642811 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1166,12 +1166,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + u32 skip_addr4 = cb->args[2]; + u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) + u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif @@ -1182,7 +1183,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || @@ -1190,7 +1191,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < cb->args[2]) + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1203,10 +1204,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr4 = 0; + skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < cb->args[3]) + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1219,8 +1222,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr6 = 0; + skip_addr6 = 0; #endif /* IPv6 */ } + iter_chain = 0; + skip_chain = 0; } unlabel_staticlist_return: diff --git a/net/nfc/core.c b/net/nfc/core.c index eb377f87bcae..573c80c6ff7a 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -189,7 +189,8 @@ static const struct rfkill_ops nfc_rfkill_ops = { * nfc_start_poll - start polling for nfc targets * * @dev: The nfc device that must start polling - * @protocols: bitset of nfc protocols that must be used for polling + * @im_protocols: bitset of nfc initiator protocols to be used for polling + * @tm_protocols: bitset of nfc transport protocols to be used for polling * * The device remains polling for targets until a target is found or * the nfc_stop_poll function is called. @@ -436,6 +437,7 @@ error: * * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated + * @mode: idle or sleep? */ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { @@ -703,7 +705,11 @@ EXPORT_SYMBOL(nfc_tm_deactivated); /** * nfc_alloc_send_skb - allocate a skb for data exchange responses * + * @dev: device sending the response + * @sk: socket sending the response + * @flags: MSG_DONTWAIT flag * @size: size to allocate + * @err: pointer to memory to store the error code */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, @@ -1039,6 +1045,8 @@ struct nfc_dev *nfc_get_device(unsigned int idx) * * @ops: device operations * @supported_protocols: NFC protocols supported by the device + * @tx_headroom: reserved space at beginning of skb + * @tx_tailroom: reserved space at end of skb */ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, u32 supported_protocols, diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index e3599ed4a7a8..da7e2112771f 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -458,6 +458,9 @@ static void digital_add_poll_tech(struct nfc_digital_dev *ddev, u8 rf_tech, /** * start_poll operation + * @nfc_dev: device to be polled + * @im_protocols: bitset of nfc initiator protocols to be used for polling + * @tm_protocols: bitset of nfc transport protocols to be used for polling * * For every supported protocol, the corresponding polling function is added * to the table of polling technologies (ddev->poll_techs[]) using diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 741da8f81c2b..4953ee5146e1 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1112,6 +1112,8 @@ static struct nfc_ops nci_nfc_ops = { * * @ops: device operations * @supported_protocols: NFC protocols supported by the device + * @tx_headroom: Reserved space at beginning of skb + * @tx_tailroom: Reserved space at end of skb */ struct nci_dev *nci_allocate_device(struct nci_ops *ops, __u32 supported_protocols, diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index c18e76d6d8ba..6b275a387a92 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -363,16 +363,13 @@ exit: } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, - u8 result, struct sk_buff *skb) + struct sk_buff *skb) { struct nci_conn_info *conn_info; - u8 status = result; conn_info = ndev->hci_dev->conn_info; - if (!conn_info) { - status = NCI_STATUS_REJECTED; + if (!conn_info) goto exit; - } conn_info->rx_skb = skb; @@ -388,7 +385,7 @@ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, { switch (type) { case NCI_HCI_HCP_RESPONSE: - nci_hci_resp_received(ndev, pipe, instruction, skb); + nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 8709f3d4e7c4..573b38ad2f8e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1226,7 +1226,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], + nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b87bfc82f44f..c3a664871cb5 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -199,6 +199,9 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, __be32 lse; int err; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + stack = mpls_hdr(skb); lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); err = skb_mpls_update_lse(skb, lse); @@ -958,14 +961,13 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, { /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */ struct nlattr *dec_ttl_arg = nla_data(attr); - int rem = nla_len(attr); if (nla_len(dec_ttl_arg)) { - struct nlattr *actions = nla_next(dec_ttl_arg, &rem); + struct nlattr *actions = nla_data(dec_ttl_arg); if (actions) - return clone_execute(dp, skb, key, 0, actions, rem, - last, false); + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, false); } consume_skb(skb); return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 4beb96139d77..6a88daab0190 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1037,6 +1037,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } + + if (nf_ct_protonum(ct) == IPPROTO_TCP && + nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) { + /* Be liberal for tcp packets so that out-of-window + * packets are not marked invalid. + */ + nf_ct_set_tcp_be_liberal(ct); + } } return 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 832f898edb6a..9d6ef6cb9b26 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1703,13 +1703,13 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - err = ovs_dp_change(dp, a); - if (err) - goto err_destroy_meters; - /* So far only local changes have been made, now need the lock. */ ovs_lock(); + err = ovs_dp_change(dp, a); + if (err) + goto err_unlock_and_destroy_meters; + vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); @@ -1725,8 +1725,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - ovs_unlock(); - goto err_destroy_meters; + goto err_unlock_and_destroy_meters; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1741,7 +1740,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_meters: +err_unlock_and_destroy_meters: + ovs_unlock(); ovs_meters_exit(dp); err_destroy_ports: kfree(dp->ports); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b03d142ec82e..c7f34d6a9934 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -294,6 +294,10 @@ static bool icmp6hdr_ok(struct sk_buff *skb) /** * Parse vlan tag from vlan header. + * @skb: skb containing frame to parse + * @key_vh: pointer to parsed vlan tag + * @untag_vlan: should the vlan header be removed from the frame + * * Returns ERROR on memory error. * Returns 0 if it encounters a non-vlan or incomplete packet. * Returns 1 after successfully parsing vlan tag. diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 9d3e50c4d29f..ec0689ddc635 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2503,28 +2503,42 @@ static int validate_and_copy_dec_ttl(struct net *net, __be16 eth_type, __be16 vlan_tci, u32 mpls_label_count, bool log) { - int start, err; - u32 nested = true; + const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; + int start, action_start, err, rem; + const struct nlattr *a, *actions; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (!nla_len(attr)) - return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL, - NULL, 0, log); + /* Ignore unknown attributes to be future proof. */ + if (type > OVS_DEC_TTL_ATTR_MAX) + continue; + + if (!type || attrs[type]) + return -EINVAL; + + attrs[type] = a; + } + + actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; + if (rem || !actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); if (start < 0) return start; - err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested, - sizeof(nested), log); - - if (err) - return err; + action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); + if (action_start < 0) + return start; - err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type, + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, mpls_label_count, log); if (err) return err; + add_nested_action_end(*sfa, action_start); add_nested_action_end(*sfa, start); return 0; } @@ -3487,20 +3501,42 @@ out: static int dec_ttl_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { - int err = 0, rem = nla_len(attr); - struct nlattr *start; + struct nlattr *start, *action_start; + const struct nlattr *a; + int err = 0, rem; start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); - if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); - if (err) - nla_nest_cancel(skb, start); - else - nla_nest_end(skb, start); + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + + action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); + if (!action_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + goto out; + + nla_nest_end(skb, action_start); + break; + default: + /* Ignore all other option to be future compatible */ + break; + } + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); return err; } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index f3486a37361a..c89c8da99f1a 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -390,7 +390,7 @@ static struct mask_cache *tbl_mask_cache_alloc(u32 size) } int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) { - struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); struct mask_cache *new; if (size == mc->cache_size) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 8fbefd52af7f..15424d26e85d 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -423,7 +423,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; meter = dp_meter_create(a); - if (IS_ERR_OR_NULL(meter)) + if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 1e30d8df3ba5..5b2ee9c1c00b 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -35,21 +35,18 @@ internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; + /* store len value because skb can be freed inside ovs_vport_receive() */ len = skb->len; + rcu_read_lock(); err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); - if (likely(!err)) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - } else { + if (likely(!err)) + dev_sw_netstats_tx_add(netdev, 1, len); + else netdev->stats.tx_errors++; - } + return NETDEV_TX_OK; } @@ -83,24 +80,12 @@ static void internal_dev_destructor(struct net_device *dev) ovs_vport_free(vport); } -static void -internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->rx_errors = dev->stats.rx_errors; - stats->tx_errors = dev->stats.tx_errors; - stats->tx_dropped = dev->stats.tx_dropped; - stats->rx_dropped = dev->stats.rx_dropped; - - dev_fetch_sw_netstats(stats, dev->tstats); -} - static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_get_stats64 = internal_get_stats, + .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 82d801f063b7..4ed7e52c7012 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -111,10 +111,12 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) * * @priv_size: Size of private data area to allocate. * @ops: vport device ops + * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using - * vport_priv(). vports that are no longer needed should be released with + * vport_priv(). Some parameters of the vport will be initialized from @parms. + * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cefbd50c1090..a667b19eab78 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -46,6 +46,7 @@ * Copyright (C) 2011, <lokec@ccs.neu.edu> */ +#include <linux/ethtool.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> @@ -93,8 +94,8 @@ /* Assumptions: - - If the device has no dev->header_ops, there is no LL header visible - above the device. In this case, its hard_header_len should be 0. + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. @@ -108,26 +109,26 @@ On receive: ----------- -Incoming, dev->header_ops != NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->header_ops != NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->header_ops == NULL +Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->header_ops == NULL +Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->header_ops == NULL we are unable to restore the ll header, + If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. @@ -1636,13 +1637,15 @@ static bool fanout_find_new_id(struct sock *sk, u16 *new_id) return false; } -static int fanout_add(struct sock *sk, u16 id, u16 type_flags) +static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); + u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; + u16 id = args->id; int err; switch (type) { @@ -1700,11 +1703,21 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } err = -EINVAL; - if (match && match->flags != flags) - goto out; - if (!match) { + if (match) { + if (match->flags != flags) + goto out; + if (args->max_num_members && + args->max_num_members != match->max_num_members) + goto out; + } else { + if (args->max_num_members > PACKET_FANOUT_MAX) + goto out; + if (!args->max_num_members) + /* legacy PACKET_FANOUT_MAX */ + args->max_num_members = 256; err = -ENOMEM; - match = kzalloc(sizeof(*match), GFP_KERNEL); + match = kvzalloc(struct_size(match, arr, args->max_num_members), + GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); @@ -1720,6 +1733,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.id_match = match_fanout_group; + match->max_num_members = args->max_num_members; list_add(&match->list, &fanout_list); } err = -EINVAL; @@ -1730,7 +1744,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; - if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { + if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); po->fanout = match; po->rollover = rollover; @@ -1744,7 +1758,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); - kfree(match); + kvfree(match); } out: @@ -2069,7 +2083,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * @@ -2198,7 +2212,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { @@ -3075,7 +3089,7 @@ static int packet_release(struct socket *sock) kfree(po->rollover); if (f) { fanout_release_data(f); - kfree(f); + kvfree(f); } /* * Now the socket is dead. No more input will appear. @@ -3866,14 +3880,14 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, } case PACKET_FANOUT: { - int val; + struct fanout_args args = { 0 }; - if (optlen != sizeof(val)) + if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; - if (copy_from_sockptr(&val, optval, sizeof(val))) + if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; - return fanout_add(sk, val & 0xffff, val >> 16); + return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { diff --git a/net/packet/internal.h b/net/packet/internal.h index fd41ecb7f605..baafc3f3fa25 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -77,11 +77,12 @@ struct packet_ring_buffer { }; extern struct mutex fanout_mutex; -#define PACKET_FANOUT_MAX 256 +#define PACKET_FANOUT_MAX (1 << 16) struct packet_fanout { possible_net_t net; unsigned int num_members; + u32 max_num_members; u16 id; u8 type; u8 flags; @@ -90,10 +91,10 @@ struct packet_fanout { struct bpf_prog __rcu *bpf_prog; }; struct list_head list; - struct sock *arr[PACKET_FANOUT_MAX]; spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; + struct sock *arr[]; }; struct packet_rollover { diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index b8559c882431..56aaf8cb6527 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -517,10 +517,6 @@ static int ctrl_cmd_new_server(struct sockaddr_qrtr *from, port = from->sq_port; } - /* Don't accept spoofed messages */ - if (from->sq_node != node_id) - return -EINVAL; - srv = server_add(service, instance, node_id, port); if (!srv) return -EINVAL; @@ -559,10 +555,6 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, port = from->sq_port; } - /* Don't accept spoofed messages */ - if (from->sq_node != node_id) - return -EINVAL; - /* Local servers may only unregister themselves */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 957aa9263ba4..f4ab3ca6d73b 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -171,8 +171,13 @@ static void __qrtr_node_release(struct kref *kref) void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); - if (node->nid != QRTR_EP_NID_AUTO) - radix_tree_delete(&qrtr_nodes, node->nid); + /* If the node is a bridge for other nodes, there are possibly + * multiple entries pointing to our released node, delete them all. + */ + radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { + if (*slot == node) + radix_tree_iter_delete(&qrtr_nodes, &iter, slot); + } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); @@ -347,7 +352,7 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); - hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST); + hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); @@ -401,12 +406,13 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; - if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) + if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); - node->nid = nid; + if (node->nid == QRTR_EP_NID_AUTO) + node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } @@ -494,6 +500,13 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) qrtr_node_assign(node, cb->src_node); + if (cb->type == QRTR_TYPE_NEW_SERVER) { + /* Remote node endpoint can bridge other distant nodes */ + const struct qrtr_ctrl_pkt *pkt = data + hdrlen; + + qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); + } + if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { @@ -519,18 +532,20 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer + * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. */ -static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) +static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, + gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; - skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL); + skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; @@ -592,6 +607,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; + unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); @@ -599,11 +615,18 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ - skb = qrtr_alloc_ctrl_packet(&pkt); - if (skb) { - pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); - qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + spin_lock_irqsave(&qrtr_nodes_lock, flags); + radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { + if (*slot != node) + continue; + src.sq_node = iter.index; + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); + if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); + qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + } } + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); @@ -656,7 +679,7 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; - skb = qrtr_alloc_ctrl_packet(&pkt); + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); @@ -982,7 +1005,7 @@ static int qrtr_send_resume_tx(struct qrtr_cb *cb) if (!node) return -EINVAL; - skb = qrtr_alloc_ctrl_packet(&pkt); + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 06603dd1c8aa..b36b60668b1d 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -956,9 +956,10 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, conn->c_proposed_version, UINT_MAX, UINT_MAX, isv6); - ret = rdma_connect(cm_id, &conn_param); + ret = rdma_connect_locked(cm_id, &conn_param); if (ret) - rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n", + ret); out: /* Beware - returning non-zero tells the rdma_cm to destroy diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 971c73c7d34c..97101c55763d 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -876,6 +876,9 @@ static int rfkill_resume(struct device *dev) rfkill->suspended = false; + if (!rfkill->registered) + return 0; + if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 7b094275ea8b..11c45c8c6c16 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -96,10 +96,19 @@ static void rose_loopback_timer(struct timer_list *unused) } if (frametype == ROSE_CALL_REQUEST) { - if ((dev = rose_dev_get(dest)) != NULL) { - if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) - kfree_skb(skb); - } else { + if (!rose_loopback_neigh->dev) { + kfree_skb(skb); + continue; + } + + dev = rose_dev_get(dest); + if (!dev) { + kfree_skb(skb); + continue; + } + + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) { + dev_put(dev); kfree_skb(skb); } } else { diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ddd0f95713a9..b11281bed2a4 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -28,6 +28,7 @@ rxrpc-y := \ rtt.o \ security.o \ sendmsg.o \ + server_key.o \ skbuff.o \ utils.o diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dce48162f6c2..7bd6f8a66a3e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -12,6 +12,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#include <keys/rxrpc-type.h> #include "protocol.h" #if 0 @@ -34,6 +35,7 @@ struct rxrpc_crypt { #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) +struct key_preparsed_payload; struct rxrpc_connection; /* @@ -216,17 +218,30 @@ struct rxrpc_security { /* Clean up a security service */ void (*exit)(void); + /* Parse the information from a server key */ + int (*preparse_server_key)(struct key_preparsed_payload *); + + /* Clean up the preparse buffer after parsing a server key */ + void (*free_preparse_server_key)(struct key_preparsed_payload *); + + /* Destroy the payload of a server key */ + void (*destroy_server_key)(struct key *); + + /* Describe a server key */ + void (*describe_server_key)(const struct key *, struct seq_file *); + /* initialise a connection's security */ - int (*init_connection_security)(struct rxrpc_connection *); + int (*init_connection_security)(struct rxrpc_connection *, + struct rxrpc_key_token *); - /* prime a connection's packet security */ - int (*prime_packet_security)(struct rxrpc_connection *); + /* Work out how much data we can store in a packet, given an estimate + * of the amount of data remaining. + */ + int (*how_much_data)(struct rxrpc_call *, size_t, + size_t *, size_t *, size_t *); /* impose security on a packet */ - int (*secure_packet)(struct rxrpc_call *, - struct sk_buff *, - size_t, - void *); + int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, @@ -438,10 +453,15 @@ struct rxrpc_connection { struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + const struct rxrpc_security *security; /* applied security module */ - struct key *server_key; /* security for this service */ - struct crypto_sync_skcipher *cipher; /* encryption handle */ - struct rxrpc_crypt csum_iv; /* packet checksum base */ + union { + struct { + struct crypto_sync_skcipher *cipher; /* encryption handle */ + struct rxrpc_crypt csum_iv; /* packet checksum base */ + u32 nonce; /* response re-use preventer */ + } rxkad; + }; unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -451,10 +471,7 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ - u32 security_nonce; /* response re-use preventer */ u32 service_id; /* Service ID, possibly upgraded */ - u8 size_align; /* data size alignment (for security) */ - u8 security_size; /* security header size */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ @@ -888,8 +905,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, - const struct rxrpc_security *, struct key *, - struct sk_buff *); + const struct rxrpc_security *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* @@ -906,10 +922,8 @@ extern const struct rxrpc_security rxrpc_no_security; * key.c */ extern struct key_type key_type_rxrpc; -extern struct key_type key_type_rxrpc_s; int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); -int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); @@ -1052,11 +1066,13 @@ extern const struct rxrpc_security rxkad; * security.c */ int __init rxrpc_init_security(void); +const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); -bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, - const struct rxrpc_security **, struct key **, - struct sk_buff *); +const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, + struct sk_buff *); +struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, + struct sk_buff *, u32, u32); /* * sendmsg.c @@ -1064,6 +1080,13 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* + * server_key.c + */ +extern struct key_type key_type_rxrpc_s; + +int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); + +/* * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 8df1964db333..382add72c66f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -261,7 +261,6 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_peer *peer, struct rxrpc_connection *conn, const struct rxrpc_security *sec, - struct key *key, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -309,7 +308,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(rx, conn, sec, key, skb); + rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { rxrpc_get_connection(conn); } @@ -353,7 +352,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn; struct rxrpc_peer *peer = NULL; struct rxrpc_call *call = NULL; - struct key *key = NULL; _enter(""); @@ -374,11 +372,13 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ conn = rxrpc_find_connection_rcu(local, skb, &peer); - if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb)) - goto no_call; + if (!conn) { + sec = rxrpc_get_incoming_security(rx, skb); + if (!sec) + goto no_call; + } - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb); - key_put(key); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 7e574c75be8e..dbea0bfee48e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -180,10 +180,6 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) if (ret < 0) goto error_1; - ret = conn->security->prime_packet_security(conn); - if (ret < 0) - goto error_2; - atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); @@ -203,8 +199,6 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) _leave(" = %p", conn); return conn; -error_2: - conn->security->clear(conn); error_1: rxrpc_put_client_connection_id(conn); error_0: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index aff184145ffa..aab069701398 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -333,11 +333,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - ret = conn->security->init_connection_security(conn); - if (ret < 0) - return ret; - - ret = conn->security->prime_packet_security(conn); + ret = conn->security->init_connection_security( + conn, conn->params.key->payload.data[0]); if (ret < 0) return ret; @@ -377,7 +374,6 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) _enter("{%d}", conn->debug_id); ASSERT(conn->security_ix != 0); - ASSERT(conn->server_key); if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3bcbe0665f91..b2159dbf5412 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -49,7 +49,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - conn->size_align = 4; conn->idle_timestamp = jiffies; } @@ -363,7 +362,6 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) conn->security->clear(conn); key_put(conn->params.key); - key_put(conn->server_key); rxrpc_put_bundle(conn->bundle); rxrpc_put_peer(conn->params.peer); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 6c847720494f..e1966dfc9152 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -156,7 +156,6 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, struct rxrpc_connection *conn, const struct rxrpc_security *sec, - struct key *key, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -170,7 +169,6 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; conn->security = sec; - conn->server_key = key_get(key); if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; else diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index f6c59f5fae9d..9aae99d67833 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -8,20 +8,25 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -static int none_init_connection_security(struct rxrpc_connection *conn) +static int none_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) { return 0; } -static int none_prime_packet_security(struct rxrpc_connection *conn) +/* + * Work out how much data we can put in an unsecured packet. + */ +static int none_how_much_data(struct rxrpc_call *call, size_t remain, + size_t *_buf_size, size_t *_data_size, size_t *_offset) { + *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + *_offset = 0; return 0; } -static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) +static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, + size_t data_size) { return 0; } @@ -86,8 +91,8 @@ const struct rxrpc_security rxrpc_no_security = { .init = none_init, .exit = none_exit, .init_connection_security = none_init_connection_security, - .prime_packet_security = none_prime_packet_security, .free_call_crypto = none_free_call_crypto, + .how_much_data = none_how_much_data, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .locate_data = none_locate_data, diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 2e8bd3b97301..9631aa8543b5 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -5,7 +5,7 @@ * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: - * "afs@CAMBRIDGE.REDHAT.COM> + * "afs@example.com" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -23,13 +23,9 @@ #include <keys/user-type.h> #include "ar-internal.h" -static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse(struct key_preparsed_payload *); -static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse(struct key_preparsed_payload *); -static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); -static void rxrpc_destroy_s(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); static long rxrpc_read(const struct key *, char *, size_t); @@ -50,38 +46,6 @@ struct key_type key_type_rxrpc = { EXPORT_SYMBOL(key_type_rxrpc); /* - * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the - * description and an 8-byte decryption key as the payload - */ -struct key_type key_type_rxrpc_s = { - .name = "rxrpc_s", - .flags = KEY_TYPE_NET_DOMAIN, - .vet_description = rxrpc_vet_description_s, - .preparse = rxrpc_preparse_s, - .free_preparse = rxrpc_free_preparse_s, - .instantiate = generic_key_instantiate, - .destroy = rxrpc_destroy_s, - .describe = rxrpc_describe, -}; - -/* - * Vet the description for an RxRPC server key - */ -static int rxrpc_vet_description_s(const char *desc) -{ - unsigned long num; - char *p; - - num = simple_strtoul(desc, &p, 10); - if (*p != ':' || num > 65535) - return -EINVAL; - num = simple_strtoul(p + 1, &p, 10); - if (*p || num < 1 || num > 255) - return -EINVAL; - return 0; -} - -/* * parse an RxKAD type XDR format token * - the caller guarantees we have at least 4 words */ @@ -165,402 +129,17 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } -static void rxrpc_free_krb5_principal(struct krb5_principal *princ) -{ - int loop; - - if (princ->name_parts) { - for (loop = princ->n_name_parts - 1; loop >= 0; loop--) - kfree(princ->name_parts[loop]); - kfree(princ->name_parts); - } - kfree(princ->realm); -} - -static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) -{ - kfree(td->data); -} - -/* - * free up an RxK5 token - */ -static void rxrpc_rxk5_free(struct rxk5_key *rxk5) -{ - int loop; - - rxrpc_free_krb5_principal(&rxk5->client); - rxrpc_free_krb5_principal(&rxk5->server); - rxrpc_free_krb5_tagged(&rxk5->session); - - if (rxk5->addresses) { - for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); - kfree(rxk5->addresses); - } - if (rxk5->authdata) { - for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); - kfree(rxk5->authdata); - } - - kfree(rxk5->ticket); - kfree(rxk5->ticket2); - kfree(rxk5); -} - -/* - * extract a krb5 principal - */ -static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_parts, loop, tmp, paddedlen; - - /* there must be at least one name, and at least #names+1 length - * words */ - if (toklen <= 12) - return -EINVAL; - - _enter(",{%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); - - n_parts = ntohl(*xdr++); - toklen -= 4; - if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) - return -EINVAL; - princ->n_name_parts = n_parts; - - if (toklen <= (n_parts + 1) * 4) - return -EINVAL; - - princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); - if (!princ->name_parts) - return -ENOMEM; - - for (loop = 0; loop < n_parts; loop++) { - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) - return -EINVAL; - paddedlen = (tmp + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->name_parts[loop]) - return -ENOMEM; - memcpy(princ->name_parts[loop], xdr, tmp); - princ->name_parts[loop][tmp] = 0; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) - return -EINVAL; - paddedlen = (tmp + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - princ->realm = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->realm) - return -ENOMEM; - memcpy(princ->realm, xdr, tmp); - princ->realm[tmp] = 0; - toklen -= paddedlen; - xdr += paddedlen >> 2; - - _debug("%s/...@%s", princ->name_parts[0], princ->realm); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a piece of krb5 tagged data - */ -static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, - size_t max_data_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, len, paddedlen; - - /* there must be at least one tag and one length word */ - if (toklen <= 8) - return -EINVAL; - - _enter(",%zu,{%x,%x},%u", - max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); - - td->tag = ntohl(*xdr++); - len = ntohl(*xdr++); - toklen -= 8; - if (len > max_data_size) - return -EINVAL; - paddedlen = (len + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - td->data_len = len; - - if (len > 0) { - td->data = kmemdup(xdr, len, GFP_KERNEL); - if (!td->data) - return -ENOMEM; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - _debug("tag %x len %x", td->tag, td->data_len); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract an array of tagged data - */ -static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, - u8 *_n_elem, - u8 max_n_elem, - size_t max_elem_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - struct krb5_tagged_data *td; - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_elem, loop; - int ret; - - /* there must be at least one count */ - if (toklen < 4) - return -EINVAL; - - _enter(",,%u,%zu,{%x},%u", - max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); - - n_elem = ntohl(*xdr++); - toklen -= 4; - if (n_elem > max_n_elem) - return -EINVAL; - *_n_elem = n_elem; - if (n_elem > 0) { - if (toklen <= (n_elem + 1) * 4) - return -EINVAL; - - _debug("n_elem %d", n_elem); - - td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), - GFP_KERNEL); - if (!td) - return -ENOMEM; - *_td = td; - - for (loop = 0; loop < n_elem; loop++) { - ret = rxrpc_krb5_decode_tagged_data(&td[loop], - max_elem_size, - &xdr, &toklen); - if (ret < 0) - return ret; - } - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a krb5 ticket - */ -static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, - const __be32 **_xdr, unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, len, paddedlen; - - /* there must be at least one length word */ - if (toklen <= 4) - return -EINVAL; - - _enter(",{%x},%u", ntohl(xdr[0]), toklen); - - len = ntohl(*xdr++); - toklen -= 4; - if (len > AFSTOKEN_K5_TIX_MAX) - return -EINVAL; - paddedlen = (len + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - *_tktlen = len; - - _debug("ticket len %u", len); - - if (len > 0) { - *_ticket = kmemdup(xdr, len, GFP_KERNEL); - if (!*_ticket) - return -ENOMEM; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * parse an RxK5 type XDR format token - * - the caller guarantees we have at least 4 words - */ -static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, - size_t datalen, - const __be32 *xdr, unsigned int toklen) -{ - struct rxrpc_key_token *token, **pptoken; - struct rxk5_key *rxk5; - const __be32 *end_xdr = xdr + (toklen >> 2); - time64_t expiry; - int ret; - - _enter(",{%x,%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), - toklen); - - /* reserve some payload space for this subkey - the length of the token - * is a reasonable approximation */ - prep->quotalen = datalen + toklen; - - token = kzalloc(sizeof(*token), GFP_KERNEL); - if (!token) - return -ENOMEM; - - rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); - if (!rxk5) { - kfree(token); - return -ENOMEM; - } - - token->security_index = RXRPC_SECURITY_RXK5; - token->k5 = rxk5; - - /* extract the principals */ - ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); - if (ret < 0) - goto error; - - /* extract the session key and the encoding type (the tag field -> - * ENCTYPE_xxx) */ - ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - if (toklen < 4 * 8 + 2 * 4) - goto inval; - rxk5->authtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->starttime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->endtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->is_skey = ntohl(*xdr++); - rxk5->flags = ntohl(*xdr++); - toklen -= 4 * 8 + 2 * 4; - - _debug("times: a=%llx s=%llx e=%llx rt=%llx", - rxk5->authtime, rxk5->starttime, rxk5->endtime, - rxk5->renew_till); - _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); - - /* extract the permitted client addresses */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, - &rxk5->n_addresses, - AFSTOKEN_K5_ADDRESSES_MAX, - AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the tickets */ - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, - &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the typed auth data */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, - &rxk5->n_authdata, - AFSTOKEN_K5_AUTHDATA_MAX, - AFSTOKEN_BDATALN_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - if (toklen != 0) - goto inval; - - /* attach the payload */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; - *pptoken; - pptoken = &(*pptoken)->next) - continue; - *pptoken = token; - expiry = rxrpc_u32_to_time64(token->k5->endtime); - if (expiry < prep->expiry) - prep->expiry = expiry; - - _leave(" = 0"); - return 0; - -inval: - ret = -EINVAL; -error: - rxrpc_rxk5_free(rxk5); - kfree(token); - _leave(" = %d", ret); - return ret; -} - /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words */ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) { - const __be32 *xdr = prep->data, *token; + const __be32 *xdr = prep->data, *token, *p; const char *cp; unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix; size_t datalen = prep->datalen; - int ret; + int ret, ret2; _enter(",{%x,%x,%x,%x},%zu", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), @@ -610,20 +189,20 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) goto not_xdr; /* check each token wrapper */ - token = xdr; + p = xdr; loop = ntoken; do { if (datalen < 8) goto not_xdr; - toklen = ntohl(*xdr++); - sec_ix = ntohl(*xdr); + toklen = ntohl(*p++); + sec_ix = ntohl(*p); datalen -= 4; _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); paddedlen = (toklen + 3) & ~3; if (toklen < 20 || toklen > datalen || paddedlen > datalen) goto not_xdr; datalen -= paddedlen; - xdr += paddedlen >> 2; + p += paddedlen >> 2; } while (--loop > 0); @@ -634,44 +213,50 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) /* okay: we're going to assume it's valid XDR format * - we ignore the cellname, relying on the key to be correctly named */ + ret = -EPROTONOSUPPORT; do { - xdr = token; toklen = ntohl(*xdr++); - token = xdr + ((toklen + 3) >> 2); - sec_ix = ntohl(*xdr++); + token = xdr; + xdr += (toklen + 3) / 4; + + sec_ix = ntohl(*token++); toklen -= 4; - _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); + _debug("TOKEN type=%x len=%x", sec_ix, toklen); switch (sec_ix) { case RXRPC_SECURITY_RXKAD: - ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen); - if (ret != 0) - goto error; + ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + default: + ret2 = -EPROTONOSUPPORT; + break; + } - case RXRPC_SECURITY_RXK5: - ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen); + switch (ret2) { + case 0: + ret = 0; + break; + case -EPROTONOSUPPORT: + break; + case -ENOPKG: if (ret != 0) - goto error; + ret = -ENOPKG; break; - default: - ret = -EPROTONOSUPPORT; + ret = ret2; goto error; } } while (--ntoken > 0); - _leave(" = 0"); - return 0; +error: + _leave(" = %d", ret); + return ret; not_xdr: _leave(" = -EPROTO"); return -EPROTO; -error: - _leave(" = %d", ret); - return ret; } /* @@ -805,10 +390,6 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; - case RXRPC_SECURITY_RXK5: - if (token->k5) - rxrpc_rxk5_free(token->k5); - break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -828,45 +409,6 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) } /* - * Preparse a server secret key. - * - * The data should be the 8-byte secret key. - */ -static int rxrpc_preparse_s(struct key_preparsed_payload *prep) -{ - struct crypto_skcipher *ci; - - _enter("%zu", prep->datalen); - - if (prep->datalen != 8) - return -EINVAL; - - memcpy(&prep->payload.data[2], prep->data, 8); - - ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ci)) { - _leave(" = %ld", PTR_ERR(ci)); - return PTR_ERR(ci); - } - - if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) - BUG(); - - prep->payload.data[0] = ci; - _leave(" = 0"); - return 0; -} - -/* - * Clean up preparse data. - */ -static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) -{ - if (prep->payload.data[0]) - crypto_free_skcipher(prep->payload.data[0]); -} - -/* * dispose of the data dangling from the corpse of a rxrpc key */ static void rxrpc_destroy(struct key *key) @@ -875,22 +417,29 @@ static void rxrpc_destroy(struct key *key) } /* - * dispose of the data dangling from the corpse of a rxrpc key - */ -static void rxrpc_destroy_s(struct key *key) -{ - if (key->payload.data[0]) { - crypto_free_skcipher(key->payload.data[0]); - key->payload.data[0] = NULL; - } -} - -/* * describe the rxrpc key */ static void rxrpc_describe(const struct key *key, struct seq_file *m) { + const struct rxrpc_key_token *token; + const char *sep = ": "; + seq_puts(m, key->description); + + for (token = key->payload.data[0]; token; token = token->next) { + seq_puts(m, sep); + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + seq_puts(m, "ka"); + break; + default: /* we have a ticket we can't encode */ + seq_printf(m, "%u", token->security_index); + break; + } + + sep = " "; + } } /* @@ -924,36 +473,6 @@ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) } /* - * grab the security keyring for a server socket - */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) -{ - struct key *key; - char *description; - - _enter(""); - - if (optlen <= 0 || optlen > PAGE_SIZE - 1) - return -EINVAL; - - description = memdup_sockptr_nul(optval, optlen); - if (IS_ERR(description)) - return PTR_ERR(description); - - key = request_key(&key_type_keyring, description, NULL); - if (IS_ERR(key)) { - kfree(description); - _leave(" = %ld", PTR_ERR(key)); - return PTR_ERR(key); - } - - rx->securities = key; - kfree(description); - _leave(" = 0 [key %x]", key->serial); - return 0; -} - -/* * generate a server data key */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, @@ -1044,12 +563,10 @@ static long rxrpc_read(const struct key *key, char *buffer, size_t buflen) { const struct rxrpc_key_token *token; - const struct krb5_principal *princ; size_t size; __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; - int loop; _enter(""); @@ -1074,36 +591,8 @@ static long rxrpc_read(const struct key *key, case RXRPC_SECURITY_RXKAD: toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ - toksize += RND(token->kad->ticket_len); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - princ = &token->k5->server; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - toksize += 8 + RND(token->k5->session.data_len); - - toksize += 4 * 8 + 2 * 4; - - toksize += 4 + token->k5->n_addresses * 8; - for (loop = 0; loop < token->k5->n_addresses; loop++) - toksize += RND(token->k5->addresses[loop].data_len); - - toksize += 4 + RND(token->k5->ticket_len); - toksize += 4 + RND(token->k5->ticket2_len); - - toksize += 4 + token->k5->n_authdata * 8; - for (loop = 0; loop < token->k5->n_authdata; loop++) - toksize += RND(token->k5->authdata[loop].data_len); + if (!token->no_leak_key) + toksize += RND(token->kad->ticket_len); break; default: /* we have a ticket we can't encode */ @@ -1178,49 +667,10 @@ static long rxrpc_read(const struct key *key, ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); - ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - princ = &token->k5->server; - ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - ENCODE(token->k5->session.tag); - ENCODE_DATA(token->k5->session.data_len, - token->k5->session.data); - - ENCODE64(token->k5->authtime); - ENCODE64(token->k5->starttime); - ENCODE64(token->k5->endtime); - ENCODE64(token->k5->renew_till); - ENCODE(token->k5->is_skey); - ENCODE(token->k5->flags); - - ENCODE(token->k5->n_addresses); - for (loop = 0; loop < token->k5->n_addresses; loop++) { - ENCODE(token->k5->addresses[loop].tag); - ENCODE_DATA(token->k5->addresses[loop].data_len, - token->k5->addresses[loop].data); - } - - ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); - ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); - - ENCODE(token->k5->n_authdata); - for (loop = 0; loop < token->k5->n_authdata; loop++) { - ENCODE(token->k5->authdata[loop].tag); - ENCODE_DATA(token->k5->authdata[loop].data_len, - token->k5->authdata[loop].data); - } + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; default: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index f114dc2af5cf..e2e9e9b0a6d7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -15,6 +15,7 @@ #include <linux/scatterlist.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/key-type.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> @@ -27,6 +28,7 @@ #define INST_SZ 40 /* size of principal's instance */ #define REALM_SZ 40 /* size of principal's auth domain */ #define SNAME_SZ 40 /* size of service name */ +#define RXKAD_ALIGN 8 struct rxkad_level1_hdr { __be32 data_size; /* true data size (excluding padding) */ @@ -37,6 +39,9 @@ struct rxkad_level2_hdr { __be32 checksum; /* decrypted data checksum */ }; +static int rxkad_prime_packet_security(struct rxrpc_connection *conn, + struct crypto_sync_skcipher *ci); + /* * this holds a pinned cipher so that keventd doesn't get called by the cipher * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE @@ -47,17 +52,59 @@ static struct skcipher_request *rxkad_ci_req; static DEFINE_MUTEX(rxkad_ci_mutex); /* + * Parse the information from a server key + * + * The data should be the 8-byte secret key. + */ +static int rxkad_preparse_server_key(struct key_preparsed_payload *prep) +{ + struct crypto_skcipher *ci; + + if (prep->datalen != 8) + return -EINVAL; + + memcpy(&prep->payload.data[2], prep->data, 8); + + ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) + BUG(); + + prep->payload.data[0] = ci; + _leave(" = 0"); + return 0; +} + +static void rxkad_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + + if (prep->payload.data[0]) + crypto_free_skcipher(prep->payload.data[0]); +} + +static void rxkad_destroy_server_key(struct key *key) +{ + if (key->payload.data[0]) { + crypto_free_skcipher(key->payload.data[0]); + key->payload.data[0] = NULL; + } +} + +/* * initialise connection security */ -static int rxkad_init_connection_security(struct rxrpc_connection *conn) +static int rxkad_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) { struct crypto_sync_skcipher *ci; - struct rxrpc_key_token *token; int ret; _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key)); - token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); @@ -73,32 +120,68 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) switch (conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - break; case RXRPC_SECURITY_AUTH: - conn->size_align = 8; - conn->security_size = sizeof(struct rxkad_level1_hdr); - break; case RXRPC_SECURITY_ENCRYPT: - conn->size_align = 8; - conn->security_size = sizeof(struct rxkad_level2_hdr); break; default: ret = -EKEYREJECTED; goto error; } - conn->cipher = ci; - ret = 0; + ret = rxkad_prime_packet_security(conn, ci); + if (ret < 0) + goto error_ci; + + conn->rxkad.cipher = ci; + return 0; + +error_ci: + crypto_free_sync_skcipher(ci); error: _leave(" = %d", ret); return ret; } /* + * Work out how much data we can put in a packet. + */ +static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, + size_t *_buf_size, size_t *_data_size, size_t *_offset) +{ + size_t shdr, buf_size, chunk; + + switch (call->conn->params.security_level) { + default: + buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + shdr = 0; + goto out; + case RXRPC_SECURITY_AUTH: + shdr = sizeof(struct rxkad_level1_hdr); + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxkad_level2_hdr); + break; + } + + buf_size = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN); + + chunk = buf_size - shdr; + if (remain < chunk) + buf_size = round_up(shdr + remain, RXKAD_ALIGN); + +out: + *_buf_size = buf_size; + *_data_size = chunk; + *_offset = shdr; + return 0; +} + +/* * prime the encryption state with the invariant parts of a connection's * description */ -static int rxkad_prime_packet_security(struct rxrpc_connection *conn) +static int rxkad_prime_packet_security(struct rxrpc_connection *conn, + struct crypto_sync_skcipher *ci) { struct skcipher_request *req; struct rxrpc_key_token *token; @@ -116,7 +199,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) if (!tmpbuf) return -ENOMEM; - req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + req = skcipher_request_alloc(&ci->base, GFP_NOFS); if (!req) { kfree(tmpbuf); return -ENOMEM; @@ -131,13 +214,13 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) tmpbuf[3] = htonl(conn->security_ix); sg_init_one(&sg, tmpbuf, tmpsize); - skcipher_request_set_sync_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); skcipher_request_free(req); - memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv)); + memcpy(&conn->rxkad.csum_iv, tmpbuf + 2, sizeof(conn->rxkad.csum_iv)); kfree(tmpbuf); _leave(" = 0"); return 0; @@ -149,7 +232,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) */ static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call) { - struct crypto_skcipher *tfm = &call->conn->cipher->base; + struct crypto_skcipher *tfm = &call->conn->rxkad.cipher->base; struct skcipher_request *cipher_req = call->cipher_req; if (!cipher_req) { @@ -176,15 +259,14 @@ static void rxkad_free_call_crypto(struct rxrpc_call *call) * partially encrypt a packet (level 1 security) */ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 data_size, - void *sechdr, + struct sk_buff *skb, u32 data_size, struct skcipher_request *req) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; struct scatterlist sg; + size_t pad; u16 check; _enter(""); @@ -193,13 +275,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, data_size |= (u32)check << 16; hdr.data_size = htonl(data_size); - memcpy(sechdr, &hdr, sizeof(hdr)); + memcpy(skb->head, &hdr, sizeof(hdr)); + + pad = sizeof(struct rxkad_level1_hdr) + data_size; + pad = RXKAD_ALIGN - pad; + pad &= RXKAD_ALIGN - 1; + if (pad) + skb_put_zero(skb, pad); /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg, sechdr, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + sg_init_one(&sg, skb->head, 8); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -215,7 +303,6 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct sk_buff *skb, u32 data_size, - void *sechdr, struct skcipher_request *req) { const struct rxrpc_key_token *token; @@ -224,6 +311,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxrpc_crypt iv; struct scatterlist sg[16]; unsigned int len; + size_t pad; u16 check; int err; @@ -235,14 +323,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; - memcpy(sechdr, &rxkhdr, sizeof(rxkhdr)); + memcpy(skb->head, &rxkhdr, sizeof(rxkhdr)); + + pad = sizeof(struct rxkad_level2_hdr) + data_size; + pad = RXKAD_ALIGN - pad; + pad &= RXKAD_ALIGN - 1; + if (pad) + skb_put_zero(skb, pad); /* encrypt from the session key */ token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + sg_init_one(&sg[0], skb->head, sizeof(rxkhdr)); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); @@ -252,11 +346,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, if (skb_shinfo(skb)->nr_frags > 16) goto out; - len = data_size + call->conn->size_align - 1; - len &= ~(call->conn->size_align - 1); + len = round_up(data_size, RXKAD_ALIGN); sg_init_table(sg, ARRAY_SIZE(sg)); - err = skb_to_sgvec(skb, sg, 0, len); + err = skb_to_sgvec(skb, sg, 8, len); if (unlikely(err < 0)) goto out; skcipher_request_set_crypt(req, sg, sg, len, iv.x); @@ -275,8 +368,7 @@ out: */ static int rxkad_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, - size_t data_size, - void *sechdr) + size_t data_size) { struct rxrpc_skb_priv *sp; struct skcipher_request *req; @@ -291,7 +383,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq, data_size); - if (!call->conn->cipher) + if (!call->conn->rxkad.cipher) return 0; ret = key_validate(call->conn->params.key); @@ -303,7 +395,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, return -ENOMEM; /* continue encrypting from where we left off */ - memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* calculate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); @@ -312,7 +404,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -329,12 +421,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, ret = 0; break; case RXRPC_SECURITY_AUTH: - ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr, - req); + ret = rxkad_secure_packet_auth(call, skb, data_size, req); break; case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_secure_packet_encrypt(call, skb, data_size, - sechdr, req); + ret = rxkad_secure_packet_encrypt(call, skb, data_size, req); break; default: ret = -EPERM; @@ -380,7 +470,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); crypto_skcipher_decrypt(req); @@ -472,7 +562,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); @@ -538,7 +628,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, _enter("{%d{%x}},{#%u}", call->debug_id, key_serial(call->conn->params.key), seq); - if (!call->conn->cipher) + if (!call->conn->rxkad.cipher) return 0; req = rxkad_get_call_crypto(call); @@ -546,7 +636,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, return -ENOMEM; /* continue encrypting from where we left off */ - memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); @@ -555,7 +645,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -648,16 +738,12 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d}", conn->debug_id); - ret = key_validate(conn->server_key); - if (ret < 0) - return ret; - - get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce)); + get_random_bytes(&conn->rxkad.nonce, sizeof(conn->rxkad.nonce)); challenge.version = htonl(2); - challenge.nonce = htonl(conn->security_nonce); + challenge.nonce = htonl(conn->rxkad.nonce); challenge.min_level = htonl(0); challenge.__padding = 0; @@ -785,7 +871,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, struct rxrpc_crypt iv; struct scatterlist sg[1]; - req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) return -ENOMEM; @@ -794,7 +880,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_sync_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_encrypt(req); @@ -892,6 +978,7 @@ other_error: * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, + struct key *server_key, struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt *_session_key, @@ -911,30 +998,17 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, u32 abort_code; u8 *p, *q, *name, *end; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d},{%x}", conn->debug_id, key_serial(server_key)); *_expiry = 0; - ret = key_validate(conn->server_key); - if (ret < 0) { - switch (ret) { - case -EKEYEXPIRED: - abort_code = RXKADEXPIRED; - goto other_error; - default: - abort_code = RXKADNOAUTH; - goto other_error; - } - } - - ASSERT(conn->server_key->payload.data[0] != NULL); + ASSERT(server_key->payload.data[0] != NULL); ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); - memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv)); + memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); ret = -ENOMEM; - req = skcipher_request_alloc(conn->server_key->payload.data[0], - GFP_NOFS); + req = skcipher_request_alloc(server_key->payload.data[0], GFP_NOFS); if (!req) goto temporary_error; @@ -1090,6 +1164,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; + struct key *server_key; const char *eproto; time64_t expiry; void *ticket; @@ -1097,7 +1172,27 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, __be32 csum; int ret, i; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d}", conn->debug_id); + + server_key = rxrpc_look_up_server_security(conn, skb, 0, 0); + if (IS_ERR(server_key)) { + switch (PTR_ERR(server_key)) { + case -ENOKEY: + abort_code = RXKADUNKNOWNKEY; + break; + case -EKEYEXPIRED: + abort_code = RXKADEXPIRED; + break; + default: + abort_code = RXKADNOAUTH; + break; + } + trace_rxrpc_abort(0, "SVK", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + abort_code, PTR_ERR(server_key)); + *_abort_code = abort_code; + return -EPROTO; + } ret = -ENOMEM; response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); @@ -1109,8 +1204,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), response, sizeof(*response)) < 0) goto protocol_error; - if (!pskb_pull(skb, sizeof(*response))) - BUG(); version = ntohl(response->version); ticket_len = ntohl(response->ticket_len); @@ -1141,12 +1234,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_tkt_short"); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), ticket, ticket_len) < 0) goto protocol_error_free; - ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key, - &expiry, _abort_code); + ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, + &session_key, &expiry, _abort_code); if (ret < 0) goto temporary_error_free_ticket; @@ -1196,7 +1289,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_rsp_seq"); abort_code = RXKADOUTOFSEQUENCE; - if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1) + if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) goto protocol_error_free; eproto = tracepoint_string("rxkad_rsp_level"); @@ -1225,6 +1318,7 @@ protocol_error_free: protocol_error: kfree(response); trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); + key_put(server_key); *_abort_code = abort_code; return -EPROTO; @@ -1237,6 +1331,7 @@ temporary_error: * ENOMEM. We just want to send the challenge again. Note that we * also come out this way if the ticket decryption fails. */ + key_put(server_key); return ret; } @@ -1247,8 +1342,8 @@ static void rxkad_clear(struct rxrpc_connection *conn) { _enter(""); - if (conn->cipher) - crypto_free_sync_skcipher(conn->cipher); + if (conn->rxkad.cipher) + crypto_free_sync_skcipher(conn->rxkad.cipher); } /* @@ -1296,8 +1391,11 @@ const struct rxrpc_security rxkad = { .no_key_abort = RXKADUNKNOWNKEY, .init = rxkad_init, .exit = rxkad_exit, + .preparse_server_key = rxkad_preparse_server_key, + .free_preparse_server_key = rxkad_free_preparse_server_key, + .destroy_server_key = rxkad_destroy_server_key, .init_connection_security = rxkad_init_connection_security, - .prime_packet_security = rxkad_prime_packet_security, + .how_much_data = rxkad_how_much_data, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9b1fb9ed0717..50cb5f1ee0c0 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -55,7 +55,7 @@ void rxrpc_exit_security(void) /* * look up an rxrpc security module */ -static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) { if (security_index >= ARRAY_SIZE(rxrpc_security_types)) return NULL; @@ -81,16 +81,17 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) if (ret < 0) return ret; - token = key->payload.data[0]; - if (!token) - return -EKEYREJECTED; + for (token = key->payload.data[0]; token; token = token->next) { + sec = rxrpc_security_lookup(token->security_index); + if (sec) + goto found; + } + return -EKEYREJECTED; - sec = rxrpc_security_lookup(token->security_index); - if (!sec) - return -EKEYREJECTED; +found: conn->security = sec; - ret = conn->security->init_connection_security(conn); + ret = conn->security->init_connection_security(conn, token); if (ret < 0) { conn->security = &rxrpc_no_security; return ret; @@ -101,22 +102,16 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) } /* - * Find the security key for a server connection. + * Set the ops a server connection. */ -bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx, - const struct rxrpc_security **_sec, - struct key **_key, - struct sk_buff *skb) +const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, + struct sk_buff *skb) { const struct rxrpc_security *sec; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - key_ref_t kref = NULL; - char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { trace_rxrpc_abort(0, "SVS", @@ -124,35 +119,72 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock RX_INVALID_OPERATION, EKEYREJECTED); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; - return false; + return NULL; } - if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE) - goto out; - - if (!rx->securities) { + if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && + !rx->securities) { trace_rxrpc_abort(0, "SVR", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EKEYREJECTED); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; - return false; + skb->priority = sec->no_key_abort; + return NULL; } + return sec; +} + +/* + * Find the security key for a server connection. + */ +struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 kvno, u32 enctype) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct key *key = ERR_PTR(-EKEYREJECTED); + key_ref_t kref = NULL; + char kdesc[5 + 1 + 3 + 1 + 12 + 1 + 12 + 1]; + int ret; + + _enter(""); + + if (enctype) + sprintf(kdesc, "%u:%u:%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex, kvno, enctype); + else if (kvno) + sprintf(kdesc, "%u:%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex, kvno); + else + sprintf(kdesc, "%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex); + + rcu_read_lock(); + + rx = rcu_dereference(conn->params.local->service); + if (!rx) + goto out; + /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { - trace_rxrpc_abort(0, "SVK", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - sec->no_key_abort, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = sec->no_key_abort; - return false; + key = ERR_CAST(kref); + goto out; + } + + key = key_ref_to_ptr(kref); + + ret = key_validate(key); + if (ret < 0) { + key_put(key); + key = ERR_PTR(ret); + goto out; } out: - *_sec = sec; - *_key = key_ref_to_ptr(kref); - return true; + rcu_read_unlock(); + return key; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index d27140c836cc..af8ad6c30b9f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -327,7 +327,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, rxrpc_send_ack_packet(call, false, NULL); if (!skb) { - size_t size, chunk, max, space; + size_t remain, bufsize, chunk, offset; _debug("alloc"); @@ -342,24 +342,21 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto maybe_error; } - max = RXRPC_JUMBO_DATALEN; - max -= call->conn->security_size; - max &= ~(call->conn->size_align - 1UL); - - chunk = max; - if (chunk > msg_data_left(msg) && !more) - chunk = msg_data_left(msg); - - space = chunk + call->conn->size_align; - space &= ~(call->conn->size_align - 1UL); - - size = space + call->conn->security_size; + /* Work out the maximum size of a packet. Assume that + * the security header is going to be in the padded + * region (enc blocksize), but the trailer is not. + */ + remain = more ? INT_MAX : msg_data_left(msg); + ret = call->conn->security->how_much_data(call, remain, + &bufsize, &chunk, &offset); + if (ret < 0) + goto maybe_error; - _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); /* create a buffer that we can retain until it's ACK'd */ skb = sock_alloc_send_skb( - sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto maybe_error; @@ -371,9 +368,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ASSERTCMP(skb->mark, ==, 0); - _debug("HS: %u", call->conn->security_size); - skb_reserve(skb, call->conn->security_size); - skb->len += call->conn->security_size; + __skb_put(skb, offset); sp->remain = chunk; if (sp->remain > skb_tailroom(skb)) @@ -422,17 +417,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, (msg_data_left(msg) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; - size_t pad; - - /* pad out if we're using security */ - if (conn->security_ix) { - pad = conn->security_size + skb->mark; - pad = conn->size_align - pad; - pad &= conn->size_align - 1; - _debug("pad %zu", pad); - if (pad) - skb_put_zero(skb, pad); - } seq = call->tx_top + 1; @@ -446,8 +430,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - ret = call->security->secure_packet( - call, skb, skb->mark, skb->head); + ret = call->security->secure_packet(call, skb, skb->mark); if (ret < 0) goto out; diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c new file mode 100644 index 000000000000..ead3471307ee --- /dev/null +++ b/net/rxrpc/server_key.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * RxRPC keys should have a description of describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/skcipher.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/key-type.h> +#include <linux/ctype.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <keys/rxrpc-type.h> +#include <keys/user-type.h> +#include "ar-internal.h" + +static int rxrpc_vet_description_s(const char *); +static int rxrpc_preparse_s(struct key_preparsed_payload *); +static void rxrpc_free_preparse_s(struct key_preparsed_payload *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe_s(const struct key *, struct seq_file *); + +/* + * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the + * description and the key material as the payload. + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .flags = KEY_TYPE_NET_DOMAIN, + .vet_description = rxrpc_vet_description_s, + .preparse = rxrpc_preparse_s, + .free_preparse = rxrpc_free_preparse_s, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe_s, +}; + +/* + * Vet the description for an RxRPC server key. + */ +static int rxrpc_vet_description_s(const char *desc) +{ + unsigned long service, sec_class; + char *p; + + service = simple_strtoul(desc, &p, 10); + if (*p != ':' || service > 65535) + return -EINVAL; + sec_class = simple_strtoul(p + 1, &p, 10); + if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) + return -EINVAL; + return 0; +} + +/* + * Preparse a server secret key. + */ +static int rxrpc_preparse_s(struct key_preparsed_payload *prep) +{ + const struct rxrpc_security *sec; + unsigned int service, sec_class; + int n; + + _enter("%zu", prep->datalen); + + if (!prep->orig_description) + return -EINVAL; + + if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) + return -EINVAL; + + sec = rxrpc_security_lookup(sec_class); + if (!sec) + return -ENOPKG; + + prep->payload.data[1] = (struct rxrpc_security *)sec; + + return sec->preparse_server_key(prep); +} + +static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) +{ + const struct rxrpc_security *sec = prep->payload.data[1]; + + if (sec) + sec->free_preparse_server_key(prep); +} + +static void rxrpc_destroy_s(struct key *key) +{ + const struct rxrpc_security *sec = key->payload.data[1]; + + if (sec) + sec->destroy_server_key(key); +} + +static void rxrpc_describe_s(const struct key *key, struct seq_file *m) +{ + const struct rxrpc_security *sec = key->payload.data[1]; + + seq_puts(m, key->description); + if (sec && sec->describe_server_key) + sec->describe_server_key(key, m); +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_sockptr_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} diff --git a/net/sched/Makefile b/net/sched/Makefile index 66bbf9a98f9e..dd14ef413fda 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -5,6 +5,7 @@ obj-y := sch_generic.o sch_mq.o +obj-$(CONFIG_INET) += sch_frag.o obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f66417d5d2c3..2e85b636b27b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -22,6 +22,22 @@ #include <net/act_api.h> #include <net/netlink.h> +#ifdef CONFIG_INET +DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); +EXPORT_SYMBOL_GPL(tcf_frag_xmit_count); +#endif + +int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) +{ +#ifdef CONFIG_INET + if (static_branch_unlikely(&tcf_frag_xmit_count)) + return sch_frag_xmit_hook(skb, xmit); +#endif + + return xmit(skb); +} +EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit); + static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { @@ -215,6 +231,36 @@ static size_t tcf_action_fill_size(const struct tc_action *act) return sz; } +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cookie *cookie; + + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; + if (tcf_action_copy_stats(skb, a, 0)) + goto nla_put_failure; + if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index)) + goto nla_put_failure; + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); + goto nla_put_failure; + } + } + rcu_read_unlock(); + + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -248,7 +294,9 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, index--; goto nla_put_failure; } - err = tcf_action_dump_1(skb, p, 0, 0); + err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ? + tcf_action_dump_terse(skb, p, true) : + tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -256,7 +304,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } @@ -266,7 +314,7 @@ done: mutex_unlock(&idrinfo->lock); if (n_i) { - if (act_flags & TCA_FLAG_LARGE_DUMP_ON) + if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; @@ -651,7 +699,7 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) return res; } -/*TCA_ACT_MAX_PRIO is 32, there count upto 32 */ +/*TCA_ACT_MAX_PRIO is 32, there count up to 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) @@ -752,34 +800,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -static int -tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cookie *cookie; - - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) - goto nla_put_failure; - if (tcf_action_copy_stats(skb, a, 0)) - goto nla_put_failure; - - rcu_read_lock(); - cookie = rcu_dereference(a->act_cookie); - if (cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { - rcu_read_unlock(); - goto nla_put_failure; - } - } - rcu_read_unlock(); - - return 0; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -787,7 +807,7 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - if (tcf_action_dump_terse(skb, a)) + if (tcf_action_dump_terse(skb, a, false)) goto nla_put_failure; if (a->hw_stats != TCA_ACT_HW_STATS_ANY && @@ -832,7 +852,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = terse ? tcf_action_dump_terse(skb, a) : + err = terse ? tcf_action_dump_terse(skb, a, false) : tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; @@ -935,7 +955,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; } @@ -1469,7 +1489,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, } static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON | + TCA_ACT_FLAG_TERSE_DUMP), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index a4c7ba35a343..e48e980c3b93 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -65,7 +65,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, * In case a different well-known TC_ACT opcode has been * returned, it will overwrite the default one. * - * For everything else that is unkown, TC_ACT_UNSPEC is + * For everything else that is unknown, TC_ACT_UNSPEC is * returned. */ switch (filter_res) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index aba3cd85f284..83a5c6722a06 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -296,7 +296,8 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) goto err_insert; ct_ft->nf_ft.type = &flowtable_ct; - ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD; + ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD | + NF_FLOWTABLE_COUNTER; err = nf_flow_table_init(&ct_ft->nf_ft); if (err) goto err_init; @@ -540,7 +541,8 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); - nf_ct_acct_update(ct, dir, skb->len); + if (nf_ft->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(ct, dir, skb->len); return true; } @@ -1541,6 +1543,8 @@ static int __init ct_init_module(void) if (err) goto err_register; + static_branch_inc(&tcf_frag_xmit_count); + return 0; err_register: @@ -1552,6 +1556,7 @@ err_tbl_init: static void __exit ct_cleanup_module(void) { + static_branch_dec(&tcf_frag_xmit_count); tcf_unregister_action(&act_ct_ops, &ct_net_ops); tcf_ct_flow_tables_uninit(); destroy_workqueue(act_ct_wq); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8dc3bec0d325..ac7297f42355 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -166,7 +166,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (unlikely(!tname)) goto err1; if (tb[TCA_IPT_TABLE] == NULL || - nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) + nla_strscpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); t = kmemdup(td, td->u.target_size, GFP_KERNEL); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e24b7e2331cd..7153c67f641e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -205,6 +205,18 @@ release_idr: return err; } +static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) +{ + int err; + + if (!want_ingress) + err = tcf_dev_queue_xmit(skb, dev_queue_xmit); + else + err = netif_receive_skb(skb); + + return err; +} + static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -287,18 +299,15 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - if (skb_tc_reinsert(skb, res)) + err = tcf_mirred_forward(res->ingress, skb); + if (err) tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } } - if (!want_ingress) - err = dev_queue_xmit(skb2); - else - err = netif_receive_skb(skb2); - + err = tcf_mirred_forward(want_ingress, skb2); if (err) { out: tcf_action_inc_overlimit_qstats(&m->common); diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index f40bf9771cb9..d1486ea496a2 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -105,6 +105,9 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_MPLS_ACT_MODIFY: + if (!pskb_may_pull(skb, + skb_network_offset(skb) + MPLS_HLEN)) + goto drop; new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) goto drop; @@ -426,6 +429,7 @@ static void __exit mpls_cleanup_module(void) module_init(mpls_init_module); module_exit(mpls_cleanup_module); +MODULE_SOFTDEP("post: mpls_gso"); MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MPLS manipulation actions"); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index a4f3d0f0daa9..726cc956d06f 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -52,7 +52,7 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } @@ -71,7 +71,7 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index faeabff283a2..37b77bd30974 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -223,7 +223,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) - return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ; + return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } @@ -652,12 +652,12 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); + rtnl_lock(); down_write(&block->cb_lock); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); - up_write(&block->cb_lock); - rtnl_lock(); tcf_block_unbind(block, &bo); + up_write(&block->cb_lock); rtnl_unlock(); } @@ -991,13 +991,12 @@ __tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) */ struct tcf_proto * -tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp, - bool rtnl_held) +tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp); if (tp) - tcf_proto_put(tp, rtnl_held, NULL); + tcf_proto_put(tp, true, NULL); return tp_next; } @@ -1924,15 +1923,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event, - bool rtnl_held) + struct tcf_chain *chain, int event) { struct tcf_proto *tp; - for (tp = tcf_get_next_proto(chain, NULL, rtnl_held); - tp; tp = tcf_get_next_proto(chain, tp, rtnl_held)) + for (tp = tcf_get_next_proto(chain, NULL); + tp; tp = tcf_get_next_proto(chain, tp)) tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, rtnl_held); + q, parent, NULL, event, false, true); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2262,7 +2260,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER, rtnl_held); + chain, RTM_DELTFILTER); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2895,7 +2893,7 @@ replay: break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER, true); + chain, RTM_DELTFILTER); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2940,7 +2938,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) struct tcf_chain *chain; long index_start; long index; - u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) @@ -2955,13 +2952,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; - /* If we work with block index, q is NULL and parent value - * will never be used in the following code. The check - * in tcf_fill_node prevents it. However, compiler does not - * see that far, so set parent to zero to silence the warning - * about parent being uninitialized. - */ - parent = 0; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; @@ -2971,13 +2961,11 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) return skb->len; - parent = tcm->tcm_parent; - if (!parent) { + if (!tcm->tcm_parent) q = dev->qdisc; - parent = q->handle; - } else { + else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - } + if (!q) goto out; cops = q->ops->cl_ops; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index d36949d9382c..2e288f88ff02 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -238,7 +238,7 @@ static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) } } - /* Something went wrong if we are trying to replace a non-existant + /* Something went wrong if we are trying to replace a non-existent * node. Mind as well halt instead of silently failing. */ BUG_ON(1); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index a4d09b1fb66a..f17b049ea530 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -41,7 +41,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, break; case TCF_EM_ALIGN_U32: - /* Worth checking boundries? The branching seems + /* Worth checking boundaries? The branching seems * to get worse. Visit again. */ val = get_unaligned_be32(ptr); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2a76a2f5ed88..51cb553e4317 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1170,7 +1170,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to @@ -1943,8 +1943,8 @@ static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, chain = tcf_get_next_chain(block, chain)) { struct tcf_proto *tp; - for (tp = tcf_get_next_proto(chain, NULL, true); - tp; tp = tcf_get_next_proto(chain, tp, true)) { + for (tp = tcf_get_next_proto(chain, NULL); + tp; tp = tcf_get_next_proto(chain, tp)) { struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1c281cc81f57..007bd2d9f1ff 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -466,10 +466,10 @@ drop: __maybe_unused * non-ATM interfaces. */ -static void sch_atm_dequeue(unsigned long data) +static void sch_atm_dequeue(struct tasklet_struct *t) { - struct Qdisc *sch = (struct Qdisc *)data; - struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_qdisc_data *p = from_tasklet(p, t, task); + struct Qdisc *sch = qdisc_from_priv(p); struct atm_flow_data *flow; struct sk_buff *skb; @@ -563,7 +563,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, if (err) return err; - tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); + tasklet_setup(&p->task, sch_atm_dequeue); return 0; } diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 2eaac2ff380f..459cc240eda9 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -50,6 +50,7 @@ * locredit = max_frame_size * (sendslope / port_transmit_rate) */ +#include <linux/ethtool.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c new file mode 100644 index 000000000000..e1e77d3fb6c0 --- /dev/null +++ b/net/sched/sch_frag.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/dst.h> +#include <net/ip.h> +#include <net/ip6_fib.h> + +struct sch_frag_data { + unsigned long dst; + struct qdisc_skb_cb cb; + __be16 inner_protocol; + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[VLAN_ETH_HLEN]; + int (*xmit)(struct sk_buff *skb); +}; + +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); + +static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *qdisc_skb_cb(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + if (data->vlan_tci & VLAN_CFI_MASK) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, + data->vlan_tci & ~VLAN_CFI_MASK); + else + __vlan_hwaccel_clear_tag(skb); + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + return data->xmit(skb); +} + +static void sch_frag_prepare_frag(struct sk_buff *skb, + int (*xmit)(struct sk_buff *skb)) +{ + unsigned int hlen = skb_network_offset(skb); + struct sch_frag_data *data; + + data = this_cpu_ptr(&sch_frag_data_storage); + data->dst = skb->_skb_refdst; + data->cb = *qdisc_skb_cb(skb); + data->xmit = xmit; + data->inner_protocol = skb->inner_protocol; + if (skb_vlan_tag_present(skb)) + data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; + else + data->vlan_tci = 0; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static unsigned int +sch_frag_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops sch_frag_dst_ops = { + .family = AF_UNSPEC, + .mtu = sch_frag_dst_get_mtu, +}; + +static int sch_fragment(struct net *net, struct sk_buff *skb, + u16 mru, int (*xmit)(struct sk_buff *skb)) +{ + int ret = -1; + + if (skb_network_offset(skb) > VLAN_ETH_HLEN) { + net_warn_ratelimited("L2 header too long to fragment\n"); + goto err; + } + + if (skb_protocol(skb, true) == htons(ETH_P_IP)) { + struct dst_entry sch_frag_dst; + unsigned long orig_dst; + + sch_frag_prepare_frag(skb, xmit); + dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + sch_frag_dst.dev = skb->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &sch_frag_dst); + IPCB(skb)->frag_max_size = mru; + + ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + refdst_drop(orig_dst); + } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { + unsigned long orig_dst; + struct rt6_info sch_frag_rt; + + sch_frag_prepare_frag(skb, xmit); + memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); + dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + sch_frag_rt.dst.dev = skb->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &sch_frag_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, + sch_frag_xmit); + refdst_drop(orig_dst); + } else { + net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", + netdev_name(skb->dev), + ntohs(skb_protocol(skb, true)), mru, + skb->dev->mtu); + goto err; + } + + return ret; +err: + kfree_skb(skb); + return ret; +} + +int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) +{ + u16 mru = qdisc_skb_cb(skb)->mru; + int err; + + if (mru && skb->len > mru + skb->dev->hard_header_len) + err = sch_fragment(dev_net(skb->dev), skb, mru, xmit); + else + err = xmit(skb); + + return err; +} +EXPORT_SYMBOL_GPL(sch_frag_xmit_hook); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 84f82771cdf5..0c345e43a09a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -330,7 +330,7 @@ static s64 tabledist(s64 mu, s32 sigma, /* default uniform distribution */ if (dist == NULL) - return ((rnd % (2 * sigma)) + mu) - sigma; + return ((rnd % (2 * (u32)sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; @@ -812,6 +812,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_packets = INT_MAX; if (q->slot_config.max_bytes == 0) q->slot_config.max_bytes = INT_MAX; + + /* capping dist_jitter to the range acceptable by tabledist() */ + q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); + q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; if (q->slot_config.min_delay | q->slot_config.max_delay | @@ -1037,6 +1041,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_NETEM_SLOT]) get_slot(q, tb[TCA_NETEM_SLOT]); + /* capping jitter to the range acceptable by tabledist() */ + q->jitter = min_t(s64, abs(q->jitter), INT_MAX); + return ret; get_table_failure: diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index c65077f0c0f3..5a457ff61acd 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -405,7 +405,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero - * 3. If average dq_rate_estimator is enabled, we have atleast one + * 3. If average dq_rate_estimator is enabled, we have at least one * estimate for the avg_dq_rate ie., is a non-zero value */ if ((vars->qdelay < params->target / 2) && diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b0ad7687ee2c..26fb8a62996b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -6,6 +6,7 @@ * */ +#include <linux/ethtool.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 39d7fa9569f8..5da599ff84a9 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -11,6 +11,7 @@ menuconfig IP_SCTP select CRYPTO_HMAC select CRYPTO_SHA1 select LIBCRC32C + select NET_UDP_TUNNEL help Stream Control Transmission Protocol diff --git a/net/sctp/associola.c b/net/sctp/associola.c index fdb69d46276d..336df4b36655 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -99,6 +99,8 @@ static struct sctp_association *sctp_association_init( */ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + asoc->encap_port = sp->encap_port; + /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; @@ -624,6 +626,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, */ peer->hbinterval = asoc->hbinterval; + peer->encap_port = asoc->encap_port; + /* Set the path max_retrans. */ peer->pathmaxrxt = asoc->pathmaxrxt; diff --git a/net/sctp/input.c b/net/sctp/input.c index 55d4fc6f371d..d508f6f3dd08 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -449,7 +449,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); @@ -458,7 +458,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, "encountered!\n", __func__); if (del_timer(&t->proto_unreach_timer)) - sctp_association_put(asoc); + sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 8a58f42d6d19..c3e89c776e66 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -55,6 +55,7 @@ #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> +#include <net/udp_tunnel.h> #include <linux/uaccess.h> @@ -191,33 +192,53 @@ out: return ret; } -static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) +static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { + struct dst_entry *dst = dst_clone(t->dst); + struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 *fl6 = &transport->fl.u.ip6; __u8 tclass = np->tclass; - int res; + __be32 label; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - if (transport->dscp & SCTP_DSCP_SET_MASK) - tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + if (t->dscp & SCTP_DSCP_SET_MASK) + tclass = t->dscp & SCTP_DSCP_VAL_MASK; if (INET_ECN_is_capable(tclass)) IP6_ECN_flow_xmit(sk, fl6->flowlabel); - if (!(transport->param_flags & SPP_PMTUD_ENABLE)) + if (!(t->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - rcu_read_lock(); - res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - tclass, sk->sk_priority); - rcu_read_unlock(); - return res; + if (!t->encap_port || !sctp_sk(sk)->udp_port) { + int res; + + skb_dst_set(skb, dst); + rcu_read_lock(); + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, + rcu_dereference(np->opt), + tclass, sk->sk_priority); + rcu_read_unlock(); + return res; + } + + if (skb_is_gso(skb)) + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + + skb->encapsulation = 1; + skb_reset_inner_mac_header(skb); + skb_reset_inner_transport_header(skb); + skb_set_inner_ipproto(skb, IPPROTO_SCTP); + label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); + + return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, + &fl6->daddr, tclass, ip6_dst_hoplimit(dst), + label, sctp_sk(sk)->udp_port, t->encap_port, false); } /* Returns the dst cache entry for the given source and destination ip @@ -1053,6 +1074,7 @@ static struct inet_protosw sctpv6_stream_protosw = { static int sctp6_rcv(struct sk_buff *skb) { + SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb) ? -1 : 0; } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 74847d613835..ce281a9a2875 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -27,7 +27,11 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; - gso_reset_checksum(skb, ~0); + /* csum and csum_start in GSO CB may be needed to do the UDP + * checksum when it's a UDP tunneling packet. + */ + SKB_GSO_CB(skb)->csum = (__force __wsum)~0; + SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } diff --git a/net/sctp/output.c b/net/sctp/output.c index 1441eaf460bb..6614c9fdc51e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -508,20 +508,14 @@ merge: sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; - rcu_read_lock(); - if (skb_dst(head) != tp->dst) { - dst_hold(tp->dst); - sk_setup_caps(sk, tp->dst); - } - rcu_read_unlock(); goto chksum; } if (sctp_checksum_disable) return 1; - if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) || - dst_xfrm(skb_dst(head)) || packet->ipfragok) { + if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); @@ -548,7 +542,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; - struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; @@ -585,13 +578,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->checksum = 0; /* drop packet if no dst */ - dst = dst_clone(tp->dst); - if (!dst) { + if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } - skb_dst_set(head, dst); + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 25833238fe93..6f2bbfeec3a4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -44,6 +44,7 @@ #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> +#include <net/udp_tunnel.h> #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) @@ -840,6 +841,92 @@ static int sctp_ctl_sock_init(struct net *net) return 0; } +static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) +{ + SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source; + + skb_set_transport_header(skb, sizeof(struct udphdr)); + sctp_rcv(skb); + return 0; +} + +static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + int family; + + skb->transport_header += sizeof(struct udphdr); + family = (ip_hdr(skb)->version == 4) ? AF_INET : AF_INET6; + sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb), + &asoc, &t); + if (!sk) + return -ENOENT; + + sctp_err_finish(sk, t); + return 0; +} + +int sctp_udp_sock_start(struct net *net) +{ + struct udp_tunnel_sock_cfg tuncfg = {NULL}; + struct udp_port_cfg udp_conf = {0}; + struct socket *sock; + int err; + + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.local_udp_port = htons(net->sctp.udp_port); + err = udp_sock_create(net, &udp_conf, &sock); + if (err) { + pr_err("Failed to create the SCTP UDP tunneling v4 sock\n"); + return err; + } + + tuncfg.encap_type = 1; + tuncfg.encap_rcv = sctp_udp_rcv; + tuncfg.encap_err_lookup = sctp_udp_err_lookup; + setup_udp_tunnel_sock(net, sock, &tuncfg); + net->sctp.udp4_sock = sock->sk; + +#if IS_ENABLED(CONFIG_IPV6) + memset(&udp_conf, 0, sizeof(udp_conf)); + + udp_conf.family = AF_INET6; + udp_conf.local_ip6 = in6addr_any; + udp_conf.local_udp_port = htons(net->sctp.udp_port); + udp_conf.use_udp6_rx_checksums = true; + udp_conf.ipv6_v6only = true; + err = udp_sock_create(net, &udp_conf, &sock); + if (err) { + pr_err("Failed to create the SCTP UDP tunneling v6 sock\n"); + udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); + net->sctp.udp4_sock = NULL; + return err; + } + + tuncfg.encap_type = 1; + tuncfg.encap_rcv = sctp_udp_rcv; + tuncfg.encap_err_lookup = sctp_udp_err_lookup; + setup_udp_tunnel_sock(net, sock, &tuncfg); + net->sctp.udp6_sock = sock->sk; +#endif + + return 0; +} + +void sctp_udp_sock_stop(struct net *net) +{ + if (net->sctp.udp4_sock) { + udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); + net->sctp.udp4_sock = NULL; + } + if (net->sctp.udp6_sock) { + udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket); + net->sctp.udp6_sock = NULL; + } +} + /* Register address family specific functions. */ int sctp_register_af(struct sctp_af *af) { @@ -971,25 +1058,44 @@ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, } /* Wrapper routine that calls the ip transmit routine. */ -static inline int sctp_v4_xmit(struct sk_buff *skb, - struct sctp_transport *transport) +static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) { - struct inet_sock *inet = inet_sk(skb->sk); + struct dst_entry *dst = dst_clone(t->dst); + struct flowi4 *fl4 = &t->fl.u.ip4; + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); __u8 dscp = inet->tos; + __be16 df = 0; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, - &transport->fl.u.ip4.daddr); + skb->len, &fl4->saddr, &fl4->daddr); - if (transport->dscp & SCTP_DSCP_SET_MASK) - dscp = transport->dscp & SCTP_DSCP_VAL_MASK; + if (t->dscp & SCTP_DSCP_SET_MASK) + dscp = t->dscp & SCTP_DSCP_VAL_MASK; + + inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO + : IP_PMTUDISC_DONT; + SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? - IP_PMTUDISC_DO : IP_PMTUDISC_DONT; + if (!t->encap_port || !sctp_sk(sk)->udp_port) { + skb_dst_set(skb, dst); + return __ip_queue_xmit(sk, skb, &t->fl, dscp); + } - SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); + if (skb_is_gso(skb)) + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; - return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); + if (ip_dont_fragment(sk, dst) && !skb->ignore_df) + df = htons(IP_DF); + + skb->encapsulation = 1; + skb_reset_inner_mac_header(skb); + skb_reset_inner_transport_header(skb); + skb_set_inner_ipproto(skb, IPPROTO_SCTP); + udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, + sctp_sk(sk)->udp_port, t->encap_port, false, false); + return 0; } static struct sctp_af sctp_af_inet; @@ -1054,9 +1160,15 @@ static struct inet_protosw sctp_stream_protosw = { .flags = SCTP_PROTOSW_FLAG }; +static int sctp4_rcv(struct sk_buff *skb) +{ + SCTP_INPUT_CB(skb)->encap_port = 0; + return sctp_rcv(skb); +} + /* Register with IP layer. */ static const struct net_protocol sctp_protocol = { - .handler = sctp_rcv, + .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, .netns_ok = 1, @@ -1271,6 +1383,12 @@ static int __net_init sctp_defaults_init(struct net *net) /* Enable ECN by default. */ net->sctp.ecn_enable = 1; + /* Set UDP tunneling listening port to 0 by default */ + net->sctp.udp_port = 0; + + /* Set remote encap port to 0 by default */ + net->sctp.encap_port = 0; + /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9a56ae2f3651..f77484df097b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1142,6 +1142,26 @@ nodata: return retval; } +struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_new_encap_port_hdr nep; + struct sctp_chunk *retval; + + retval = sctp_make_abort(asoc, chunk, + sizeof(struct sctp_errhdr) + sizeof(nep)); + if (!retval) + goto nodata; + + sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); + nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + nep.new_port = chunk->transport->encap_port; + sctp_addto_chunk(retval, sizeof(nep), &nep); + +nodata: + return retval; +} + /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport) @@ -2321,6 +2341,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ + asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index aa821e71f05e..0948f14ce221 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t) /* Try again later. */ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(transport); goto out_unlock; } @@ -435,7 +435,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t) out_unlock: bh_unlock_sock(sk); - sctp_association_put(asoc); + sctp_transport_put(transport); } /* Handle the timeout of the RE-CONFIG timer. */ @@ -1601,12 +1601,12 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, break; case SCTP_CMD_INIT_FAILED: - sctp_cmd_init_failed(commands, asoc, cmd->obj.u32); + sctp_cmd_init_failed(commands, asoc, cmd->obj.u16); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, - subtype, chunk, cmd->obj.u32); + subtype, chunk, cmd->obj.u16); break; case SCTP_CMD_INIT_COUNTER_INC: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c669f8bd1eab..af2b7041fa4e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -87,6 +87,13 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8( const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_new_encap_port( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); static enum sctp_disposition sctp_stop_t1_and_abort( @@ -1493,6 +1500,10 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); + + if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port) + return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands); + /* Grab the INIT header. */ chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data; @@ -3392,6 +3403,45 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8( sctp_packet_append_chunk(packet, abort); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet)); + + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + + sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; +} + +/* Handling of SCTP Packets Containing an INIT Chunk Matching an + * Existing Associations when the UDP encap port is incorrect. + * + * From Section 4 at draft-tuexen-tsvwg-sctp-udp-encaps-cons-03. + */ +static enum sctp_disposition sctp_sf_new_encap_port( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; + + abort = sctp_make_new_encap_port(asoc, chunk); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet)); @@ -6268,6 +6318,8 @@ static struct sctp_packet *sctp_ootb_pkt_new( if (!transport) goto nomem; + transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + /* Cache a route for the transport with the chunk's destination as * the source address. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 53d0a4161df3..a710917c5ac7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4417,6 +4417,55 @@ out: return retval; } +static int sctp_setsockopt_encap_port(struct sock *sk, + struct sctp_udpencaps *encap, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + __be16 encap_port; + + if (optlen != sizeof(*encap)) + return -EINVAL; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + encap_port = (__force __be16)encap->sue_port; + if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) { + t = sctp_addr_id2transport(sk, &encap->sue_address, + encap->sue_assoc_id); + if (!t) + return -EINVAL; + + t->encap_port = encap_port; + return 0; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, encap->sue_assoc_id); + if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + return -EINVAL; + + /* If changes are for association, also apply encap_port to + * each transport. + */ + if (asoc) { + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) + t->encap_port = encap_port; + + return 0; + } + + sctp_sk(sk)->encap_port = encap_port; + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4636,6 +4685,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; + case SCTP_REMOTE_UDP_ENCAPS_PORT: + retval = sctp_setsockopt_encap_port(sk, kopt, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4876,6 +4928,8 @@ static int sctp_init_sock(struct sock *sk) * be modified via SCTP_PEER_ADDR_PARAMS */ sp->hbinterval = net->sctp.hb_interval; + sp->udp_port = htons(net->sctp.udp_port); + sp->encap_port = htons(net->sctp.encap_port); sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; sp->ps_retrans = net->sctp.ps_retrans; @@ -7790,6 +7844,65 @@ out: return retval; } +static int sctp_getsockopt_encap_port(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_association *asoc; + struct sctp_udpencaps encap; + struct sctp_transport *t; + __be16 encap_port; + + if (len < sizeof(encap)) + return -EINVAL; + + len = sizeof(encap); + if (copy_from_user(&encap, optval, len)) + return -EFAULT; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) { + t = sctp_addr_id2transport(sk, &encap.sue_address, + encap.sue_assoc_id); + if (!t) { + pr_debug("%s: failed no transport\n", __func__); + return -EINVAL; + } + + encap_port = t->encap_port; + goto out; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, encap.sue_assoc_id); + if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + pr_debug("%s: failed no association\n", __func__); + return -EINVAL; + } + + if (asoc) { + encap_port = asoc->encap_port; + goto out; + } + + encap_port = sctp_sk(sk)->encap_port; + +out: + encap.sue_port = (__force uint16_t)encap_port; + if (copy_to_user(optval, &encap, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8010,6 +8123,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); break; + case SCTP_REMOTE_UDP_ENCAPS_PORT: + retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index c16c80963e55..e92df779af73 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -36,6 +36,7 @@ static int rto_alpha_max = 1000; static int rto_beta_max = 1000; static int pf_expose_max = SCTP_PF_EXPOSE_MAX; static int ps_retrans_max = SCTP_PS_RETRANS_MAX; +static int udp_port_max = 65535; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -48,6 +49,8 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, @@ -291,6 +294,24 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "udp_port", + .data = &init_net.sctp.udp_port, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_sctp_do_udp_port, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_port_max, + }, + { + .procname = "encap_port", + .data = &init_net.sctp.encap_port, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_port_max, + }, + { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), @@ -477,6 +498,47 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + unsigned int min = *(unsigned int *)ctl->extra1; + unsigned int max = *(unsigned int *)ctl->extra2; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.udp_port; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + struct sock *sk = net->sctp.ctl_sock; + + if (new_value > max || new_value < min) + return -EINVAL; + + net->sctp.udp_port = new_value; + sctp_udp_sock_stop(net); + if (new_value) { + ret = sctp_udp_sock_start(net); + if (ret) + net->sctp.udp_port = 0; + } + + /* Update the value in the control socket */ + lock_sock(sk); + sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); + release_sock(sk); + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 806af58f4375..bf0ac467e757 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -8,7 +8,7 @@ * * This file is part of the SCTP kernel implementation * - * This module provides the abstraction for an SCTP tranport representing + * This module provides the abstraction for an SCTP transport representing * a remote transport address. For local transport addresses, we just use * union sctp_addr. * @@ -123,7 +123,7 @@ void sctp_transport_free(struct sctp_transport *transport) /* Delete the T3_rtx timer if it's active. * There is no point in not doing this now and letting * structure hang around in memory since we know - * the tranport is going away. + * the transport is going away. */ if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); @@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport *transport) /* Delete the ICMP proto unreachable timer if it's active. */ if (del_timer(&transport->proto_unreach_timer)) - sctp_association_put(transport->asoc); + sctp_transport_put(transport); sctp_transport_put(transport); } diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f37..77e54fe42b1c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 82be0bd0f6e8..47340b3b514f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -45,6 +45,7 @@ #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" @@ -552,8 +553,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, return smc_connect_fallback(smc, reason_code); } -/* abort connecting */ -static void smc_connect_abort(struct smc_sock *smc, int local_first) +static void smc_conn_abort(struct smc_sock *smc, int local_first) { if (local_first) smc_lgr_cleanup_early(&smc->conn); @@ -669,7 +669,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->smc_type_v1 = SMC_TYPE_N; } /* else RDMA is supported for this connection */ } - if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini)) + if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini)) ini->smc_type_v2 = SMC_TYPE_N; /* if neither ISM nor RDMA are supported, fallback */ @@ -814,7 +814,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; connect_abort: - smc_connect_abort(smc, ini->first_contact_local); + smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; @@ -893,7 +893,7 @@ static int smc_connect_ism(struct smc_sock *smc, return 0; connect_abort: - smc_connect_abort(smc, ini->first_contact_local); + smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; @@ -921,7 +921,7 @@ static int smc_connect_check_aclc(struct smc_init_info *ini, /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { - u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; + u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct smc_clc_msg_accept_confirm_v2 *aclc2; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; @@ -946,9 +946,9 @@ static int __smc_connect(struct smc_sock *smc) version); ini->smcd_version = SMC_V1; - ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0; + ini->smcd_version |= smc_ism_is_v2_capable() ? SMC_V2 : 0; ini->smc_type_v1 = SMC_TYPE_B; - ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N; + ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { @@ -979,7 +979,8 @@ static int __smc_connect(struct smc_sock *smc) /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); - version = aclc->hdr.version == SMC_V1 ? SMC_V1 : version; + version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; + ini->smcd_version = version; if (rc) goto vlan_cleanup; @@ -1317,13 +1318,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc) /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, - struct smc_init_info *ini, u8 version) + int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ - if (ini->first_contact_local) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); + smc_conn_abort(new_smc, local_first); if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; @@ -1346,6 +1344,7 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, { struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; struct smc_clc_v2_extension *pclc_v2_ext; + int rc = SMC_CLC_DECL_PEERNOSMC; ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; @@ -1353,29 +1352,30 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, if (pclc->hdr.version > SMC_V1) ini->smcd_version |= ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0; - if (!smc_ism_v2_capable) { + if (!(ini->smcd_version & SMC_V2)) { + rc = SMC_CLC_DECL_PEERNOSMC; + goto out; + } + if (!smc_ism_is_v2_capable()) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; goto out; } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); - if (!pclc_smcd_v2_ext) + if (!pclc_smcd_v2_ext) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } out: - if (!ini->smcd_version) { - if (pclc->hdr.typev1 == SMC_TYPE_B || - pclc->hdr.typev2 == SMC_TYPE_B) - return SMC_CLC_DECL_NOSMCDEV; - if (pclc->hdr.typev1 == SMC_TYPE_D || - pclc->hdr.typev2 == SMC_TYPE_D) - return SMC_CLC_DECL_NOSMCDDEV; - return SMC_CLC_DECL_NOSMCRDEV; - } + if (!ini->smcd_version) + return rc; return 0; } @@ -1427,10 +1427,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { - if (ini->first_contact_local) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); + smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; } @@ -1473,6 +1470,12 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, } } +static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +{ + if (!ini->rc) + ini->rc = rc; +} + static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) @@ -1483,7 +1486,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, unsigned int matches = 0; u8 smcd_version; u8 *eid = NULL; - int i; + int i, rc; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; @@ -1492,8 +1495,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); if (!smcd_v2_ext || - !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */ + !smc_v2_ext->hdr.flag.seid) { /* no system EID support for SMCD */ + smc_find_ism_store_rc(SMC_CLC_DECL_NOSEID, ini); goto not_found; + } mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) @@ -1525,9 +1530,12 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, ini->smcd_version = SMC_V2; ini->is_smcd = true; ini->ism_selected = i; - if (smc_listen_ism_init(new_smc, ini)) + rc = smc_listen_ism_init(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); /* try next active ISM device */ continue; + } return; /* matching and usable V2 ISM device found */ } /* no V2 ISM device could be initialized */ @@ -1544,19 +1552,23 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + int rc = 0; /* check if ISM V1 is available */ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); - if (smc_find_ism_device(new_smc, ini)) + rc = smc_find_ism_device(new_smc, ini); + if (rc) goto not_found; ini->ism_selected = 0; - if (!smc_listen_ism_init(new_smc, ini)) + rc = smc_listen_ism_init(new_smc, ini); + if (!rc) return; /* V1 ISM device found */ not_found: + smc_find_ism_store_rc(rc, ini); ini->ism_dev[0] = NULL; ini->is_smcd = false; } @@ -1613,16 +1625,16 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return 0; if (!(ini->smcd_version & SMC_V1)) - return SMC_CLC_DECL_NOSMCDEV; + return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV; /* check for matching IP prefix and subnet length */ rc = smc_listen_prfx_check(new_smc, pclc); if (rc) - return rc; + return ini->rc ?: rc; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) - return SMC_CLC_DECL_GETVLANERR; + return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ smc_find_ism_v1_device_serv(new_smc, pclc, ini); @@ -1630,10 +1642,14 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return 0; if (pclc->hdr.typev1 == SMC_TYPE_D) - return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */ + /* skip RDMA and decline */ + return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA is available */ - return smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + + return (!rc) ? 0 : ini->rc; } /* listen worker: finish RDMA setup */ @@ -1666,7 +1682,7 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); - u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; + u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; @@ -1768,7 +1784,8 @@ static void smc_listen_work(struct work_struct *work) out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: - smc_listen_decline(new_smc, rc, ini, version); + smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, + version); out_free: kfree(ini); kfree(buf); @@ -2479,10 +2496,14 @@ static int __init smc_init(void) smc_ism_init(); smc_clc_init(); - rc = smc_pnet_init(); + rc = smc_nl_init(); if (rc) goto out_pernet_subsys; + rc = smc_pnet_init(); + if (rc) + goto out_nl; + rc = -ENOMEM; smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) @@ -2553,6 +2574,8 @@ out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); out_pnet: smc_pnet_exit(); +out_nl: + smc_nl_exit(); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); @@ -2570,6 +2593,7 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); + smc_nl_exit(); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b1ce6ccbfaec..f23f558054a7 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -389,9 +389,9 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) * Context: * - tasklet context */ -static void smcd_cdc_rx_tsklet(unsigned long data) +static void smcd_cdc_rx_tsklet(struct tasklet_struct *t) { - struct smc_connection *conn = (struct smc_connection *)data; + struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet); struct smcd_cdc_msg *data_cdc; struct smcd_cdc_msg cdc; struct smc_sock *smc; @@ -411,7 +411,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) */ void smcd_cdc_rx_init(struct smc_connection *conn) { - tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); + tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet); } /***************************** init, exit, misc ******************************/ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 696d89c2dce4..e286dafd6e88 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -772,6 +772,11 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, return len > 0 ? 0 : len; } +void smc_clc_get_hostname(u8 **host) +{ + *host = &smc_hostname[0]; +} + void __init smc_clc_init(void) { struct new_utsname *u; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index b3f46ab79e47..32d37f7b70f2 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -37,6 +37,11 @@ #define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ #define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ #define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */ +#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */ +#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ +#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ +#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -124,7 +129,7 @@ struct smc_clc_v2_extension { struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ u8 reserved[16]; - u8 user_eids[0][SMC_MAX_EID_LEN]; + u8 user_eids[][SMC_MAX_EID_LEN]; }; struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ @@ -143,7 +148,7 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ struct smc_clc_smcd_v2_extension { u8 system_eid[SMC_MAX_EID_LEN]; u8 reserved[16]; - struct smc_clc_smcd_gid_chid gidchid[0]; + struct smc_clc_smcd_gid_chid gidchid[]; }; struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ @@ -329,5 +334,6 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version); void smc_clc_init(void) __init; +void smc_clc_get_hostname(u8 **host); #endif diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d790c43c473f..59342b519e34 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -16,6 +16,8 @@ #include <linux/wait.h> #include <linux/reboot.h> #include <linux/mutex.h> +#include <linux/list.h> +#include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -30,12 +32,13 @@ #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" +#include "smc_netlink.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -static struct smc_lgr_list smc_lgr_list = { /* established link groups */ +struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, @@ -63,6 +66,16 @@ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, return &smc_lgr_list.list; } +static void smc_ibdev_cnt_inc(struct smc_link *lnk) +{ + atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + +static void smc_ibdev_cnt_dec(struct smc_link *lnk) +{ + atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group @@ -139,6 +152,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) } if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; + atomic_inc(&conn->lnk->conn_cnt); return 0; } @@ -180,6 +194,8 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); + if (conn->lnk) + atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ @@ -201,6 +217,361 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) conn->lgr = NULL; } +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char hostname[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_seid[SMC_MAX_EID_LEN + 1]; + struct smcd_dev *smcd_dev; + struct nlattr *attrs; + u8 *seid = NULL; + u8 *host = NULL; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_SYS_INFO); + if (!nlh) + goto errmsg; + if (cb_ctx->pos[0]) + goto errout; + attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) + goto errattr; + smc_clc_get_hostname(&host); + if (host) { + snprintf(hostname, sizeof(hostname), "%s", host); + if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) + goto errattr; + } + mutex_lock(&smcd_dev_list.mutex); + smcd_dev = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (smcd_dev) + smc_ism_get_system_eid(smcd_dev, &seid); + mutex_unlock(&smcd_dev_list.mutex); + if (seid && smc_ism_is_v2_capable()) { + snprintf(smc_seid, sizeof(smc_seid), "%s", seid); + if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_fill_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_target[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *attrs; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) + goto errattr; + snprintf(smc_target, sizeof(smc_target), "%s", lgr->pnet_id); + if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, + struct smc_link *link, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + u8 smc_gid_target[41]; + struct nlattr *attrs; + u32 link_uid = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LINK_SMCR); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, + atomic_read(&link->conn_cnt))) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) + goto errattr; + snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); + if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) + goto errattr; + memcpy(&link_uid, link->link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) + goto errattr; + memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->gid); + if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->peer_gid); + if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCR); + if (!nlh) + goto errmsg; + if (smc_nl_fill_lgr(lgr, skb, cb)) + goto errout; + + genlmsg_end(skb, nlh); + if (!list_links) + goto out; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) + goto errout; + } +out: + return 0; + +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[0]; + int num = 0; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&smc_lgr->lock); + cb_ctx->pos[0] = num; +} + +static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + struct nlattr *v2_attrs; + struct nlattr *attrs; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCD); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) + goto errattr; + snprintf(smc_pnet, sizeof(smc_pnet), "%s", lgr->smcd->pnetid); + if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) + goto errattr; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + snprintf(smc_host, sizeof(smc_host), "%s", lgr->peer_hostname); + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + snprintf(smc_eid, sizeof(smc_eid), "%s", lgr->negotiated_eid); + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[1]; + int rc = 0, num = 0; + + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry(lgr, &dev->lgr_list, list) { + if (!lgr->is_smcd) + continue; + if (num < snum) + goto next; + rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&dev->lgr_lock); + cb_ctx->pos[1] = num; + return rc; +} + +static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smcd_dev *smcd_dev; + int snum = cb_ctx->pos[0]; + int rc = 0, num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd_dev, &dev_list->list, list) { + if (list_empty(&smcd_dev->lgr_list)) + continue; + if (num < snum) + goto next; + rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; + return rc; +} + +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = false; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = true; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; @@ -300,6 +671,15 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) return link_id; } +static void smcr_copy_dev_info_to_link(struct smc_link *link) +{ + struct smc_ib_device *smcibdev = link->smcibdev; + + snprintf(link->ibname, sizeof(link->ibname), "%s", + smcibdev->ibdev->name); + link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; +} + int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { @@ -313,7 +693,10 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + smc_ibdev_cnt_inc(lnk); + smcr_copy_dev_info_to_link(lnk); lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { @@ -355,6 +738,7 @@ free_link_mem: clear_llc_lnk: smc_llc_link_clear(lnk, false); out: + smc_ibdev_cnt_dec(lnk); put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; @@ -526,6 +910,14 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, return rc; } +static void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) +{ + atomic_dec(&conn->lnk->conn_cnt); + conn->lnk = to_lnk; + atomic_inc(&conn->lnk->conn_cnt); +} + struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { @@ -574,7 +966,7 @@ again: smc->sk.sk_state == SMC_PEERABORTWAIT || smc->sk.sk_state == SMC_PROCESSABORT) { spin_lock_bh(&conn->send_lock); - conn->lnk = to_lnk; + smc_switch_link_and_count(conn, to_lnk); spin_unlock_bh(&conn->send_lock); continue; } @@ -588,7 +980,7 @@ again: } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); - conn->lnk = to_lnk; + smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); @@ -737,6 +1129,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); @@ -1309,7 +1702,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == ini->vlan_id && + (ini->smcd_version == SMC_V2 || + lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ @@ -1615,8 +2009,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : - ERR_PTR(-EIO); + if (rc == -ENOMEM) + return ERR_PTR(-EAGAIN); + if (rc == -ENOSPC) + return ERR_PTR(-ENOSPC); + return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f1e867ce2e63..e8e448771f85 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,7 +13,10 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/smc.h> +#include <linux/pci.h> #include <rdma/ib_verbs.h> +#include <net/genetlink.h> #include "smc.h" #include "smc_ib.h" @@ -124,11 +127,14 @@ struct smc_link { u8 link_is_asym; /* is link asymmetric? */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ + char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ + int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ + atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -301,6 +307,7 @@ struct smc_init_info { u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; + u32 rc; /* SMC-R */ struct smc_clc_msg_local *ib_lcl; struct smc_ib_device *ib_dev; @@ -362,6 +369,45 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +struct smc_pci_dev { + __u32 pci_fid; + __u16 pci_pchid; + __u16 pci_vendor; + __u16 pci_device; + __u8 pci_id[SMC_PCI_ID_STR_LEN]; +}; + +static inline void smc_set_pci_values(struct pci_dev *pci_dev, + struct smc_pci_dev *smc_dev) +{ + smc_dev->pci_vendor = pci_dev->vendor; + smc_dev->pci_device = pci_dev->device; + snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", + pci_name(pci_dev)); +#if IS_ENABLED(CONFIG_S390) + { /* Set s390 specific PCI information */ + struct zpci_dev *zdev; + + zdev = to_zpci(pci_dev); + smc_dev->pci_fid = zdev->fid; + smc_dev->pci_pchid = zdev->pchid; + } +#endif +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; @@ -409,6 +455,10 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index f15fca59b4b2..c952986a6aca 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -31,19 +31,6 @@ static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) return (struct smc_diag_dump_ctx *)cb->ctx; } -static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) -{ - sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", - be16_to_cpu(((__be16 *)gid_raw)[0]), - be16_to_cpu(((__be16 *)gid_raw)[1]), - be16_to_cpu(((__be16 *)gid_raw)[2]), - be16_to_cpu(((__be16 *)gid_raw)[3]), - be16_to_cpu(((__be16 *)gid_raw)[4]), - be16_to_cpu(((__be16 *)gid_raw)[5]), - be16_to_cpu(((__be16 *)gid_raw)[6]), - be16_to_cpu(((__be16 *)gid_raw)[7])); -} - static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); @@ -160,17 +147,17 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, - .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, - .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + .lnk[0].ibport = smc->conn.lnk->ibport, + .lnk[0].link_id = smc->conn.lnk->link_id, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, - sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + sizeof(smc->conn.lnk->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lgr->lnk[0].gid); + smc->conn.lnk->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, - smc->conn.lgr->lnk[0].peer_gid); + smc->conn.lnk->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1c314dbdc7fa..89ea10675a7d 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -25,6 +25,7 @@ #include "smc_core.h" #include "smc_wr.h" #include "smc.h" +#include "smc_netlink.h" #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ @@ -198,9 +199,9 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (!IS_ERR(ndev) && - ((!vlan_id && !is_vlan_dev(attr->ndev)) || - (vlan_id && is_vlan_dev(attr->ndev) && - vlan_dev_vlan_id(attr->ndev) == vlan_id)) && + ((!vlan_id && !is_vlan_dev(ndev)) || + (vlan_id && is_vlan_dev(ndev) && + vlan_dev_vlan_id(ndev) == vlan_id)) && attr->gid_type == IB_GID_TYPE_ROCE) { rcu_read_unlock(); if (gid) @@ -326,6 +327,161 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) return rc; } +static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, + struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr; + bool rc = false; + int i; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (lgr->is_smcd) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (lgr->type == SMC_LGR_SINGLE || + lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { + rc = true; + goto out; + } + } + } +out: + spin_unlock_bh(&smc_lgr->lock); + return rc; +} + +static int smc_nl_handle_dev_port(struct sk_buff *skb, + struct ib_device *ibdev, + struct smc_ib_device *smcibdev, + int port) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *port_attrs; + unsigned char port_state; + int lnk_count = 0; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); + if (!port_attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, + smcibdev->pnetid_by_user[port])) + goto errattr; + snprintf(smc_pnet, sizeof(smc_pnet), "%s", + (char *)&smcibdev->pnetid[port]); + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, + smcibdev->ndev_ifidx[port])) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) + goto errattr; + port_state = smc_ib_port_active(smcibdev, port + 1); + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) + goto errattr; + lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) + goto errattr; + nla_nest_end(skb, port_attrs); + return 0; +errattr: + nla_nest_cancel(skb, port_attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + struct smc_pci_dev smc_pci_dev; + struct pci_dev *pci_dev; + unsigned char is_crit; + struct nlattr *attrs; + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCR); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); + if (!attrs) + goto errout; + is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) + goto errattr; + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) + goto errattr; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) + goto errattr; + snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); + if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) + goto errattr; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(smcibdev->ibdev, i)) + continue; + if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, + smcibdev, i - 1)) + goto errattr; + } + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_ib_device *smcibdev; + int snum = cb_ctx->pos[0]; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcibdev, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); + return skb->len; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -557,6 +713,49 @@ out: static struct ib_client smc_ib_client; +static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) +{ + struct ib_device *ibdev = smcibdev->ibdev; + struct net_device *ndev; + + if (!ibdev->ops.get_netdev) + return; + ndev = ibdev->ops.get_netdev(ibdev, port + 1); + if (ndev) { + smcibdev->ndev_ifidx[port] = ndev->ifindex; + dev_put(ndev); + } +} + +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) +{ + struct smc_ib_device *smcibdev; + struct ib_device *libdev; + struct net_device *lndev; + u8 port_cnt; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { + libdev = smcibdev->ibdev; + if (!libdev->ops.get_netdev) + continue; + lndev = libdev->ops.get_netdev(libdev, i + 1); + if (lndev) + dev_put(lndev); + if (lndev != ndev) + continue; + if (event == NETDEV_REGISTER) + smcibdev->ndev_ifidx[i] = ndev->ifindex; + if (event == NETDEV_UNREGISTER) + smcibdev->ndev_ifidx[i] = 0; + } + } + mutex_unlock(&smc_ib_devices.mutex); +} + /* callback function for ib_register_client() */ static int smc_ib_add_dev(struct ib_device *ibdev) { @@ -596,6 +795,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev) if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); + smc_copy_netdev_ifindex(smcibdev, i); pr_warn_ratelimited("smc: ib device %s port %d has pnetid " "%.16s%s\n", smcibdev->ibdev->name, i + 1, diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 2ce481187dd0..3085f5180da7 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -30,6 +30,7 @@ struct smc_ib_devices { /* list of smc ib devices definition */ }; extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ +extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; @@ -53,11 +54,15 @@ struct smc_ib_device { /* ib-device infos for smc */ atomic_t lnk_cnt; /* number of links on ibdev */ wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ struct mutex mutex; /* protect dev setup+cleanup */ + atomic_t lnk_cnt_by_port[SMC_MAX_PORTS]; + /* number of links per port */ + int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ }; struct smc_buf_desc; struct smc_link; +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event); int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); @@ -87,4 +92,5 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index); bool smc_ib_is_valid_local_systemid(void); +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6abbdd09a580..524ef64a191a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,13 +15,14 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; -bool smc_ism_v2_capable; +static bool smc_ism_v2_capable; /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) @@ -51,6 +52,12 @@ u16 smc_ism_get_chid(struct smcd_dev *smcd) return smcd->ops->get_chid(smcd); } +/* HW supports ISM V2 and thus System EID is defined */ +bool smc_ism_is_v2_capable(void) +{ + return smc_ism_v2_capable; +} + /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { @@ -201,6 +208,96 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, return rc; } +static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smc_pci_dev smc_pci_dev; + struct nlattr *port_attrs; + struct nlattr *attrs; + int use_cnt = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCD); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); + if (!attrs) + goto errout; + use_cnt = atomic_read(&smcd->lgr_cnt); + if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) + goto errattr; + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) + goto errattr; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) + goto errattr; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); + if (!port_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) + goto errportattr; + snprintf(smc_pnet, sizeof(smc_pnet), "%s", smcd->pnetid); + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errportattr; + + nla_nest_end(skb, port_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errportattr: + nla_nest_cancel(skb, port_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + nlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int snum = cb_ctx->pos[0]; + struct smcd_dev *smcd; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcd_dev(smcd, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 8048e09ddcf8..113efc7352ed 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -10,6 +10,7 @@ #define SMCD_ISM_H #include <linux/uio.h> +#include <linux/types.h> #include <linux/mutex.h> #include "smc.h" @@ -20,9 +21,6 @@ struct smcd_dev_list { /* List of SMCD devices */ }; extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ -extern bool smc_ism_v2_capable; /* HW supports ISM V2 and thus - * System EID is defined - */ struct smc_ism_vlanid { /* VLAN id set on ISM device */ struct list_head list; @@ -52,5 +50,7 @@ int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); +bool smc_ism_is_v2_capable(void); void smc_ism_init(void); +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c new file mode 100644 index 000000000000..140419a19dbf --- /dev/null +++ b/net/smc/smc_netlink.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to interact with SMC module + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/if.h> +#include <linux/smc.h> + +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_ib.h" +#include "smc_netlink.h" + +#define SMC_CMD_MAX_ATTR 1 + +/* SMC_GENL generic netlink operation definition */ +static const struct genl_ops smc_gen_nl_ops[] = { + { + .cmd = SMC_NETLINK_GET_SYS_INFO, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_sys_info, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_LINK_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_link, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_device, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_device, + }, +}; + +static const struct nla_policy smc_gen_nl_policy[2] = { + [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, +}; + +/* SMC_GENL family definition */ +struct genl_family smc_gen_nl_family __ro_after_init = { + .hdrsize = 0, + .name = SMC_GENL_FAMILY_NAME, + .version = SMC_GENL_FAMILY_VERSION, + .maxattr = SMC_CMD_MAX_ATTR, + .policy = smc_gen_nl_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_gen_nl_ops, + .n_ops = ARRAY_SIZE(smc_gen_nl_ops) +}; + +int __init smc_nl_init(void) +{ + return genl_register_family(&smc_gen_nl_family); +} + +void smc_nl_exit(void) +{ + genl_unregister_family(&smc_gen_nl_family); +} diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h new file mode 100644 index 000000000000..3477265cba6c --- /dev/null +++ b/net/smc/smc_netlink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC Generic netlink operations + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#ifndef _SMC_NETLINK_H +#define _SMC_NETLINK_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +extern struct genl_family smc_gen_nl_family; + +struct smc_nl_dmp_ctx { + int pos[2]; +}; + +static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) +{ + return (struct smc_nl_dmp_ctx *)c->ctx; +} + +int smc_nl_init(void) __init; +void smc_nl_exit(void); + +#endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index f3c18b991d35..6f6d33edb135 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -827,9 +827,11 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1e23cdd41eb1..cbc73a7e4d59 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -131,9 +131,9 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(unsigned long data) +static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -435,9 +435,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) } } -static void smc_wr_rx_tasklet_fn(unsigned long data) +static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -698,10 +698,8 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn, - (unsigned long)smcibdev); - tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn, - (unsigned long)smcibdev); + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..bfef11ba35b8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -52,6 +52,7 @@ * Based upon Swansea University Computer Society NET3.039 */ +#include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> @@ -64,7 +65,6 @@ #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> -#include <linux/if_frad.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> @@ -1027,17 +1027,6 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) } EXPORT_SYMBOL(vlan_ioctl_set); -static DEFINE_MUTEX(dlci_ioctl_mutex); -static int (*dlci_ioctl_hook) (unsigned int, void __user *); - -void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) -{ - mutex_lock(&dlci_ioctl_mutex); - dlci_ioctl_hook = hook; - mutex_unlock(&dlci_ioctl_mutex); -} -EXPORT_SYMBOL(dlci_ioctl_set); - static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { @@ -1156,17 +1145,6 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; - case SIOCADDDLCI: - case SIOCDELDLCI: - err = -ENOPKG; - if (!dlci_ioctl_hook) - request_module("dlci"); - - mutex_lock(&dlci_ioctl_mutex); - if (dlci_ioctl_hook) - err = dlci_ioctl_hook(cmd, argp); - mutex_unlock(&dlci_ioctl_mutex); - break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -3427,8 +3405,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: - case SIOCADDDLCI: - case SIOCDELDLCI: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc0ede928c..8241f5a4a01c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -781,7 +781,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, } /** - * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication + * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace + * communication * @parent: dentry of directory to create new "pipe" in * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index a18b36b5422d..3aad6ef18504 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -63,19 +63,20 @@ static int proc_do_xprt(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - size_t len; + ssize_t len; - if ((*ppos && !write) || !*lenp) { + if (write || *ppos) { *lenp = 0; return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - *lenp = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); - if (*lenp < 0) { + if (len < 0) { *lenp = 0; return -EINVAL; } + *lenp = len; return 0; } diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 0f1eaed1bd1b..abe29d1aa23a 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -55,12 +55,11 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); - u32 *tmp = (u32 *)id; memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); - tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; - pr_info("Own node identity %s, cluster identity %u\n", + tn->trial_addr = hash128to32(id); + pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } @@ -76,7 +75,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) } tn->trial_addr = addr; tn->addr_trial_end = jiffies; - pr_info("32-bit node address hash set to %x\n", addr); + pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 31bee0ea7b3e..1a11831bef62 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 650414110452..a4389ef08a98 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -72,6 +72,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, /** * tipc_media_find - locates specified media object by name + * @name: name to locate */ struct tipc_media *tipc_media_find(const char *name) { @@ -86,6 +87,7 @@ struct tipc_media *tipc_media_find(const char *name) /** * media_find_id - locates specified media object by type identifier + * @type: type identifier to locate */ static struct tipc_media *media_find_id(u8 type) { @@ -100,6 +102,9 @@ static struct tipc_media *media_find_id(u8 type) /** * tipc_media_addr_printf - record media address in print buffer + * @buf: output buffer + * @len: output buffer size remaining + * @a: input media address */ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) { @@ -127,7 +132,7 @@ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) * @name: ptr to bearer name string * @name_parts: ptr to area for bearer name components (or NULL if not needed) * - * Returns 1 if bearer name is valid, otherwise 0. + * Return: 1 if bearer name is valid, otherwise 0. */ static int bearer_name_validate(const char *name, struct tipc_bearer_names *name_parts) @@ -139,10 +144,7 @@ static int bearer_name_validate(const char *name, u32 if_len; /* copy bearer name & ensure length is OK */ - name_copy[TIPC_MAX_BEARER_NAME - 1] = 0; - /* need above in case non-Posix strncpy() doesn't pad with nulls */ - strncpy(name_copy, name, TIPC_MAX_BEARER_NAME); - if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0) + if (strscpy(name_copy, name, TIPC_MAX_BEARER_NAME) < 0) return 0; /* ensure all component parts of bearer name are present */ @@ -169,6 +171,8 @@ static int bearer_name_validate(const char *name, /** * tipc_bearer_find - locates bearer object with matching bearer name + * @net: the applicable net namespace + * @name: bearer name to locate */ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) { @@ -231,6 +235,11 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) /** * tipc_enable_bearer - enable bearer with the given name + * @net: the applicable net namespace + * @name: bearer name to enable + * @disc_domain: bearer domain + * @prio: bearer priority + * @attr: nlattr array */ static int tipc_enable_bearer(struct net *net, const char *name, u32 disc_domain, u32 prio, @@ -345,6 +354,8 @@ rejected: /** * tipc_reset_bearer - Reset all links established over this bearer + * @net: the applicable net namespace + * @b: the target bearer */ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) { @@ -366,7 +377,9 @@ void tipc_bearer_put(struct tipc_bearer *b) } /** - * bearer_disable + * bearer_disable - disable this bearer + * @net: the applicable net namespace + * @b: the bearer to disable * * Note: This routine assumes caller holds RTNL lock. */ @@ -437,6 +450,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, } /* tipc_disable_l2_media - detach TIPC bearer from an L2 interface + * @b: the target bearer * * Mark L2 bearer as inactive so that incoming buffers are thrown away */ @@ -453,6 +467,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) /** * tipc_l2_send_msg - send a TIPC packet out over an L2 interface + * @net: the associated network namespace * @skb: the packet to be sent * @b: the bearer through which the packet is to be sent * @dest: peer destination address diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index bc0023119da2..6bf4550aa1ac 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -93,7 +93,8 @@ struct tipc_bearer; * @raw2addr: convert from raw addr format to media addr format * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure - * @window: default window (in packets) before declaring link congestion + * @min_win: minimum window (in packets) before declaring link congestion + * @max_win: maximum window (in packets) before declaring link congestion * @mtu: max packet size bearer can support for media type not dependent on * underlying device MTU * @type_id: TIPC media identifier @@ -138,12 +139,15 @@ struct tipc_media { * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer - * @window: default window size for bearer + * @min_win: minimum window (in packets) before declaring link congestion + * @max_win: maximum window (in packets) before declaring link congestion * @tolerance: default link tolerance for bearer * @domain: network domain to which links can be established * @identity: array index of this bearer within TIPC bearer array - * @link_req: ptr to (optional) structure making periodic link setup requests + * @disc: ptr to link setup request * @net_plane: network plane ('A' through 'H') currently associated with bearer + * @up: bearer up flag (bit 0) + * @refcnt: tipc_bearer reference counter * * Note: media-specific code is responsible for initialization of the fields * indicated below when a bearer is enabled; TIPC's generic bearer code takes diff --git a/net/tipc/core.c b/net/tipc/core.c index c2ff42900b53..5cc1f0307215 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -81,8 +81,6 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; - INIT_LIST_HEAD(&tn->dist_queue); - err = tipc_bcast_init(net); if (err) goto out_bclink; diff --git a/net/tipc/core.h b/net/tipc/core.h index 1d57a4d3b05e..03de7b213f55 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -3,6 +3,7 @@ * * Copyright (c) 2005-2006, 2013-2018 Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -132,9 +133,6 @@ struct tipc_net { spinlock_t nametbl_lock; struct name_table *nametbl; - /* Name dist queue */ - struct list_head dist_queue; - /* Topology subscription server */ struct tipc_topsrv *topsrv; atomic_t subscription_count; @@ -213,6 +211,17 @@ static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; } +static inline u32 hash128to32(char *bytes) +{ + __be32 *tmp = (__be32 *)bytes; + u32 res; + + res = ntohl(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]); + if (likely(res)) + return res; + return ntohl(tmp[0] | tmp[1] | tmp[2] | tmp[3]); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 740ab9ae41a6..f4fca8f7f63f 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB @@ -51,7 +51,7 @@ #define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ -/** +/* * TIPC Key ids */ enum { @@ -63,7 +63,7 @@ enum { KEY_MAX = KEY_3, }; -/** +/* * TIPC Crypto statistics */ enum { @@ -90,7 +90,7 @@ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; /* Key exchange switch, default: on */ int sysctl_tipc_key_exchange_enabled __read_mostly = 1; -/** +/* * struct tipc_key - TIPC keys' status indicator * * 7 6 5 4 3 2 1 0 @@ -123,6 +123,8 @@ struct tipc_key { /** * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + * @tfm: cipher handle/key + * @list: linked list of TFMs */ struct tipc_tfm { struct crypto_aead *tfm; @@ -138,7 +140,7 @@ struct tipc_tfm { * @salt: the key's SALT value * @authsize: authentication tag size (max = 16) * @mode: crypto mode is applied to the key - * @hint[]: a hint for user key + * @hint: a hint for user key * @rcu: struct rcu_head * @key: the aead key * @gen: the key's generation @@ -166,6 +168,7 @@ struct tipc_aead { /** * struct tipc_crypto_stats - TIPC Crypto statistics + * @stat: array of crypto statistics */ struct tipc_crypto_stats { unsigned int stat[MAX_STATS]; @@ -194,6 +197,7 @@ struct tipc_crypto_stats { * @key_master: flag indicates if master key exists * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) * @nokey: no key indication + * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { @@ -324,6 +328,8 @@ do { \ /** * tipc_aead_key_validate - Validate a AEAD user key + * @ukey: pointer to user key data + * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { @@ -477,6 +483,7 @@ static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + * @aead: the AEAD key pointer */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { @@ -714,9 +721,9 @@ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, * @__dnode: TIPC dest node if "known" * * Return: - * 0 : if the encryption has completed - * -EINPROGRESS/-EBUSY : if a callback will be performed - * < 0 : the encryption has failed + * * 0 : if the encryption has completed + * * -EINPROGRESS/-EBUSY : if a callback will be performed + * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, @@ -870,9 +877,9 @@ static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) * @b: TIPC bearer where the message has been received * * Return: - * 0 : if the decryption has completed - * -EINPROGRESS/-EBUSY : if a callback will be performed - * < 0 : the decryption has failed + * * 0 : if the decryption has completed + * * -EINPROGRESS/-EBUSY : if a callback will be performed + * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) @@ -1001,7 +1008,7 @@ static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * - * Returns "true" if this is a valid encryption message, otherwise "false" + * Return: "true" if this is a valid encryption message, otherwise "false" */ bool tipc_ehdr_validate(struct sk_buff *skb) { @@ -1674,12 +1681,12 @@ static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, * Otherwise, the skb is freed! * * Return: - * 0 : the encryption has succeeded (or no encryption) - * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made - * -ENOKEK : the encryption has failed due to no key - * -EKEYREVOKED : the encryption has failed due to key revoked - * -ENOMEM : the encryption has failed due to no memory - * < 0 : the encryption has failed due to other reasons + * * 0 : the encryption has succeeded (or no encryption) + * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * * -ENOKEK : the encryption has failed due to no key + * * -EKEYREVOKED : the encryption has failed due to key revoked + * * -ENOMEM : the encryption has failed due to no memory + * * < 0 : the encryption has failed due to other reasons */ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_bearer *b, struct tipc_media_addr *dst, @@ -1799,12 +1806,12 @@ exit: * cluster key(s) can be taken for decryption (- recursive). * * Return: - * 0 : the decryption has successfully completed - * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made - * -ENOKEY : the decryption has failed due to no key - * -EBADMSG : the decryption has failed due to bad message - * -ENOMEM : the decryption has failed due to no memory - * < 0 : the decryption has failed due to other reasons + * * 0 : the decryption has successfully completed + * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * * -ENOKEY : the decryption has failed due to no key + * * -EBADMSG : the decryption has failed due to bad message + * * -ENOMEM : the decryption has failed due to no memory + * * < 0 : the decryption has failed due to other reasons */ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b) diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index e71193bd5e36..ce7d4cc8a9e0 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** +/* * net/tipc/crypto.h: Include file for TIPC crypto * * Copyright (c) 2019, Ericsson AB @@ -53,7 +53,7 @@ #define TIPC_AES_GCM_IV_SIZE 12 #define TIPC_AES_GCM_TAG_SIZE 16 -/** +/* * TIPC crypto modes: * - CLUSTER_KEY: * One single key is used for both TX & RX in all nodes in the cluster. @@ -69,7 +69,7 @@ enum { extern int sysctl_tipc_max_tfms __read_mostly; extern int sysctl_tipc_key_exchange_enabled __read_mostly; -/** +/* * TIPC encryption message format: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 diff --git a/net/tipc/discover.c b/net/tipc/discover.c index d4ecacddb40c..5380f605b851 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -74,6 +74,7 @@ struct tipc_discoverer { /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace + * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ @@ -341,7 +342,7 @@ exit: * @dest: destination address for request messages * @skb: pointer to created frame * - * Returns 0 if successful, otherwise -errno. + * Return: 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) @@ -380,7 +381,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, /** * tipc_disc_delete - destroy object sending periodic link setup requests - * @d: ptr to link duest structure + * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { diff --git a/net/tipc/group.c b/net/tipc/group.c index b1fcd2ad5ecf..3e137d8c9d2f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -2,6 +2,7 @@ * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -359,7 +360,7 @@ struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) return &grp->dests; } -void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, +void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; diff --git a/net/tipc/group.h b/net/tipc/group.h index 76b4e5a7b39d..ea4c3be64c78 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -2,6 +2,7 @@ * net/tipc/group.h: Include file for TIPC group unicast/multicast functions * * Copyright (c) 2017, Ericsson AB + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -50,7 +51,7 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp); void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance); struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); -void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, +void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope); u32 tipc_group_exclude(struct tipc_group *grp); void tipc_group_filter_msg(struct tipc_group *grp, diff --git a/net/tipc/link.c b/net/tipc/link.c index 06b880da2a8e..6ae2140eb4f7 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -120,6 +120,34 @@ struct tipc_stats { * @reasm_buf: head of partially reassembled inbound message fragments * @bc_rcvr: marks that this is a broadcast receiver link * @stats: collects statistics regarding link activity + * @session: session to be used by link + * @snd_nxt_state: next send seq number + * @rcv_nxt_state: next rcv seq number + * @in_session: have received ACTIVATE_MSG from peer + * @active: link is active + * @if_name: associated interface name + * @rst_cnt: link reset counter + * @drop_point: seq number for failover handling (FIXME) + * @failover_reasm_skb: saved failover msg ptr (FIXME) + * @failover_deferdq: deferred message queue for failover processing (FIXME) + * @transmq: the link's transmit queue + * @backlog: link's backlog by priority (importance) + * @snd_nxt: next sequence number to be used + * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @deferdq: deferred receive queue + * @window: sliding window size for congestion handling + * @min_win: minimal send window to be used by link + * @ssthresh: slow start threshold for congestion handling + * @max_win: maximal send window to be used by link + * @cong_acks: congestion acks for congestion avoidance (FIXME) + * @checkpoint: seq number for congestion window size handling + * @reasm_tnlmsg: fragmentation/reassembly area for tunnel protocol message + * @last_gap: last gap ack blocks for bcast (FIXME) + * @last_ga: ptr to gap ack blocks + * @bc_rcvlink: the peer specific link used for broadcast reception + * @bc_sndlink: the namespace global link used for broadcast sending + * @nack_state: bcast nack state + * @bc_peer_is_up: peer has acked the bcast init msg */ struct tipc_link { u32 addr; @@ -450,7 +478,6 @@ u32 tipc_link_state(struct tipc_link *l) * @min_win: minimal send window to be used by link * @max_win: maximal send window to be used by link * @session: session to be used by link - * @ownnode: identity of own node * @peer: node id of peer node * @peer_caps: bitmap describing peer node capabilities * @bc_sndlink: the namespace global link used for broadcast sending @@ -458,8 +485,10 @@ u32 tipc_link_state(struct tipc_link *l) * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link + * @self: local unicast link id + * @peer_id: 128-bit ID of peer * - * Returns true if link was created, otherwise false + * Return: true if link was created, otherwise false */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, @@ -532,8 +561,13 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link + * @ownnode: identity of own node + * @peer: node id of peer node + * @peer_id: 128-bit ID of peer + * @peer_caps: bitmap describing peer node capabilities + * @bc_sndlink: the namespace global link used for broadcast sending * - * Returns true if link was created, otherwise false + * Return: true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, @@ -788,7 +822,7 @@ static void link_profile_stats(struct tipc_link *l) * tipc_link_too_silent - check if link is "too silent" * @l: tipc link to be checked * - * Returns true if the link 'silent_intv_cnt' is about to reach the + * Return: true if the link 'silent_intv_cnt' is about to reach the * 'abort_limit' value, otherwise false */ bool tipc_link_too_silent(struct tipc_link *l) @@ -990,8 +1024,8 @@ void tipc_link_reset(struct tipc_link *l) * @xmitq: returned list of packets to be sent by caller * * Consumes the buffer chain. - * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted + * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) @@ -1260,7 +1294,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); return true; - }; + } } /* tipc_link_input - process packet that has passed link protocol check @@ -2376,7 +2410,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!msg_peer_node_is_up(hdr)) return rc; - /* Open when peer ackowledges our bcast init msg (pkt #1) */ + /* Open when peer acknowledges our bcast init msg (pkt #1) */ if (msg_ack(hdr)) l->bc_peer_is_up = true; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 2a78aa701572..2aca86021df5 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -58,11 +58,13 @@ static unsigned int align(unsigned int i) /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) + * @gfp: memory allocation flags * - * Returns a new buffer with data pointers set to the specified size. + * Return: a new buffer with data pointers set to the specified size. * - * NOTE: Headroom is reserved to allow prepending of a data link header. - * There may also be unrequested tailroom present at the buffer's end. + * NOTE: + * Headroom is reserved to allow prepending of a data link header. + * There may also be unrequested tailroom present at the buffer's end. */ struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { @@ -150,12 +152,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - if (skb_cloned(frag)) - frag = skb_copy(frag, GFP_ATOMIC); + *buf = NULL; + frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; - *buf = NULL; TIPC_SKB_CB(head)->tail = NULL; if (skb_is_nonlinear(head)) { skb_walk_frags(head, tail) { @@ -208,8 +209,9 @@ err: * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended - * @txq: queue to appand to - * Returns the number og 1k blocks appended or errno value + * @txq: queue to append to + * + * Return: the number of 1k blocks appended or errno value */ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) @@ -313,7 +315,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) * @pktmax: max size of a fragment incl. the header * @frags: returned fragment skb list * - * Returns 0 if the fragmentation is successful, otherwise: -EINVAL + * Return: 0 if the fragmentation is successful, otherwise: -EINVAL * or -ENOMEM */ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, @@ -368,6 +370,7 @@ error: * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message + * @offset: buffer offset for fragmented messages (FIXME) * @dsz: Total length of user data * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller @@ -375,7 +378,7 @@ error: * Note that the recursive call we are making here is safe, since it can * logically go only one further level down. * - * Returns message data size or errno: -ENOMEM, -EFAULT + * Return: message data size or errno: -ENOMEM, -EFAULT */ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int pktmax, struct sk_buff_head *list) @@ -486,7 +489,7 @@ error: * @msg: message to be appended * @max: max allowable size for the bundle buffer * - * Returns "true" if bundling has been performed, otherwise "false" + * Return: "true" if bundling has been performed, otherwise "false" */ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, u32 max) @@ -581,9 +584,9 @@ bundle: * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned * @pos: position in outer message of msg to be extracted. - * Returns position of next msg + * Returns position of next msg. * Consumes outer buffer when last packet extracted - * Returns true when there is an extracted buffer, otherwise false + * Return: true when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { @@ -627,7 +630,7 @@ none: * @skb: buffer containing message to be reversed; will be consumed * @err: error code to be set in message, if any * Replaces consumed buffer with new one when successful - * Returns true if success, otherwise false + * Return: true if success, otherwise false */ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { @@ -699,10 +702,11 @@ bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) /** * tipc_msg_lookup_dest(): try to find new destination for named message + * @net: pointer to associated network namespace * @skb: the buffer containing the message. * @err: error code to be used by caller if lookup fails * Does not consume buffer - * Returns true if a destination is found, false otherwise + * Return: true if a destination is found, false otherwise */ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index fe4edce459ad..6cf57c3bfa27 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -50,6 +50,8 @@ struct distr_queue_item { /** * publ_to_item - add publication info to a publication message + * @p: publication info + * @i: location of item in the message */ static void publ_to_item(struct distr_item *i, struct publication *p) { @@ -62,6 +64,10 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * @net: the associated network namespace + * @type: message type + * @size: payload size + * @dest: destination node * * The buffer returned is of size INT_H_SIZE + payload size */ @@ -83,6 +89,8 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, /** * tipc_named_publish - tell other nodes about a new publication by this node + * @net: the associated network namespace + * @publ: the new publication */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) { @@ -111,6 +119,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node + * @net: the associated network namespace + * @publ: the withdrawn publication */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { @@ -138,9 +148,11 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) /** * named_distribute - prepare name info for bulk distribution to another node + * @net: the associated network namespace * @list: list of messages (buffers) to be returned from this function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain + * @seqno: sequence number for this message */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls, u16 seqno) @@ -194,6 +206,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, /** * tipc_named_node_up - tell specified node about all publications by this node + * @net: the associated network namespace + * @dnode: destination node + * @capabilities: peer node's capabilities */ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { @@ -217,6 +232,9 @@ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) /** * tipc_publ_purge - remove publication associated with a failed node + * @net: the associated network namespace + * @publ: the publication to remove + * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. @@ -244,24 +262,6 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) kfree_rcu(p, rcu); } -/** - * tipc_dist_queue_purge - remove deferred updates from a node that went down - */ -static void tipc_dist_queue_purge(struct net *net, u32 addr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct distr_queue_item *e, *tmp; - - spin_lock_bh(&tn->nametbl_lock); - list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { - if (e->node != addr) - continue; - list_del(&e->next); - kfree(e); - } - spin_unlock_bh(&tn->nametbl_lock); -} - void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr, u16 capabilities) { @@ -272,7 +272,6 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); - tipc_dist_queue_purge(net, addr); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests--; @@ -282,9 +281,13 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers + * @net: the associated network namespace + * @i: location of item in the message + * @node: node address + * @dtype: name distributor message type * * tipc_nametbl_lock must be held. - * Returns the publication item if successful, otherwise NULL. + * Return: the publication item if successful, otherwise NULL. */ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) @@ -366,6 +369,10 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, /** * tipc_named_rcv - process name table update messages sent by another node + * @net: the associated network namespace + * @namedq: queue to receive from + * @rcv_nxt: store last received seqno here + * @open: last bulk msg was received (FIXME) */ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) @@ -393,6 +400,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, /** * tipc_named_reinit - re-initialize local publications + * @net: the associated network namespace * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 092323158f06..e231e6964d61 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -46,7 +46,7 @@ * @type: name sequence type * @lower: name sequence lower bound * @upper: name sequence upper bound - * @ref: publishing port reference + * @port: publishing port reference * @key: publication key * * ===> All fields are stored in network byte order. <=== diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 2ac33d32edc2..ee5ac40ea2b6 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -103,7 +104,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ @@ -117,7 +119,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. @@ -166,7 +169,8 @@ static struct service_range *service_range_match_first(struct rb_node *n, /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. @@ -218,6 +222,13 @@ static int hash(int x) /** * tipc_publ_create - create a publication structure + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @scope: publication scope + * @node: network address of publishing socket + * @port: publishing port + * @key: publication key */ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, u32 scope, u32 node, u32 port, @@ -245,6 +256,8 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, /** * tipc_service_create - create a service structure for the specified 'type' + * @type: service type + * @hd: name_table services list * * Allocates a single range structure and sets it to all 0's. */ @@ -361,6 +374,9 @@ err: /** * tipc_service_remove_publ - remove a publication from a service + * @sr: service_range to remove publication from + * @node: target node + * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *sr, u32 node, u32 key) @@ -377,7 +393,7 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, return NULL; } -/** +/* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) @@ -395,6 +411,8 @@ static int tipc_publ_sort(void *priv, struct list_head *a, * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range + * @service: the tipc_service to attach the @sub to + * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) @@ -403,12 +421,12 @@ static void tipc_service_subscribe(struct tipc_service *service, struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; - struct tipc_name_seq ns; + struct tipc_service_range r; u32 filter; - ns.type = tipc_sub_read(sb, seq.type); - ns.lower = tipc_sub_read(sb, seq.lower); - ns.upper = tipc_sub_read(sb, seq.upper); + r.type = tipc_sub_read(sb, seq.type); + r.lower = tipc_sub_read(sb, seq.lower); + r.upper = tipc_sub_read(sb, seq.upper); filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); @@ -418,7 +436,7 @@ static void tipc_service_subscribe(struct tipc_service *service, return; INIT_LIST_HEAD(&publ_list); - service_range_foreach_match(sr, service, ns.lower, ns.upper) { + service_range_foreach_match(sr, service, r.lower, r.upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) @@ -528,14 +546,16 @@ exit: /** * tipc_nametbl_translate - perform service instance to socket translation - * - * On entry, 'dnode' is the search domain used during translation. + * @net: network namespace + * @type: message type + * @instance: message instance + * @dnode: the search domain used during translation * * On exit: * - if translation is deferred to another node, leave 'dnode' unchanged and - * return 0 + * return 0 * - if translation is attempted and succeeds, set 'dnode' to the publishing - * node and return the published (non-zero) port number + * node and return the published (non-zero) port number * - if translation is attempted and fails, set 'dnode' to 0 and return 0 * * Note that for legacy users (node configured with Z.C.N address format) the @@ -756,6 +776,11 @@ exit: /** * tipc_nametbl_withdraw - withdraw a service binding + * @net: network namespace + * @type: service type + * @lower: service range lower bound + * @upper: service range upper bound + * @key: target publication key */ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper, u32 key) @@ -791,6 +816,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, /** * tipc_nametbl_subscribe - add a subscription object to the name table + * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { @@ -821,6 +847,7 @@ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) /** * tipc_nametbl_unsubscribe - remove a subscription object from name table + * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { @@ -870,7 +897,9 @@ int tipc_nametbl_init(struct net *net) } /** - * tipc_service_delete - purge all publications for a service and delete it + * tipc_service_delete - purge all publications for a service and delete it + * @net: the associated network namespace + * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 8064e1986e2c..5a82a01369d6 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -60,8 +60,8 @@ struct tipc_group; * @key: publication key, unique across the cluster * @id: publication id * @binding_node: all publications from the same node which bound this one - * - Remote publications: in node->publ_list - * Used by node/name distr to withdraw publications when node is lost + * - Remote publications: in node->publ_list; + * Used by node/name distr to withdraw publications when node is lost * - Local/node scope publications: in name_table->node_scope list * - Local/cluster scope publications: in name_table->cluster_scope list * @binding_sock: all publications from the same socket which bound this one @@ -92,13 +92,16 @@ struct publication { /** * struct name_table - table containing all existing port name publications - * @seq_hlist: name sequence hash lists + * @services: name sequence hash lists * @node_scope: all local publications with node scope * - used by name_distr during re-init of name table * @cluster_scope: all local publications with cluster scope * - used by name_distr to send bulk updates to new nodes * - used by name_distr during re-init of name table + * @cluster_scope_lock: lock for accessing @cluster_scope * @local_publ_count: number of publications issued by this node + * @rc_dests: destination node counter + * @snd_nxt: next sequence number to be used */ struct name_table { struct hlist_head services[TIPC_NAMETBL_SIZE]; diff --git a/net/tipc/net.c b/net/tipc/net.c index 0bb2323201da..a129f661bee3 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -132,7 +132,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + tipc_nametbl_publish(net, TIPC_NODE_STATE, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1c7aa51cc2a3..82f154989418 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -118,7 +118,8 @@ static void tipc_tlv_init(struct sk_buff *skb, u16 type) skb_put(skb, sizeof(struct tlv_desc)); } -static int tipc_tlv_sprintf(struct sk_buff *skb, const char *fmt, ...) +static __printf(2, 3) int tipc_tlv_sprintf(struct sk_buff *skb, + const char *fmt, ...) { int n; u16 len; @@ -588,7 +589,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return 0; tipc_tlv_sprintf(msg->rep, "\nLink <%s>\n", - nla_data(link[TIPC_NLA_LINK_NAME])); + (char *)nla_data(link[TIPC_NLA_LINK_NAME])); if (link[TIPC_NLA_LINK_BROADCAST]) { __fill_bc_link_stat(msg, prop, stats); @@ -695,7 +696,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], + nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, diff --git a/net/tipc/node.c b/net/tipc/node.c index d269ebe382e1..c4b87d2cc0e3 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -82,7 +82,7 @@ struct tipc_bclink_entry { /** * struct tipc_node - TIPC node structure * @addr: network address of node - * @ref: reference counter to node object + * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain @@ -90,9 +90,11 @@ struct tipc_bclink_entry { * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node + * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not + * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -100,9 +102,16 @@ struct tipc_bclink_entry { * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any + * @peer_id: 128-bit ID of peer + * @peer_id_string: ID string of peer * @publ_list: list of publications + * @conn_sks: list of connections (FIXME) + * @timer: node's keepalive timer + * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @peer_net: peer's net namespace + * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { @@ -267,6 +276,7 @@ char *tipc_node_get_id_str(struct tipc_node *node) #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) @@ -814,6 +824,9 @@ static void tipc_node_timeout(struct timer_list *t) /** * __tipc_node_link_up - handle addition of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ @@ -880,6 +893,9 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, /** * tipc_node_link_up - handle addition of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ @@ -900,10 +916,11 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. - * This can happen when e.g. + * This can happen when e.g.:: + * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network - * disturbance, wrong session, etc.) + * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> @@ -940,6 +957,10 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, /** * __tipc_node_link_down - handle loss of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on + * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, @@ -1525,11 +1546,13 @@ static void node_lost_contact(struct tipc_node *n, /** * tipc_node_get_linkname - get the name of a link * + * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer + * @len: size of @linkname output buffer * - * Returns 0 on success + * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) @@ -1638,7 +1661,7 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) return; default: return; - }; + } } /** @@ -1648,7 +1671,7 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. - * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF + * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1881,9 +1904,11 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id /** * tipc_node_check_state - check and if necessary update node state + * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet - * Returns true if state and msg are ok, otherwise false + * @xmitq: queue for messages to be xmited on + * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) @@ -2182,6 +2207,8 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); } + /* Update MTU for node link entry */ + e->mtu = tipc_link_mss(e->link); tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e795a8a2955b..cebcc104dc70 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,8 +1,9 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2017, Ericsson AB + * Copyright (c) 2001-2007, 2012-2019, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -79,19 +80,32 @@ struct sockaddr_pair { * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages - * #cong_links: list of congested links + * @cong_links: list of congested links * @publications: list of publications for port * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @conn_timeout: the time we can wait for an unresponded setup request + * @probe_unacked: probe has not received ack yet * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links * @snt_unacked: # messages sent by socket, and not yet acked by peer + * @snd_win: send window size + * @peer_caps: peer capabilities mask * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @rcv_win: receive window size * @peer: 'connected' peer for dgram/rdm * @node: hash table node * @mc_method: cookie for use between socket and broadcast layer * @rcu: rcu struct for tipc_sock + * @group: TIPC communications group + * @oneway: message count in one direction (FIXME) + * @nagle_start: current nagle value + * @snd_backlog: send backlog count + * @msg_acc: messages accepted; used in managing backlog and nagle + * @pkt_cnt: TIPC socket packet count + * @expect_ack: whether this TIPC socket is expecting an ack + * @nodelay: setsockopt() TIPC_NODELAY setting + * @group_is_open: TIPC socket group is fully open (FIXME) */ struct tipc_sock { struct sock sk; @@ -138,9 +152,9 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq); + struct tipc_service_range const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq); + struct tipc_service_range const *seq); static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); @@ -260,6 +274,7 @@ static void tsk_set_nagle(struct tipc_sock *tsk) /** * tsk_advance_rx_queue - discard first buffer in socket receive queue + * @sk: network socket * * Caller must hold socket lock */ @@ -288,6 +303,8 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) /** * tsk_rej_rx_queue - reject all buffers in socket receive queue + * @sk: network socket + * @error: response error code * * Caller must hold socket lock */ @@ -441,7 +458,7 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) * This routine creates additional data structures used by the TIPC socket, * initializes them, and links them together. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, int kern) @@ -606,7 +623,7 @@ static void __tipc_shutdown(struct socket *sock, int error) * are returned or discarded according to the "destination droppable" setting * specified for the message by the sender. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_release(struct socket *sock) { @@ -644,75 +661,77 @@ static int tipc_release(struct socket *sock) } /** - * tipc_bind - associate or disassocate TIPC name(s) with a socket + * __tipc_bind - associate or disassocate TIPC name(s) with a socket * @sock: socket structure - * @uaddr: socket address describing name(s) and desired operation - * @uaddr_len: size of socket address data structure + * @skaddr: socket address describing name(s) and desired operation + * @alen: size of socket address data structure * * Name and name sequence binding is indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it doesn't * access any non-constant socket information. */ -static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, - int uaddr_len) +static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) { - struct sock *sk = sock->sk; - struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; - struct tipc_sock *tsk = tipc_sk(sk); - int res = -EINVAL; + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + struct tipc_sock *tsk = tipc_sk(sock->sk); - lock_sock(sk); - if (unlikely(!uaddr_len)) { - res = tipc_sk_withdraw(tsk, 0, NULL); - goto exit; - } - if (tsk->group) { - res = -EACCES; - goto exit; - } - if (uaddr_len < sizeof(struct sockaddr_tipc)) { - res = -EINVAL; - goto exit; - } - if (addr->family != AF_TIPC) { - res = -EAFNOSUPPORT; - goto exit; - } + if (unlikely(!alen)) + return tipc_sk_withdraw(tsk, 0, NULL); - if (addr->addrtype == TIPC_ADDR_NAME) + if (addr->addrtype == TIPC_SERVICE_ADDR) addr->addr.nameseq.upper = addr->addr.nameseq.lower; - else if (addr->addrtype != TIPC_ADDR_NAMESEQ) { - res = -EAFNOSUPPORT; - goto exit; - } - if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) && - (addr->addr.nameseq.type != TIPC_TOP_SRV) && - (addr->addr.nameseq.type != TIPC_CFG_SRV)) { - res = -EACCES; - goto exit; - } + if (tsk->group) + return -EACCES; - res = (addr->scope >= 0) ? - tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : - tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); -exit: - release_sock(sk); + if (addr->scope >= 0) + return tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq); + else + return tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); +} + +int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +{ + int res; + + lock_sock(sock->sk); + res = __tipc_bind(sock, skaddr, alen); + release_sock(sock->sk); return res; } +static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + + if (alen) { + if (alen < sizeof(struct sockaddr_tipc)) + return -EINVAL; + if (addr->family != AF_TIPC) + return -EAFNOSUPPORT; + if (addr->addrtype > TIPC_SERVICE_ADDR) + return -EAFNOSUPPORT; + if (addr->addr.nameseq.type < TIPC_RESERVED_TYPES) { + pr_warn_once("Can't bind to reserved service type %u\n", + addr->addr.nameseq.type); + return -EACCES; + } + } + return tipc_sk_bind(sock, skaddr, alen); +} + /** * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it only * accesses socket information that is unchanging (or which changes in @@ -737,7 +756,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, addr->addr.id.node = tipc_own_addr(sock_net(sk)); } - addr->addrtype = TIPC_ADDR_ID; + addr->addrtype = TIPC_SOCKET_ADDR; addr->family = AF_TIPC; addr->scope = 0; addr->addr.name.domain = 0; @@ -751,7 +770,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * @sock: socket for which to calculate the poll bits * @wait: ??? * - * Returns pollmask value + * Return: pollmask value * * COMMENTARY: * It appears that the usual socket locking mechanisms are not useful here @@ -813,9 +832,9 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ -static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, +static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; @@ -873,6 +892,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, /** * tipc_send_group_msg - send a message to a member in the group * @net: network namespace + * @tsk: tipc socket * @m: message to send * @mb: group member * @dnode: destination node @@ -928,7 +948,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -972,7 +992,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1057,7 +1077,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1131,7 +1151,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1168,6 +1188,7 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets + * @net: the associated network namespace * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup * @@ -1307,6 +1328,8 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. + * @inputq: buffer list containing the buffers + * @xmitq: output message area */ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff_head *inputq, @@ -1374,7 +1397,7 @@ exit: * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections. * (Note: 'SYN+' is prohibited on SOCK_STREAM.) * - * Returns the number of bytes sent on success, or errno otherwise + * Return: the number of bytes sent on success, or errno otherwise */ static int tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) @@ -1400,7 +1423,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) bool syn = !tipc_sk_type_connectionless(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_name_seq *seq; + struct tipc_service_range *seq; struct sk_buff_head pkts; u32 dport = 0, dnode = 0; u32 type = 0, inst = 0; @@ -1419,9 +1442,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (grp) { if (!dest) return tipc_send_group_bcast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_NAME) + if (dest->addrtype == TIPC_SERVICE_ADDR) return tipc_send_group_anycast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_ID) + if (dest->addrtype == TIPC_SOCKET_ADDR) return tipc_send_group_unicast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_send_group_mcast(sock, m, dlen, timeout); @@ -1441,7 +1464,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } @@ -1452,14 +1475,14 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_sendmcast(sock, seq, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; dport = tipc_nametbl_translate(net, type, inst, &dnode); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { + } else if (dest->addrtype == TIPC_SOCKET_ADDR) { dnode = dest->addr.id.node; } else { return -EINVAL; @@ -1471,7 +1494,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc)) return rc; - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); @@ -1479,7 +1502,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); - } else { /* TIPC_ADDR_ID */ + } else { /* TIPC_SOCKET_ADDR */ msg_set_type(hdr, TIPC_DIRECT_MSG); msg_set_lookup_scope(hdr, 0); msg_set_destnode(hdr, dnode); @@ -1519,7 +1542,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) * * Used for SOCK_STREAM data. * - * Returns the number of bytes sent on success (or partial success), + * Return: the number of bytes sent on success (or partial success), * or errno if no data sent */ static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz) @@ -1627,7 +1650,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) * * Used for SOCK_SEQPACKET messages. * - * Returns the number of bytes sent on success, or errno otherwise + * Return: the number of bytes sent on success, or errno otherwise */ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) { @@ -1684,7 +1707,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) return; srcaddr->sock.family = AF_TIPC; - srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addrtype = TIPC_SOCKET_ADDR; srcaddr->sock.scope = 0; srcaddr->sock.addr.id.ref = msg_origport(hdr); srcaddr->sock.addr.id.node = msg_orignode(hdr); @@ -1696,7 +1719,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) /* Group message users may also want to know sending member's id */ srcaddr->member.family = AF_TIPC; - srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addrtype = TIPC_SERVICE_ADDR; srcaddr->member.scope = 0; srcaddr->member.addr.name.name.type = msg_nametype(hdr); srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; @@ -1712,7 +1735,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) * * Note: Ancillary data is not captured if not requested by receiver. * - * Returns 0 if successful, otherwise errno + * Return: 0 if successful, otherwise errno */ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) @@ -1862,6 +1885,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) /** * tipc_recvmsg - receive packet-oriented message + * @sock: network socket * @m: descriptor for message info * @buflen: length of user buffer area * @flags: receive flags @@ -1869,7 +1893,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. * If the complete message doesn't fit in user area, truncate it. * - * Returns size of returned message data, errno otherwise + * Return: size of returned message data, errno otherwise */ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) @@ -1970,6 +1994,7 @@ exit: /** * tipc_recvstream - receive stream-oriented data + * @sock: network socket * @m: descriptor for message info * @buflen: total size of user buffer area * @flags: receive flags @@ -1977,7 +2002,7 @@ exit: * Used for SOCK_STREAM messages only. If not enough data is available * will optionally wait for more; never truncates data. * - * Returns size of returned message data, errno otherwise + * Return: size of returned message data, errno otherwise */ static int tipc_recvstream(struct socket *sock, struct msghdr *m, size_t buflen, int flags) @@ -2155,7 +2180,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, * @tsk: TIPC socket * @skb: pointer to message buffer. * @xmitq: for Nagle ACK if any - * Returns true if message should be added to receive queue, false otherwise + * Return: true if message should be added to receive queue, false otherwise */ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff_head *xmitq) @@ -2269,7 +2294,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, * TIPC_HIGH_IMPORTANCE (8 MB) * TIPC_CRITICAL_IMPORTANCE (16 MB) * - * Returns overload limit according to corresponding message importance + * Return: overload limit according to corresponding message importance */ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { @@ -2292,12 +2317,12 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. + * @xmitq: output message area (FIXME) * * Enqueues message on receive queue if acceptable; optionally handles * disconnect indication for a connected socket. * * Called with socket lock already taken - * */ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *xmitq) @@ -2387,6 +2412,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) * @inputq: list of incoming buffers with potentially different destinations * @sk: socket where the buffers should be enqueued * @dport: port number for the socket + * @xmitq: output queue * * Caller must hold socket lock */ @@ -2439,6 +2465,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /** * tipc_sk_rcv - handle a chain of incoming buffers + * @net: the associated network namespace * @inputq: buffer list containing the buffers * Consumes all buffers in list until inputq is empty * Note: may be called in multiple threads referring to the same queue @@ -2531,7 +2558,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) * @destlen: size of socket address data structure * @flags: file-related flags associated with socket * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int destlen, int flags) @@ -2624,7 +2651,7 @@ exit: * @sock: socket structure * @len: (unused) * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_listen(struct socket *sock, int len) { @@ -2676,8 +2703,9 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * @sock: listening socket * @new_sock: new socket that is to be connected * @flags: file-related flags associated with socket + * @kern: caused by kernel or by userspace? * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) @@ -2756,7 +2784,7 @@ exit: * * Terminates connection (if necessary), then purges socket's receive queue. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_shutdown(struct socket *sock, int how) { @@ -2864,7 +2892,7 @@ static void tipc_sk_timeout(struct timer_list *t) } static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq) + struct tipc_service_range const *seq) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -2892,7 +2920,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, } static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq) + struct tipc_service_range const *seq) { struct net *net = sock_net(&tsk->sk); struct publication *publ; @@ -3039,7 +3067,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_name_seq seq; + struct tipc_service_range seq; int rc; if (mreq->type < TIPC_RESERVED_TYPES) @@ -3076,7 +3104,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; - struct tipc_name_seq seq; + struct tipc_service_range seq; int scope; if (!grp) @@ -3099,7 +3127,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) * For stream sockets only, accepts and ignores all IPPROTO_TCP options * (to ease compatibility). * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) @@ -3193,14 +3221,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, * For stream sockets only, returns 0 length result for all IPPROTO_TCP options * (to ease compatibility). * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, int __user *ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_name_seq seq; + struct tipc_service_range seq; int len, scope; u32 value; int res; @@ -3301,12 +3329,12 @@ static int tipc_socketpair(struct socket *sock1, struct socket *sock2) u32 onode = tipc_own_addr(sock_net(sock1->sk)); tsk1->peer.family = AF_TIPC; - tsk1->peer.addrtype = TIPC_ADDR_ID; + tsk1->peer.addrtype = TIPC_SOCKET_ADDR; tsk1->peer.scope = TIPC_NODE_SCOPE; tsk1->peer.addr.id.ref = tsk2->portid; tsk1->peer.addr.id.node = onode; tsk2->peer.family = AF_TIPC; - tsk2->peer.addrtype = TIPC_ADDR_ID; + tsk2->peer.addrtype = TIPC_SOCKET_ADDR; tsk2->peer.scope = TIPC_NODE_SCOPE; tsk2->peer.addr.id.ref = tsk1->portid; tsk2->peer.addr.id.node = onode; @@ -3397,7 +3425,7 @@ static struct proto tipc_proto = { /** * tipc_socket_init - initialize TIPC socket interface * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ int tipc_socket_init(void) { @@ -3796,10 +3824,11 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) /** * tipc_sk_filtering - check if a socket should be traced * @sk: the socket to be examined - * @sysctl_tipc_sk_filter[]: the socket tuple for filtering, - * (portid, sock type, name type, name lower, name upper) * - * Returns true if the socket meets the socket tuple data + * @sysctl_tipc_sk_filter is used as the socket tuple for filtering: + * (portid, sock type, name type, name lower, name upper) + * + * Return: true if the socket meets the socket tuple data * (value 0 = 'any') or when there is no tuple set (all = 0), * otherwise false */ @@ -3864,7 +3893,7 @@ u32 tipc_sock_get_portid(struct sock *sk) * @sk: tipc sk to be checked * @skb: tipc msg to be checked * - * Returns true if the socket rx queue allocation is > 90%, otherwise false + * Return: true if the socket rx queue allocation is > 90%, otherwise false */ bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb) @@ -3882,7 +3911,7 @@ bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb) * @sk: tipc sk to be checked * @skb: tipc msg to be checked * - * Returns true if the socket rx queue allocation is > 90%, otherwise false + * Return: true if the socket rx queue allocation is > 90%, otherwise false */ bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index b11575afc66f..02cdf166807d 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -74,7 +74,7 @@ int tipc_dump_done(struct netlink_callback *cb); u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); - +int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen); int tsk_set_importance(struct sock *sk, int imp); #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f340e53da625..f6ad0005218c 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -55,12 +56,14 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, } /** - * tipc_sub_check_overlap - test for subscription overlap with the - * given values + * tipc_sub_check_overlap - test for subscription overlap with the given values + * @seq: tipc_name_seq to check + * @found_lower: lower value to test + * @found_upper: upper value to test * - * Returns 1 if there is overlap, otherwise 0. + * Return: 1 if there is overlap, otherwise 0. */ -int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, +int tipc_sub_check_overlap(struct tipc_service_range *seq, u32 found_lower, u32 found_upper) { if (found_lower < seq->lower) @@ -79,7 +82,7 @@ void tipc_sub_report_overlap(struct tipc_subscription *sub, { struct tipc_subscr *s = &sub->evt.s; u32 filter = tipc_sub_read(s, filter); - struct tipc_name_seq seq; + struct tipc_service_range seq; seq.type = tipc_sub_read(s, seq.type); seq.lower = tipc_sub_read(s, seq.lower); diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 6ebbec1bedd1..3ded27391d54 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2017, Ericsson AB * Copyright (c) 2005-2007, 2012-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,12 +48,15 @@ struct tipc_conn; /** * struct tipc_subscription - TIPC network topology subscription object - * @subscriber: pointer to its subscriber - * @seq: name sequence associated with subscription + * @kref: reference count for this subscription + * @net: network namespace associated with subscription * @timer: timer governing subscription duration (optional) - * @nameseq_list: adjacent subscriptions in name sequence's subscription list + * @service_list: adjacent subscriptions in name sequence's subscription list * @sub_list: adjacent subscriptions in subscriber's subscription list * @evt: template for events generated by subscription + * @conid: connection identifier of topology server + * @inactive: true if this subscription is inactive + * @lock: serialize up/down and timer events */ struct tipc_subscription { struct kref kref; @@ -63,7 +67,7 @@ struct tipc_subscription { struct tipc_event evt; int conid; bool inactive; - spinlock_t lock; /* serialize up/down and timer events */ + spinlock_t lock; }; struct tipc_subscription *tipc_sub_subscribe(struct net *net, @@ -71,8 +75,8 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net, int conid); void tipc_sub_unsubscribe(struct tipc_subscription *sub); -int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper); +int tipc_sub_check_overlap(struct tipc_service_range *seq, + u32 found_lower, u32 found_upper); void tipc_sub_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port, u32 node, diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5f6f86051c83..5522865deae9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -519,13 +519,13 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) goto err; saddr.family = AF_TIPC; - saddr.addrtype = TIPC_ADDR_NAMESEQ; - saddr.addr.nameseq.type = TIPC_TOP_SRV; + saddr.addrtype = TIPC_SERVICE_RANGE; + saddr.addr.nameseq.type = TIPC_TOP_SRV; saddr.addr.nameseq.lower = TIPC_TOP_SRV; saddr.addr.nameseq.upper = TIPC_TOP_SRV; saddr.scope = TIPC_NODE_SCOPE; - rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); + rc = tipc_sk_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); if (rc < 0) goto err; rc = kernel_listen(lsock, 0); @@ -664,12 +664,18 @@ static int tipc_topsrv_start(struct net *net) ret = tipc_topsrv_work_start(srv); if (ret < 0) - return ret; + goto err_start; ret = tipc_topsrv_create_listener(srv); if (ret < 0) - tipc_topsrv_work_stop(srv); + goto err_create; + return 0; + +err_create: + tipc_topsrv_work_stop(srv); +err_start: + kfree(srv); return ret; } diff --git a/net/tipc/trace.c b/net/tipc/trace.c index 265f6a26aa3d..7d2931521e0e 100644 --- a/net/tipc/trace.c +++ b/net/tipc/trace.c @@ -36,7 +36,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" -/** +/* * socket tuples for filtering in socket traces: * (portid, sock type, name type, name lower, name upper) */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 1d17f4470ee2..21e75e28e86a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -64,6 +64,11 @@ * * This is the bearer level originating address used in neighbor discovery * messages, and all fields should be in network byte order + * + * @proto: Ethernet protocol in use + * @port: port being used + * @ipv4: IPv4 address of neighbor + * @ipv6: IPv6 address of neighbor */ struct udp_media_addr { __be16 proto; @@ -88,6 +93,7 @@ struct udp_replicast { * @ubsock: bearer associated socket * @ifindex: local address scope * @work: used to schedule deferred work on a bearer + * @rcast: associated udp_replicast container */ struct udp_bearer { struct tipc_bearer __rcu *bearer; @@ -772,7 +778,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto free; - /** + /* * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cec86229a6a0..f7fb7d2c1de1 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -327,7 +327,7 @@ static int tls_device_record_close(struct sock *sk, /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, - record_type, prot->version); + record_type); return ret; } @@ -694,36 +694,51 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, - s64 resync_req, u32 *seq) + s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); + u16 i; + + *rcd_delta = 0; if (is_async) { + /* shouldn't get to wraparound: + * too long in async stage, something bad happened + */ + if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + return false; + /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ - if (between(*seq, req_seq, req_end) && + if (before(*seq, req_seq)) + return false; + if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; + resync_async->rcd_delta++; + return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ - while (resync_async->loglen) { - if (req_seq == resync_async->log[resync_async->loglen - 1] && - atomic64_try_cmpxchg(&resync_async->req, - &resync_req, 0)) { - resync_async->loglen = 0; + for (i = 0; i < resync_async->loglen; i++) + if (req_seq == resync_async->log[i] && + atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { + *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; return true; } - resync_async->loglen--; - } + + resync_async->loglen = 0; + resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, @@ -741,6 +756,7 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -786,8 +802,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, - resync_req, &seq)) + resync_req, &seq, &rcd_delta)) return; + tls_bigint_subtract(rcd_sn, rcd_delta); break; } @@ -981,7 +998,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { - u16 nonce_size, tag_size, iv_size, rec_seq_size; + u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_record_info *start_marker_record; @@ -1022,6 +1039,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; + salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; rec_seq = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; break; @@ -1042,6 +1060,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) prot->tag_size = tag_size; prot->overhead_size = prot->prepend_size + prot->tag_size; prot->iv_size = iv_size; + prot->salt_size = salt_size; ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->tx.iv) { @@ -1245,6 +1264,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); tls_ctx->netdev = NULL; + } else { + set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); @@ -1274,7 +1295,8 @@ static int tls_device_down(struct net_device *netdev) if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); - if (ctx->rx_conf == TLS_HW) + if (ctx->rx_conf == TLS_HW && + !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); WRITE_ONCE(ctx->netdev, NULL); diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 28895333701e..d946817ed065 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -49,7 +49,8 @@ static int tls_enc_record(struct aead_request *aead_req, struct crypto_aead *aead, char *aad, char *iv, __be64 rcd_sn, struct scatter_walk *in, - struct scatter_walk *out, int *in_len) + struct scatter_walk *out, int *in_len, + struct tls_prot_info *prot) { unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE]; struct scatterlist sg_in[3]; @@ -73,8 +74,7 @@ static int tls_enc_record(struct aead_request *aead_req, len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, - (char *)&rcd_sn, sizeof(rcd_sn), buf[0], - TLS_1_2_VERSION); + (char *)&rcd_sn, buf[0], prot); memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); @@ -140,7 +140,7 @@ static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead, static int tls_enc_records(struct aead_request *aead_req, struct crypto_aead *aead, struct scatterlist *sg_in, struct scatterlist *sg_out, char *aad, char *iv, - u64 rcd_sn, int len) + u64 rcd_sn, int len, struct tls_prot_info *prot) { struct scatter_walk out, in; int rc; @@ -150,7 +150,7 @@ static int tls_enc_records(struct aead_request *aead_req, do { rc = tls_enc_record(aead_req, aead, aad, iv, - cpu_to_be64(rcd_sn), &in, &out, &len); + cpu_to_be64(rcd_sn), &in, &out, &len, prot); rcd_sn++; } while (rc == 0 && len); @@ -348,7 +348,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, payload_len, sync_size, dummy_buf); if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv, - rcd_sn, sync_size + payload_len) < 0) + rcd_sn, sync_size + payload_len, + &tls_ctx->prot_info) < 0) goto free_nskb; complete_skb(nskb, skb, tcp_payload_offset); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 8d93cea99f2c..47b7c5334c34 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -521,6 +521,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, case TLS_CIPHER_AES_CCM_128: optsize = sizeof(struct tls12_crypto_info_aes_ccm_128); break; + case TLS_CIPHER_CHACHA20_POLY1305: + optsize = sizeof(struct tls12_crypto_info_chacha20_poly1305); + break; default: rc = -EINVAL; goto err_crypto_info; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 95ab5545a931..01d933ae5f16 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -505,7 +505,7 @@ static int tls_do_encryption(struct sock *sk, memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, prot->iv_size + prot->salt_size); - xor_iv_with_seq(prot->version, rec->iv_data, tls_ctx->tx.rec_seq); + xor_iv_with_seq(prot, rec->iv_data, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -748,14 +748,13 @@ static int tls_push_record(struct sock *sk, int flags, sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size, - tls_ctx->tx.rec_seq, prot->rec_seq_size, - record_type, prot->version); + tls_ctx->tx.rec_seq, record_type, prot); tls_fill_prepend(tls_ctx, page_address(sg_page(&msg_en->sg.data[i])) + msg_en->sg.data[i].offset, msg_pl->sg.size + prot->tail_size, - record_type, prot->version); + record_type); tls_ctx->pending_open_record_frags = false; @@ -1295,6 +1294,12 @@ static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, return NULL; } + if (!skb_queue_empty(&sk->sk_receive_queue)) { + __strp_unpause(&ctx->strp); + if (ctx->recv_pkt) + return ctx->recv_pkt; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) return NULL; @@ -1465,19 +1470,19 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, kfree(mem); return err; } - if (prot->version == TLS_1_3_VERSION) + if (prot->version == TLS_1_3_VERSION || + prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) memcpy(iv + iv_offset, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv)); else memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); - xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq); + xor_iv_with_seq(prot, iv, tls_ctx->rx.rec_seq); /* Prepare AAD */ tls_make_aad(aad, rxm->full_len - prot->overhead_size + prot->tail_size, - tls_ctx->rx.rec_seq, prot->rec_seq_size, - ctx->control, prot->version); + tls_ctx->rx.rec_seq, ctx->control, prot); /* Prepare sgin */ sg_init_table(sgin, n_sgin); @@ -1913,7 +1918,7 @@ pick_next_record: * another message type */ msg->msg_flags |= MSG_EOR; - if (ctx->control != TLS_RECORD_TYPE_DATA) + if (control != TLS_RECORD_TYPE_DATA) goto recv_end; } else { break; @@ -2070,7 +2075,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) data_len = ((header[4] & 0xFF) | (header[3] << 8)); cipher_overhead = prot->tag_size; - if (prot->version != TLS_1_3_VERSION) + if (prot->version != TLS_1_3_VERSION && + prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) cipher_overhead += prot->iv_size; if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead + @@ -2290,6 +2296,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; + struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; @@ -2402,6 +2409,21 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_name = "ccm(aes)"; break; } + case TLS_CIPHER_CHACHA20_POLY1305: { + chacha20_poly1305_info = (void *)crypto_info; + nonce_size = 0; + tag_size = TLS_CIPHER_CHACHA20_POLY1305_TAG_SIZE; + iv_size = TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE; + iv = chacha20_poly1305_info->iv; + rec_seq_size = TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE; + rec_seq = chacha20_poly1305_info->rec_seq; + keysize = TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE; + key = chacha20_poly1305_info->key; + salt = chacha20_poly1305_info->salt; + salt_size = TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE; + cipher_name = "rfc7539(chacha20,poly1305)"; + break; + } default: rc = -EINVAL; goto free_priv; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9e93bc201cc0..d2834047c6db 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -438,7 +438,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) case SOCK_STREAM: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; - else if (remote_cid <= VMADDR_CID_HOST) + else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g) new_transport = transport_g2h; else new_transport = transport_h2g; @@ -739,7 +739,7 @@ static struct sock *__vsock_create(struct net *net, vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; } else { - vsk->trusted = capable(CAP_NET_ADMIN); + vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; @@ -2072,8 +2072,7 @@ static long vsock_dev_do_ioctl(struct file *filp, break; default: - pr_err("Unknown ioctl %d\n", cmd); - retval = -EINVAL; + retval = -ENOIOCTLCMD; } return retval; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 0edda1edf988..5956939eebb7 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -841,8 +841,10 @@ void virtio_transport_release(struct vsock_sock *vsk) virtio_transport_free_pkt(pkt); } - if (remove_sock) + if (remove_sock) { + sock_set_flag(sk, SOCK_DONE); vsock_remove_sock(vsk); + } } EXPORT_SYMBOL_GPL(virtio_transport_release); @@ -1132,8 +1134,8 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, lock_sock(sk); - /* Check if sk has been released before lock_sock */ - if (sk->sk_shutdown == SHUTDOWN_MASK) { + /* Check if sk has been closed before lock_sock */ + if (sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset_no_sock(t, pkt); release_sock(sk); sock_put(sk); diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig deleted file mode 100644 index d13762bc4abc..000000000000 --- a/net/wimax/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# WiMAX LAN device configuration -# - -menuconfig WIMAX - tristate "WiMAX Wireless Broadband support" - depends on RFKILL || !RFKILL - help - - Select to configure support for devices that provide - wireless broadband connectivity using the WiMAX protocol - (IEEE 802.16). - - Please note that most of these devices require signing up - for a service plan with a provider. - - The different WiMAX drivers can be enabled in the menu entry - - Device Drivers > Network device support > WiMAX Wireless - Broadband devices - - If unsure, it is safe to select M (module). - -config WIMAX_DEBUG_LEVEL - int "WiMAX debug level" - depends on WIMAX - default 8 - help - - Select the maximum debug verbosity level to be compiled into - the WiMAX stack code. - - By default, debug messages are disabled at runtime and can - be selectively enabled for different parts of the code using - the sysfs debug-levels file. - - If set at zero, this will compile out all the debug code. - - It is recommended that it is left at 8. diff --git a/net/wimax/Makefile b/net/wimax/Makefile deleted file mode 100644 index c2a71ae487ac..000000000000 --- a/net/wimax/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_WIMAX) += wimax.o - -wimax-y := \ - id-table.o \ - op-msg.o \ - op-reset.o \ - op-rfkill.o \ - op-state-get.o \ - stack.o - -wimax-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h deleted file mode 100644 index ebc287cde336..000000000000 --- a/net/wimax/debug-levels.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX Stack - * Debug levels control file for the wimax module - * - * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ -#ifndef __debug_levels__h__ -#define __debug_levels__h__ - -/* Maximum compile and run time debug level for all submodules */ -#define D_MODULENAME wimax -#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL - -#include <linux/wimax/debug.h> - -/* List of all the enabled modules */ -enum d_module { - D_SUBMODULE_DECLARE(debugfs), - D_SUBMODULE_DECLARE(id_table), - D_SUBMODULE_DECLARE(op_msg), - D_SUBMODULE_DECLARE(op_reset), - D_SUBMODULE_DECLARE(op_rfkill), - D_SUBMODULE_DECLARE(op_state_get), - D_SUBMODULE_DECLARE(stack), -}; - -#endif /* #ifndef __debug_levels__h__ */ diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c deleted file mode 100644 index 3c54bb6b925a..000000000000 --- a/net/wimax/debugfs.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Debugfs support - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ -#include <linux/debugfs.h> -#include <linux/wimax.h> -#include "wimax-internal.h" - -#define D_SUBMODULE debugfs -#include "debug-levels.h" - -void wimax_debugfs_add(struct wimax_dev *wimax_dev) -{ - struct net_device *net_dev = wimax_dev->net_dev; - struct dentry *dentry; - char buf[128]; - - snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name); - dentry = debugfs_create_dir(buf, NULL); - wimax_dev->debugfs_dentry = dentry; - - d_level_register_debugfs("wimax_dl_", debugfs, dentry); - d_level_register_debugfs("wimax_dl_", id_table, dentry); - d_level_register_debugfs("wimax_dl_", op_msg, dentry); - d_level_register_debugfs("wimax_dl_", op_reset, dentry); - d_level_register_debugfs("wimax_dl_", op_rfkill, dentry); - d_level_register_debugfs("wimax_dl_", op_state_get, dentry); - d_level_register_debugfs("wimax_dl_", stack, dentry); -} - -void wimax_debugfs_rm(struct wimax_dev *wimax_dev) -{ - debugfs_remove_recursive(wimax_dev->debugfs_dentry); -} diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c deleted file mode 100644 index 02eee37b7e31..000000000000 --- a/net/wimax/id-table.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Mappping of generic netlink family IDs to net devices - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * We assign a single generic netlink family ID to each device (to - * simplify lookup). - * - * We need a way to map family ID to a wimax_dev pointer. - * - * The idea is to use a very simple lookup. Using a netlink attribute - * with (for example) the interface name implies a heavier search over - * all the network devices; seemed kind of a waste given that we know - * we are looking for a WiMAX device and that most systems will have - * just a single WiMAX adapter. - * - * We put all the WiMAX devices in the system in a linked list and - * match the generic link family ID against the list. - * - * By using a linked list, the case of a single adapter in the system - * becomes (almost) no overhead, while still working for many more. If - * it ever goes beyond two, I'll be surprised. - */ -#include <linux/device.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/list.h> -#include <linux/wimax.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE id_table -#include "debug-levels.h" - - -static DEFINE_SPINLOCK(wimax_id_table_lock); -static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table); - - -/* - * wimax_id_table_add - add a gennetlink familiy ID / wimax_dev mapping - * - * @wimax_dev: WiMAX device descriptor to associate to the Generic - * Netlink family ID. - * - * Look for an empty spot in the ID table; if none found, double the - * table's size and get the first spot. - */ -void wimax_id_table_add(struct wimax_dev *wimax_dev) -{ - d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); - spin_lock(&wimax_id_table_lock); - list_add(&wimax_dev->id_table_node, &wimax_id_table); - spin_unlock(&wimax_id_table_lock); - d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev); -} - - -/* - * wimax_get_netdev_by_info - lookup a wimax_dev from the gennetlink info - * - * The generic netlink family ID has been filled out in the - * nlmsghdr->nlmsg_type field, so we pull it from there, look it up in - * the mapping table and reference the wimax_dev. - * - * When done, the reference should be dropped with - * 'dev_put(wimax_dev->net_dev)'. - */ -struct wimax_dev *wimax_dev_get_by_genl_info( - struct genl_info *info, int ifindex) -{ - struct wimax_dev *wimax_dev = NULL; - - d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex); - spin_lock(&wimax_id_table_lock); - list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - if (wimax_dev->net_dev->ifindex == ifindex) { - dev_hold(wimax_dev->net_dev); - goto found; - } - } - wimax_dev = NULL; - d_printf(1, NULL, "wimax: no devices found with ifindex %d\n", - ifindex); -found: - spin_unlock(&wimax_id_table_lock); - d_fnend(3, NULL, "(info %p ifindex %d) = %p\n", - info, ifindex, wimax_dev); - return wimax_dev; -} - - -/* - * wimax_id_table_rm - Remove a gennetlink familiy ID / wimax_dev mapping - * - * @id: family ID to remove from the table - */ -void wimax_id_table_rm(struct wimax_dev *wimax_dev) -{ - spin_lock(&wimax_id_table_lock); - list_del_init(&wimax_dev->id_table_node); - spin_unlock(&wimax_id_table_lock); -} - - -/* - * Release the gennetlink family id / mapping table - * - * On debug, verify that the table is empty upon removal. We want the - * code always compiled, to ensure it doesn't bit rot. It will be - * compiled out if CONFIG_BUG is disabled. - */ -void wimax_id_table_release(void) -{ - struct wimax_dev *wimax_dev; - -#ifndef CONFIG_BUG - return; -#endif - spin_lock(&wimax_id_table_lock); - list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n", - __func__, wimax_dev, wimax_dev->net_dev->ifindex); - WARN_ON(1); - } - spin_unlock(&wimax_id_table_lock); -} diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c deleted file mode 100644 index 6460b5785758..000000000000 --- a/net/wimax/op-msg.c +++ /dev/null @@ -1,391 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Generic messaging interface between userspace and driver/device - * - * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements a direct communication channel between user space and - * the driver/device, by which free form messages can be sent back and - * forth. - * - * This is intended for device-specific features, vendor quirks, etc. - * - * See include/net/wimax.h - * - * GENERIC NETLINK ENCODING AND CAPACITY - * - * A destination "pipe name" is added to each message; it is up to the - * drivers to assign or use those names (if using them at all). - * - * Messages are encoded as a binary netlink attribute using nla_put() - * using type NLA_UNSPEC (as some versions of libnl still in - * deployment don't yet understand NLA_BINARY). - * - * The maximum capacity of this transport is PAGESIZE per message (so - * the actual payload will be bit smaller depending on the - * netlink/generic netlink attributes and headers). - * - * RECEPTION OF MESSAGES - * - * When a message is received from user space, it is passed verbatim - * to the driver calling wimax_dev->op_msg_from_user(). The return - * value from this function is passed back to user space as an ack - * over the generic netlink protocol. - * - * The stack doesn't do any processing or interpretation of these - * messages. - * - * SENDING MESSAGES - * - * Messages can be sent with wimax_msg(). - * - * If the message delivery needs to happen on a different context to - * that of its creation, wimax_msg_alloc() can be used to get a - * pointer to the message that can be delivered later on with - * wimax_msg_send(). - * - * ROADMAP - * - * wimax_gnl_doit_msg_from_user() Process a message from user space - * wimax_dev_get_by_genl_info() - * wimax_dev->op_msg_from_user() Delivery of message to the driver - * - * wimax_msg() Send a message to user space - * wimax_msg_alloc() - * wimax_msg_send() - */ -#include <linux/device.h> -#include <linux/slab.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/export.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE op_msg -#include "debug-levels.h" - - -/** - * wimax_msg_alloc - Create a new skb for sending a message to userspace - * - * @wimax_dev: WiMAX device descriptor - * @pipe_name: "named pipe" the message will be sent to - * @msg: pointer to the message data to send - * @size: size of the message to send (in bytes), including the header. - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error - * - * Description: - * - * Allocates an skb that will contain the message to send to user - * space over the messaging pipe and initializes it, copying the - * payload. - * - * Once this call is done, you can deliver it with - * wimax_msg_send(). - * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. - * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev, - const char *pipe_name, - const void *msg, size_t size, - gfp_t gfp_flags) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - size_t msg_size; - void *genl_msg; - struct sk_buff *skb; - - msg_size = nla_total_size(size) - + nla_total_size(sizeof(u32)) - + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0); - result = -ENOMEM; - skb = genlmsg_new(msg_size, gfp_flags); - if (skb == NULL) - goto error_new; - genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family, - 0, WIMAX_GNL_OP_MSG_TO_USER); - if (genl_msg == NULL) { - dev_err(dev, "no memory to create generic netlink message\n"); - goto error_genlmsg_put; - } - result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "no memory to add ifindex attribute\n"); - goto error_nla_put; - } - if (pipe_name) { - result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME, - pipe_name); - if (result < 0) { - dev_err(dev, "no memory to add pipe_name attribute\n"); - goto error_nla_put; - } - } - result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg); - if (result < 0) { - dev_err(dev, "no memory to add payload (msg %p size %zu) in " - "attribute: %d\n", msg, size, result); - goto error_nla_put; - } - genlmsg_end(skb, genl_msg); - return skb; - -error_nla_put: -error_genlmsg_put: -error_new: - nlmsg_free(skb); - return ERR_PTR(result); -} -EXPORT_SYMBOL_GPL(wimax_msg_alloc); - - -/** - * wimax_msg_data_len - Return a pointer and size of a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - * @size: Pointer to where to store the message's size - * - * Returns the pointer to the message data. - */ -const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - *size = nla_len(nla); - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data_len); - - -/** - * wimax_msg_data - Return a pointer to a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -const void *wimax_msg_data(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data); - - -/** - * wimax_msg_len - Return a message's payload length - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -ssize_t wimax_msg_len(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return -EINVAL; - } - return nla_len(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_len); - - -/** - * wimax_msg_send - Send a pre-allocated message to user space - * - * @wimax_dev: WiMAX device descriptor - * - * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the - * ownership of @skb is transferred to this function. - * - * Returns: 0 if ok, < 0 errno code on error - * - * Description: - * - * Sends a free-form message that was preallocated with - * wimax_msg_alloc() and filled up. - * - * Assumes that once you pass an skb to this function for sending, it - * owns it and will release it when done (on success). - * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. - * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *msg = skb->data; - size_t size = skb->len; - might_sleep(); - - d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); - d_dump(2, dev, msg, size); - genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL); - d_printf(1, dev, "CTX: genl multicast done\n"); - return 0; -} -EXPORT_SYMBOL_GPL(wimax_msg_send); - - -/** - * wimax_msg - Send a message to user space - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @pipe_name: "named pipe" the message will be sent to - * @buf: pointer to the message to send. - * @size: size of the buffer pointed to by @buf (in bytes). - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error. - * - * Description: - * - * Sends a free-form message to user space on the device @wimax_dev. - * - * NOTES: - * - * Once the @skb is given to this function, who will own it and will - * release it when done (unless it returns error). - */ -int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name, - const void *buf, size_t size, gfp_t gfp_flags) -{ - int result = -ENOMEM; - struct sk_buff *skb; - - skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags); - if (IS_ERR(skb)) - result = PTR_ERR(skb); - else - result = wimax_msg_send(wimax_dev, skb); - return result; -} -EXPORT_SYMBOL_GPL(wimax_msg); - -/* - * Relays a message from user space to the driver - * - * The skb is passed to the driver-specific function with the netlink - * and generic netlink headers already stripped. - * - * This call will block while handling/relaying the message. - */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - struct nlmsghdr *nlh = info->nlhdr; - char *pipe_name; - void *msg_buf; - size_t msg_len; - - might_sleep(); - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { - pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - - /* Unpack arguments */ - result = -EINVAL; - if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) { - dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA " - "attribute\n"); - goto error_no_data; - } - msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]); - msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]); - - if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL) - pipe_name = NULL; - else { - struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME]; - size_t attr_len = nla_len(attr); - /* libnl-1.1 does not yet support NLA_NUL_STRING */ - result = -ENOMEM; - pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL); - if (pipe_name == NULL) - goto error_alloc; - pipe_name[attr_len] = 0; - } - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result == -ENOMEDIUM) - result = 0; - if (result < 0) - goto error_not_ready; - result = -ENOSYS; - if (wimax_dev->op_msg_from_user == NULL) - goto error_noop; - - d_printf(1, dev, - "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n", - nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, - nlh->nlmsg_seq, nlh->nlmsg_pid); - d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len); - d_dump(2, dev, msg_buf, msg_len); - - result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name, - msg_buf, msg_len, info); -error_noop: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); -error_alloc: - kfree(pipe_name); -error_no_data: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c deleted file mode 100644 index 9899b2e56721..000000000000 --- a/net/wimax/op-reset.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Implement and export a method for resetting a WiMAX device - * - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements a simple synchronous call to reset a WiMAX device. - * - * Resets aim at being warm, keeping the device handles active; - * however, when that fails, it falls back to a cold reset (that will - * disconnect and reconnect the device). - */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/export.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_reset -#include "debug-levels.h" - - -/** - * wimax_reset - Reset a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: - * - * %0 if ok and a warm reset was done (the device still exists in - * the system). - * - * -%ENODEV if a cold/bus reset had to be done (device has - * disconnected and reconnected, so current handle is not valid - * any more). - * - * -%EINVAL if the device is not even registered. - * - * Any other negative error code shall be considered as - * non-recoverable. - * - * Description: - * - * Called when wanting to reset the device for any reason. Device is - * taken back to power on status. - * - * This call blocks; on successful return, the device has completed the - * reset process and is ready to operate. - */ -int wimax_reset(struct wimax_dev *wimax_dev) -{ - int result = -EINVAL; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - mutex_lock(&wimax_dev->mutex); - dev_hold(wimax_dev->net_dev); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - - if (state >= WIMAX_ST_DOWN) { - mutex_lock(&wimax_dev->mutex_reset); - result = wimax_dev->op_reset(wimax_dev); - mutex_unlock(&wimax_dev->mutex_reset); - } - dev_put(wimax_dev->net_dev); - - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} -EXPORT_SYMBOL(wimax_reset); - - -/* - * Exporting to user space over generic netlink - * - * Parse the reset command from user space, return error code. - * - * No attributes. - */ -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_reset(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c deleted file mode 100644 index 248d10b60b05..000000000000 --- a/net/wimax/op-rfkill.c +++ /dev/null @@ -1,431 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * RF-kill framework integration - * - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This integrates into the Linux Kernel rfkill susbystem so that the - * drivers just have to do the bare minimal work, which is providing a - * method to set the software RF-Kill switch and to report changes in - * the software and hardware switch status. - * - * A non-polled generic rfkill device is embedded into the WiMAX - * subsystem's representation of a device. - * - * FIXME: Need polled support? Let drivers provide a poll routine - * and hand it to rfkill ops then? - * - * All device drivers have to do is after wimax_dev_init(), call - * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update - * initial state and then every time it changes. See wimax.h:struct - * wimax_dev for more information. - * - * ROADMAP - * - * wimax_gnl_doit_rfkill() User space calling wimax_rfkill() - * wimax_rfkill() Kernel calling wimax_rfkill() - * __wimax_rf_toggle_radio() - * - * wimax_rfkill_set_radio_block() RF-Kill subsystem calling - * __wimax_rf_toggle_radio() - * - * __wimax_rf_toggle_radio() - * wimax_dev->op_rfkill_sw_toggle() Driver backend - * __wimax_state_change() - * - * wimax_report_rfkill_sw() Driver reports state change - * __wimax_state_change() - * - * wimax_report_rfkill_hw() Driver reports state change - * __wimax_state_change() - * - * wimax_rfkill_add() Initialize/shutdown rfkill support - * wimax_rfkill_rm() [called by wimax_dev_add/rm()] - */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/rfkill.h> -#include <linux/export.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_rfkill -#include "debug-levels.h" - -/** - * wimax_report_rfkill_hw - Reports changes in the hardware RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * When the device detects a change in the state of thehardware RF - * switch, it must call this function to let the WiMAX kernel stack - * know that the state has changed so it can be properly propagated. - * - * The WiMAX stack caches the state (the driver doesn't need to). As - * well, as the change is propagated it will come back as a request to - * change the software state to mirror the hardware state. - * - * If the device doesn't have a hardware kill switch, just report - * it on initialization as always on (%WIMAX_RF_ON, radio on). - */ -void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_hw) { - wimax_dev->rf_hw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - - result = rfkill_set_hw_state(wimax_dev->rfkill, - state == WIMAX_RF_OFF); - - __wimax_state_change(wimax_dev, wimax_state); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); - - -/** - * wimax_report_rfkill_sw - Reports changes in the software RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * Reports changes in the software RF switch state to the WiMAX stack. - * - * The main use is during initialization, so the driver can query the - * device for its current software radio kill switch state and feed it - * to the system. - * - * On the side, the device does not change the software state by - * itself. In practice, this can happen, as the device might decide to - * switch (in software) the radio off for different reasons. - */ -void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_sw) { - wimax_dev->rf_sw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); - - -/* - * Callback for the RF Kill toggle operation - * - * This function is called by: - * - * - The rfkill subsystem when the RF-Kill key is pressed in the - * hardware and the driver notifies through - * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back - * here so the software RF Kill switch state is changed to reflect - * the hardware switch state. - * - * - When the user sets the state through sysfs' rfkill/state file - * - * - When the user calls wimax_rfkill(). - * - * This call blocks! - * - * WARNING! When we call rfkill_unregister(), this will be called with - * state 0! - * - * WARNING: wimax_dev must be locked - */ -static -int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - if (wimax_dev->rf_sw == state) - goto out_no_change; - if (wimax_dev->op_rfkill_sw_toggle != NULL) - result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state); - else if (state == WIMAX_RF_OFF) /* No op? can't turn off */ - result = -ENXIO; - else /* No op? can turn on */ - result = 0; /* should never happen tho */ - if (result >= 0) { - result = 0; - wimax_dev->rf_sw = state; - wimax_state = state == WIMAX_RF_ON ? - WIMAX_ST_READY : WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - } -out_no_change: - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} - - -/* - * Translate from rfkill state to wimax state - * - * NOTE: Special state handling rules here - * - * Just pretend the call didn't happen if we are in a state where - * we know for sure it cannot be handled (WIMAX_ST_DOWN or - * __WIMAX_ST_QUIESCING). rfkill() needs it to register and - * unregister, as it will run this path. - * - * NOTE: This call will block until the operation is completed. - */ -static int wimax_rfkill_set_radio_block(void *data, bool blocked) -{ - int result; - struct wimax_dev *wimax_dev = data; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_rf_state rf_state; - - d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); - rf_state = WIMAX_RF_ON; - if (blocked) - rf_state = WIMAX_RF_OFF; - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state <= __WIMAX_ST_QUIESCING) - result = 0; - else - result = __wimax_rf_toggle_radio(wimax_dev, rf_state); - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", - wimax_dev, blocked, result); - return result; -} - -static const struct rfkill_ops wimax_rfkill_ops = { - .set_block = wimax_rfkill_set_radio_block, -}; - -/** - * wimax_rfkill - Set the software RF switch state for a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New RF state. - * - * Returns: - * - * >= 0 toggle state if ok, < 0 errno code on error. The toggle state - * is returned as a bitmap, bit 0 being the hardware RF state, bit 1 - * the software RF state. - * - * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio - * off (%WIMAX_RF_OFF). - * - * Description: - * - * Called by the user when he wants to request the WiMAX radio to be - * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With - * %WIMAX_RF_QUERY, just the current state is returned. - * - * NOTE: - * - * This call will block until the operation is complete. - */ -int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) { - /* While initializing, < 1.4.3 wimax-tools versions use - * this call to check if the device is a valid WiMAX - * device; so we allow it to proceed always, - * considering the radios are all off. */ - if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY) - result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF; - goto error_not_ready; - } - switch (state) { - case WIMAX_RF_ON: - case WIMAX_RF_OFF: - result = __wimax_rf_toggle_radio(wimax_dev, state); - if (result < 0) - goto error; - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - break; - case WIMAX_RF_QUERY: - break; - default: - result = -EINVAL; - goto error; - } - result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw; -error: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} -EXPORT_SYMBOL(wimax_rfkill); - - -/* - * Register a new WiMAX device's RF Kill support - * - * WARNING: wimax_dev->mutex must be unlocked - */ -int wimax_rfkill_add(struct wimax_dev *wimax_dev) -{ - int result; - struct rfkill *rfkill; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - /* Initialize RF Kill */ - result = -ENOMEM; - rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, - &wimax_rfkill_ops, wimax_dev); - if (rfkill == NULL) - goto error_rfkill_allocate; - - d_printf(1, dev, "rfkill %p\n", rfkill); - - wimax_dev->rfkill = rfkill; - - rfkill_init_sw_state(rfkill, 1); - result = rfkill_register(wimax_dev->rfkill); - if (result < 0) - goto error_rfkill_register; - - /* If there is no SW toggle op, SW RFKill is always on */ - if (wimax_dev->op_rfkill_sw_toggle == NULL) - wimax_dev->rf_sw = WIMAX_RF_ON; - - d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); - return 0; - -error_rfkill_register: - rfkill_destroy(wimax_dev->rfkill); -error_rfkill_allocate: - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} - - -/* - * Deregister a WiMAX device's RF Kill support - * - * Ick, we can't call rfkill_free() after rfkill_unregister()...oh - * well. - * - * WARNING: wimax_dev->mutex must be unlocked - */ -void wimax_rfkill_rm(struct wimax_dev *wimax_dev) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - rfkill_unregister(wimax_dev->rfkill); - rfkill_destroy(wimax_dev->rfkill); - d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); -} - - -/* - * Exporting to user space over generic netlink - * - * Parse the rfkill command from user space, return a combination - * value that describe the states of the different toggles. - * - * Only one attribute: the new state requested (on, off or no change, - * just query). - */ - -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - enum wimax_rf_state new_state; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - result = -EINVAL; - if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) { - dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE " - "attribute\n"); - goto error_no_pid; - } - new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]); - - /* Execute the operation and send the result back to user space */ - result = wimax_rfkill(wimax_dev, new_state); -error_no_pid: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c deleted file mode 100644 index 5bc712de1563..000000000000 --- a/net/wimax/op-state-get.c +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Implement and export a method for getting a WiMAX device current state - * - * Copyright (C) 2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt> - * - * Based on previous WiMAX core work by: - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_state_get -#include "debug-levels.h" - - -/* - * Exporting to user space over generic netlink - * - * Parse the state get command from user space, return a combination - * value that describe the current state. - * - * No attributes. - */ -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_state_get(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/stack.c b/net/wimax/stack.c deleted file mode 100644 index b6dd9d956ed8..000000000000 --- a/net/wimax/stack.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Initialization, addition and removal of wimax devices - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements: - * - * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on - * addition/registration initialize all subfields and allocate - * generic netlink resources for user space communication. On - * removal/unregistration, undo all that. - * - * - device state machine [wimax_state_change()] and support to send - * reports to user space when the state changes - * [wimax_gnl_re_state_change*()]. - * - * See include/net/wimax.h for rationales and design. - * - * ROADMAP - * - * [__]wimax_state_change() Called by drivers to update device's state - * wimax_gnl_re_state_change_alloc() - * wimax_gnl_re_state_change_send() - * - * wimax_dev_init() Init a device - * wimax_dev_add() Register - * wimax_rfkill_add() - * wimax_gnl_add() Register all the generic netlink resources. - * wimax_id_table_add() - * wimax_dev_rm() Unregister - * wimax_id_table_rm() - * wimax_gnl_rm() - * wimax_rfkill_rm() - */ -#include <linux/device.h> -#include <linux/gfp.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/wimax.h> -#include <linux/module.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE stack -#include "debug-levels.h" - -static char wimax_debug_params[128]; -module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params), - 0644); -MODULE_PARM_DESC(debug, - "String of space-separated NAME:VALUE pairs, where NAMEs " - "are the different debug submodules and VALUE are the " - "initial debug value to set."); - -/* - * Authoritative source for the RE_STATE_CHANGE attribute policy - * - * We don't really use it here, but /me likes to keep the definition - * close to where the data is generated. - */ -/* -static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = { - [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 }, - [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 }, -}; -*/ - - -/* - * Allocate a Report State Change message - * - * @header: save it, you need it for _send() - * - * Creates and fills a basic state change message; different code - * paths can then add more attributes to the message as needed. - * - * Use wimax_gnl_re_state_change_send() to send the returned skb. - * - * Returns: skb with the genl message if ok, IS_ERR() ptr on error - * with an errno code. - */ -static -struct sk_buff *wimax_gnl_re_state_change_alloc( - struct wimax_dev *wimax_dev, - enum wimax_st new_state, enum wimax_st old_state, - void **header) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *data; - struct sk_buff *report_skb; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n", - wimax_dev, new_state, old_state); - result = -ENOMEM; - report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (report_skb == NULL) { - dev_err(dev, "RE_STCH: can't create message\n"); - goto error_new; - } - /* FIXME: sending a group ID as the seq is wrong */ - data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset, - &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE); - if (data == NULL) { - dev_err(dev, "RE_STCH: can't put data into message\n"); - goto error_put; - } - *header = data; - - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result); - goto error_put; - } - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result); - goto error_put; - } - result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n"); - goto error_put; - } - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n", - wimax_dev, new_state, old_state, report_skb); - return report_skb; - -error_put: - nlmsg_free(report_skb); -error_new: - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n", - wimax_dev, new_state, old_state, result); - return ERR_PTR(result); -} - - -/* - * Send a Report State Change message (as created with _alloc). - * - * @report_skb: as returned by wimax_gnl_re_state_change_alloc() - * @header: as returned by wimax_gnl_re_state_change_alloc() - * - * Returns: 0 if ok, < 0 errno code on error. - * - * If the message is NULL, pretend it didn't happen. - */ -static -int wimax_gnl_re_state_change_send( - struct wimax_dev *wimax_dev, struct sk_buff *report_skb, - void *header) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", - wimax_dev, report_skb); - if (report_skb == NULL) { - result = -ENOMEM; - goto out; - } - genlmsg_end(report_skb, header); - genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL); -out: - d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", - wimax_dev, report_skb, result); - return result; -} - - -static -void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, - unsigned int allowed_states_bm) -{ - if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { - pr_err("SW BUG! Forbidden state change %u -> %u\n", - old_state, new_state); - } -} - - -/* - * Set the current state of a WiMAX device [unlocking version of - * wimax_state_change(). - */ -void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st old_state = wimax_dev->state; - struct sk_buff *stch_skb; - void *header; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n", - wimax_dev, new_state, old_state); - - if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) { - dev_err(dev, "SW BUG: requesting invalid state %u\n", - new_state); - goto out; - } - if (old_state == new_state) - goto out; - header = NULL; /* gcc complains? can't grok why */ - stch_skb = wimax_gnl_re_state_change_alloc( - wimax_dev, new_state, old_state, &header); - - /* Verify the state transition and do exit-from-state actions */ - switch (old_state) { - case __WIMAX_ST_NULL: - __check_new_state(old_state, new_state, - 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_DOWN: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_UNINITIALIZED - | 1 << WIMAX_ST_RADIO_OFF); - break; - case __WIMAX_ST_QUIESCING: - __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_UNINITIALIZED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF); - break; - case WIMAX_ST_RADIO_OFF: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_READY); - break; - case WIMAX_ST_READY: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_SCANNING - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_SCANNING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_SCANNING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY); - netif_tx_disable(wimax_dev->net_dev); - netif_carrier_off(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n", - wimax_dev, wimax_dev->state); - WARN_ON(1); - goto out; - } - - /* Execute the actions of entry to the new state */ - switch (new_state) { - case __WIMAX_ST_NULL: - dev_err(dev, "SW BUG: wimax_dev %p entering NULL state " - "from %u\n", wimax_dev, wimax_dev->state); - WARN_ON(1); /* Nobody can enter this state */ - break; - case WIMAX_ST_DOWN: - break; - case __WIMAX_ST_QUIESCING: - break; - case WIMAX_ST_UNINITIALIZED: - break; - case WIMAX_ST_RADIO_OFF: - break; - case WIMAX_ST_READY: - break; - case WIMAX_ST_SCANNING: - break; - case WIMAX_ST_CONNECTING: - break; - case WIMAX_ST_CONNECTED: - netif_carrier_on(wimax_dev->net_dev); - netif_wake_queue(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - BUG(); - } - __wimax_state_set(wimax_dev, new_state); - if (!IS_ERR(stch_skb)) - wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header); -out: - d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n", - wimax_dev, new_state, old_state); -} - - -/** - * wimax_state_change - Set the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @new_state: New state to switch to - * - * This implements the state changes for the wimax devices. It will - * - * - verify that the state transition is legal (for now it'll just - * print a warning if not) according to the table in - * linux/wimax.h's documentation for 'enum wimax_st'. - * - * - perform the actions needed for leaving the current state and - * whichever are needed for entering the new state. - * - * - issue a report to user space indicating the new state (and an - * optional payload with information about the new state). - * - * NOTE: @wimax_dev must be locked - */ -void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - /* - * A driver cannot take the wimax_dev out of the - * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If - * the wimax_dev's state is still NULL, we ignore any request - * to change its state because it means it hasn't been yet - * registered. - * - * There is no need to complain about it, as routines that - * call this might be shared from different code paths that - * are called before or after wimax_dev_add() has done its - * job. - */ - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state > __WIMAX_ST_NULL) - __wimax_state_change(wimax_dev, new_state); - mutex_unlock(&wimax_dev->mutex); -} -EXPORT_SYMBOL_GPL(wimax_state_change); - - -/** - * wimax_state_get() - Return the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: Current state of the device according to its driver. - */ -enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev) -{ - enum wimax_st state; - mutex_lock(&wimax_dev->mutex); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - return state; -} -EXPORT_SYMBOL_GPL(wimax_state_get); - - -/** - * wimax_dev_init - initialize a newly allocated instance - * - * @wimax_dev: WiMAX device descriptor to initialize. - * - * Initializes fields of a freshly allocated @wimax_dev instance. This - * function assumes that after allocation, the memory occupied by - * @wimax_dev was zeroed. - */ -void wimax_dev_init(struct wimax_dev *wimax_dev) -{ - INIT_LIST_HEAD(&wimax_dev->id_table_node); - __wimax_state_set(wimax_dev, __WIMAX_ST_NULL); - mutex_init(&wimax_dev->mutex); - mutex_init(&wimax_dev->mutex_reset); -} -EXPORT_SYMBOL_GPL(wimax_dev_init); - -static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = { - [WIMAX_GNL_RESET_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_RFKILL_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_RFKILL_STATE] = { - .type = NLA_U32 /* enum wimax_rf_state */ - }, - [WIMAX_GNL_STGET_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_MSG_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_MSG_DATA] = { - .type = NLA_UNSPEC, /* libnl doesn't grok BINARY yet */ - }, -}; - -static const struct genl_small_ops wimax_gnl_ops[] = { - { - .cmd = WIMAX_GNL_OP_MSG_FROM_USER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_msg_from_user, - }, - { - .cmd = WIMAX_GNL_OP_RESET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_reset, - }, - { - .cmd = WIMAX_GNL_OP_RFKILL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_rfkill, - }, - { - .cmd = WIMAX_GNL_OP_STATE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_state_get, - }, -}; - - -static -size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size, - unsigned char *addr, size_t addr_len) -{ - unsigned int cnt, total; - - for (total = cnt = 0; cnt < addr_len; cnt++) - total += scnprintf(addr_str + total, addr_str_size - total, - "%02x%c", addr[cnt], - cnt == addr_len - 1 ? '\0' : ':'); - return total; -} - - -/** - * wimax_dev_add - Register a new WiMAX device - * - * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's - * priv data). You must have called wimax_dev_init() on it before. - * - * @net_dev: net device the @wimax_dev is associated with. The - * function expects SET_NETDEV_DEV() and register_netdev() were - * already called on it. - * - * Registers the new WiMAX device, sets up the user-kernel control - * interface (generic netlink) and common WiMAX infrastructure. - * - * Note that the parts that will allow interaction with user space are - * setup at the very end, when the rest is in place, as once that - * happens, the driver might get user space control requests via - * netlink or from debugfs that might translate into calls into - * wimax_dev->op_*(). - */ -int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) -{ - int result; - struct device *dev = net_dev->dev.parent; - char addr_str[32]; - - d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); - - /* Do the RFKILL setup before locking, as RFKILL will call - * into our functions. - */ - wimax_dev->net_dev = net_dev; - result = wimax_rfkill_add(wimax_dev); - if (result < 0) - goto error_rfkill_add; - - /* Set up user-space interaction */ - mutex_lock(&wimax_dev->mutex); - wimax_id_table_add(wimax_dev); - wimax_debugfs_add(wimax_dev); - - __wimax_state_set(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - - wimax_addr_scnprint(addr_str, sizeof(addr_str), - net_dev->dev_addr, net_dev->addr_len); - dev_err(dev, "WiMAX interface %s (%s) ready\n", - net_dev->name, addr_str); - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev); - return 0; - -error_rfkill_add: - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n", - wimax_dev, net_dev, result); - return result; -} -EXPORT_SYMBOL_GPL(wimax_dev_add); - - -/** - * wimax_dev_rm - Unregister an existing WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Unregisters a WiMAX device previously registered for use with - * wimax_add_rm(). - * - * IMPORTANT! Must call before calling unregister_netdev(). - * - * After this function returns, you will not get any more user space - * control requests (via netlink or debugfs) and thus to wimax_dev->ops. - * - * Reentrancy control is ensured by setting the state to - * %__WIMAX_ST_QUIESCING. rfkill operations coming through - * wimax_*rfkill*() will be stopped by the quiescing state; ops coming - * from the rfkill subsystem will be stopped by the support being - * removed by wimax_rfkill_rm(). - */ -void wimax_dev_rm(struct wimax_dev *wimax_dev) -{ - d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); - - mutex_lock(&wimax_dev->mutex); - __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING); - wimax_debugfs_rm(wimax_dev); - wimax_id_table_rm(wimax_dev); - __wimax_state_change(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - wimax_rfkill_rm(wimax_dev); - d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev); -} -EXPORT_SYMBOL_GPL(wimax_dev_rm); - - -/* Debug framework control of debug levels */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(id_table), - D_SUBMODULE_DEFINE(op_msg), - D_SUBMODULE_DEFINE(op_reset), - D_SUBMODULE_DEFINE(op_rfkill), - D_SUBMODULE_DEFINE(op_state_get), - D_SUBMODULE_DEFINE(stack), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - - -static const struct genl_multicast_group wimax_gnl_mcgrps[] = { - { .name = "msg", }, -}; - -struct genl_family wimax_gnl_family __ro_after_init = { - .name = "WiMAX", - .version = WIMAX_GNL_VERSION, - .hdrsize = 0, - .maxattr = WIMAX_GNL_ATTR_MAX, - .policy = wimax_gnl_policy, - .module = THIS_MODULE, - .small_ops = wimax_gnl_ops, - .n_small_ops = ARRAY_SIZE(wimax_gnl_ops), - .mcgrps = wimax_gnl_mcgrps, - .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps), -}; - - - -/* Shutdown the wimax stack */ -static -int __init wimax_subsys_init(void) -{ - int result; - - d_fnstart(4, NULL, "()\n"); - d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, - "wimax.debug"); - - result = genl_register_family(&wimax_gnl_family); - if (unlikely(result < 0)) { - pr_err("cannot register generic netlink family: %d\n", result); - goto error_register_family; - } - - d_fnend(4, NULL, "() = 0\n"); - return 0; - -error_register_family: - d_fnend(4, NULL, "() = %d\n", result); - return result; - -} -module_init(wimax_subsys_init); - - -/* Shutdown the wimax stack */ -static -void __exit wimax_subsys_exit(void) -{ - wimax_id_table_release(); - genl_unregister_family(&wimax_gnl_family); -} -module_exit(wimax_subsys_exit); - -MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>"); -MODULE_DESCRIPTION("Linux WiMAX stack"); -MODULE_LICENSE("GPL"); diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h deleted file mode 100644 index 40751207296c..000000000000 --- a/net/wimax/wimax-internal.h +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Internal API for kernel space WiMAX stack - * - * Copyright (C) 2007 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This header file is for declarations and definitions internal to - * the WiMAX stack. For public APIs and documentation, see - * include/net/wimax.h and include/linux/wimax.h. - */ - -#ifndef __WIMAX_INTERNAL_H__ -#define __WIMAX_INTERNAL_H__ -#ifdef __KERNEL__ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/device.h> -#include <net/wimax.h> - - -/* - * Decide if a (locked) device is ready for use - * - * Before using the device structure, it must be locked - * (wimax_dev->mutex). As well, most operations need to call this - * function to check if the state is the right one. - * - * An error value will be returned if the state is not the right - * one. In that case, the caller should not attempt to use the device - * and just unlock it. - */ -static inline __must_check -int wimax_dev_is_ready(struct wimax_dev *wimax_dev) -{ - if (wimax_dev->state == __WIMAX_ST_NULL) - return -EINVAL; /* Device is not even registered! */ - if (wimax_dev->state == WIMAX_ST_DOWN) - return -ENOMEDIUM; - if (wimax_dev->state == __WIMAX_ST_QUIESCING) - return -ESHUTDOWN; - return 0; -} - - -static inline -void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) -{ - wimax_dev->state = state; -} -void __wimax_state_change(struct wimax_dev *, enum wimax_st); - -#ifdef CONFIG_DEBUG_FS -void wimax_debugfs_add(struct wimax_dev *); -void wimax_debugfs_rm(struct wimax_dev *); -#else -static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {} -static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} -#endif - -void wimax_id_table_add(struct wimax_dev *); -struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int); -void wimax_id_table_rm(struct wimax_dev *); -void wimax_id_table_release(void); - -int wimax_rfkill_add(struct wimax_dev *); -void wimax_rfkill_rm(struct wimax_dev *); - -/* generic netlink */ -extern struct genl_family wimax_gnl_family; - -/* ops */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info); - -#endif /* #ifdef __KERNEL__ */ -#endif /* #ifndef __WIMAX_INTERNAL_H__ */ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 22d1779ab2b1..e4030f1fbc60 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -530,10 +530,10 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: break; + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: WARN_ON(1); @@ -677,12 +677,12 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: /* Can NAN type be considered as beaconing interface? */ case NL80211_IFTYPE_NAN: break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); } @@ -1324,12 +1324,12 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* these interface types don't really have a channel */ return; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); } diff --git a/net/wireless/core.c b/net/wireless/core.c index 9f23923e8d29..4b1f35e976e7 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -631,10 +631,8 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) return -EINVAL; } -#ifndef CONFIG_WIRELESS_WDS if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; -#endif /* You can't even choose that many! */ if (WARN_ON(cnt < c->max_interfaces)) @@ -675,10 +673,8 @@ int wiphy_register(struct wiphy *wiphy) !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; -#ifndef CONFIG_WIRELESS_WDS if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; -#endif if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) return -EINVAL; @@ -1202,9 +1198,6 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_OCB: __cfg80211_leave_ocb(rdev, dev); break; - case NL80211_IFTYPE_WDS: - /* must be handled by mac80211/driver, has no APIs */ - break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ @@ -1214,6 +1207,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: /* invalid */ break; @@ -1250,8 +1244,7 @@ void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_stop_iface); -void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void cfg80211_init_wdev(struct wireless_dev *wdev) { mutex_init(&wdev->mtx); INIT_LIST_HEAD(&wdev->event_list); @@ -1262,6 +1255,30 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); +#ifdef CONFIG_CFG80211_WEXT + wdev->wext.default_key = -1; + wdev->wext.default_mgmt_key = -1; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; +#endif + + if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) + wdev->ps = true; + else + wdev->ps = false; + /* allow mac80211 to determine the timeout */ + wdev->ps_timeout = -1; + + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || + wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) + wdev->netdev->priv_flags |= IFF_DONT_BRIDGE; + + INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); +} + +void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to @@ -1295,6 +1312,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, switch (state) { case NETDEV_POST_INIT: SET_NETDEV_DEVTYPE(dev, &wiphy_type); + wdev->netdev = dev; + /* can only change netns with wiphy */ + dev->features |= NETIF_F_NETNS_LOCAL; + + cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: /* @@ -1302,35 +1324,12 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * called within code protected by it when interfaces * are added with nl80211. */ - /* can only change netns with wiphy */ - dev->features |= NETIF_F_NETNS_LOCAL; - if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, "phy80211")) { pr_err("failed to add phy80211 symlink to netdev!\n"); } - wdev->netdev = dev; -#ifdef CONFIG_CFG80211_WEXT - wdev->wext.default_key = -1; - wdev->wext.default_mgmt_key = -1; - wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; -#endif - - if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) - wdev->ps = true; - else - wdev->ps = false; - /* allow mac80211 to determine the timeout */ - wdev->ps_timeout = -1; - - if ((wdev->iftype == NL80211_IFTYPE_STATION || - wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || - wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) - dev->priv_flags |= IFF_DONT_BRIDGE; - - INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); - cfg80211_init_wdev(rdev, wdev); + cfg80211_register_wdev(rdev, wdev); break; case NETDEV_GOING_DOWN: cfg80211_leave(rdev, wdev); diff --git a/net/wireless/core.h b/net/wireless/core.h index e1ec9ac8e608..e3e9686859d4 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -209,8 +209,9 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net); -void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); +void cfg80211_init_wdev(struct wireless_dev *wdev); +void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); static inline void wdev_lock(struct wireless_dev *wdev) __acquires(wdev) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 554796a6c6fe..8811a4b69f21 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -715,6 +715,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), [NL80211_ATTR_S1G_CAPABILITY_MASK] = NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), + [NL80211_ATTR_SAE_PWE] = + NLA_POLICY_RANGE(NLA_U8, NL80211_SAE_PWE_HUNT_AND_PECK, + NL80211_SAE_PWE_BOTH), }; /* policy for the key attributes */ @@ -1882,7 +1885,6 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL)) goto nla_put_failure; } - CMD(set_wds_peer, SET_WDS_PEER); if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { CMD(tdls_mgmt, TDLS_MGMT); CMD(tdls_oper, TDLS_OPER); @@ -2860,8 +2862,8 @@ static int parse_txq_params(struct nlattr *tb[], static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) { /* - * You can only set the channel explicitly for WDS interfaces, - * all others have their channel managed via their respective + * You can only set the channel explicitly for some interfaces, + * most have their channel managed via their respective * "establish a connection" command (connect, join, ...) * * For AP/GO and mesh mode, the channel can be set with the @@ -3066,29 +3068,6 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) return __nl80211_set_channel(rdev, netdev, info); } -static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) -{ - struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct net_device *dev = info->user_ptr[1]; - struct wireless_dev *wdev = dev->ieee80211_ptr; - const u8 *bssid; - - if (!info->attrs[NL80211_ATTR_MAC]) - return -EINVAL; - - if (netif_running(dev)) - return -EBUSY; - - if (!rdev->ops->set_wds_peer) - return -EOPNOTSUPP; - - if (wdev->iftype != NL80211_IFTYPE_WDS) - return -EOPNOTSUPP; - - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - return rdev_set_wds_peer(rdev, dev, bssid); -} - static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; @@ -3885,7 +3864,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) * P2P Device and NAN do not have a netdev, so don't go * through the netdev notifier and must be added here */ - cfg80211_init_wdev(rdev, wdev); + cfg80211_init_wdev(wdev); + cfg80211_register_wdev(rdev, wdev); break; default: break; @@ -4594,7 +4574,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask, - struct net_device *dev) + struct net_device *dev, + bool default_all_enabled) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4609,6 +4590,9 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + if (!default_all_enabled) + break; + sband = rdev->wiphy.bands[i]; if (!sband) @@ -4676,6 +4660,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].ht_mcs)) return -EINVAL; } + if (tb[NL80211_TXRATE_VHT]) { if (!vht_set_mcs_mask( sband, @@ -4683,6 +4668,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].vht_mcs)) return -EINVAL; } + if (tb[NL80211_TXRATE_GI]) { mask->control[band].gi = nla_get_u8(tb[NL80211_TXRATE_GI]); @@ -4694,6 +4680,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, nla_data(tb[NL80211_TXRATE_HE]), mask->control[band].he_mcs)) return -EINVAL; + if (tb[NL80211_TXRATE_HE_GI]) mask->control[band].he_gi = nla_get_u8(tb[NL80211_TXRATE_HE_GI]); @@ -4735,7 +4722,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, i; + u32 count_ht, count_vht, count_he, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -4768,7 +4755,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht)) + count_he = 0; + for (i = 0; i < NL80211_HE_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].he_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].he_mcs[i]) { + count_he++; + if (count_he > 1) + return -EINVAL; + } + if (count_he && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he) || + (!rate && !count_ht && !count_vht && !count_he)) return -EINVAL; if (rate && @@ -4783,6 +4784,10 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, !wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_RATE_VHT)) return -EINVAL; + if (count_he && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_HE)) + return -EINVAL; return 0; } @@ -5243,7 +5248,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, ¶ms.beacon_rate, - dev); + dev, false); if (err) return err; @@ -9731,6 +9736,12 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); } + if (info->attrs[NL80211_ATTR_SAE_PWE]) + settings->sae_pwe = + nla_get_u8(info->attrs[NL80211_ATTR_SAE_PWE]); + else + settings->sae_pwe = NL80211_SAE_PWE_UNSPECIFIED; + return 0; } @@ -11087,7 +11098,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, - dev); + dev, true); if (err) return err; @@ -11696,7 +11707,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &setup.beacon_rate, - dev); + dev, false); if (err) return err; @@ -14476,7 +14487,8 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, - &tid_conf->txrate_mask, dev); + &tid_conf->txrate_mask, dev, + true); if (err) return err; @@ -15139,14 +15151,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { NL80211_FLAG_NEED_RTNL, }, { - .cmd = NL80211_CMD_SET_WDS_PEER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_set_wds_peer, - .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, - }, - { .cmd = NL80211_CMD_JOIN_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_mesh, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 950d57494168..5e2f349c92a8 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -582,16 +582,6 @@ static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, return ret; } -static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *addr) -{ - int ret; - trace_rdev_set_wds_peer(&rdev->wiphy, dev, addr); - ret = rdev->ops->set_wds_peer(&rdev->wiphy, dev, addr); - trace_rdev_return_int(&rdev->wiphy, ret); - return ret; -} - static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3dab859641e1..a04fdfb35f07 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3616,7 +3616,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) power_rule = ®_rule->power_rule; if (reg_rule->flags & NL80211_RRF_AUTO_BW) - snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", + snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO", freq_range->max_bandwidth_khz, reg_get_max_bandwidth(rd, reg_rule)); else diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8d0e49c46db3..3409f37d838b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -694,7 +694,7 @@ static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, struct cfg80211_scan_request *request) { - u8 i; + int i; u32 s_ssid; for (i = 0; i < request->n_ssids; i++) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 6e218a0acd4e..817c6fef13be 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -838,11 +838,6 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath, TP_ARGS(wiphy, netdev, mac) ); -DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac), - TP_ARGS(wiphy, netdev, mac) -); - TRACE_EVENT(rdev_dump_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *mac), diff --git a/net/wireless/util.c b/net/wireless/util.c index f01746894a4e..5af88037f1fb 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -550,8 +550,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): - if (unlikely(iftype != NL80211_IFTYPE_WDS && - iftype != NL80211_IFTYPE_MESH_POINT && + if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; @@ -1051,7 +1050,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; @@ -1063,6 +1061,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; @@ -1276,20 +1275,22 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { -#define SCALE 2048 - u16 mcs_divisors[12] = { - 34133, /* 16.666666... */ - 17067, /* 8.333333... */ - 11378, /* 5.555555... */ - 8533, /* 4.166666... */ - 5689, /* 2.777777... */ - 4267, /* 2.083333... */ - 3923, /* 1.851851... */ - 3413, /* 1.666666... */ - 2844, /* 1.388888... */ - 2560, /* 1.250000... */ - 2276, /* 1.111111... */ - 2048, /* 1.000000... */ +#define SCALE 6144 + u32 mcs_divisors[14] = { + 102399, /* 16.666666... */ + 51201, /* 8.333333... */ + 34134, /* 5.555555... */ + 25599, /* 4.166666... */ + 17067, /* 2.777777... */ + 12801, /* 2.083333... */ + 11769, /* 1.851851... */ + 10239, /* 1.666666... */ + 8532, /* 1.388888... */ + 7680, /* 1.250000... */ + 6828, /* 1.111111... */ + 6144, /* 1.000000... */ + 5690, /* 0.926106... */ + 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_969[3] = { 480388888, 453700000, 408333333 }; @@ -1301,7 +1302,7 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) u64 tmp; u32 result; - if (WARN_ON_ONCE(rate->mcs > 11)) + if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 78f2927ead7f..b84a345b2653 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -49,9 +49,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, case IW_MODE_ADHOC: type = NL80211_IFTYPE_ADHOC; break; - case IW_MODE_REPEAT: - type = NL80211_IFTYPE_WDS; - break; case IW_MODE_MONITOR: type = NL80211_IFTYPE_MONITOR; break; @@ -1150,50 +1147,6 @@ static int cfg80211_wext_giwpower(struct net_device *dev, return 0; } -static int cfg80211_wds_wext_siwap(struct net_device *dev, - struct iw_request_info *info, - struct sockaddr *addr, char *extra) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int err; - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) - return -EINVAL; - - if (addr->sa_family != ARPHRD_ETHER) - return -EINVAL; - - if (netif_running(dev)) - return -EBUSY; - - if (!rdev->ops->set_wds_peer) - return -EOPNOTSUPP; - - err = rdev_set_wds_peer(rdev, dev, (u8 *)&addr->sa_data); - if (err) - return err; - - memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN); - - return 0; -} - -static int cfg80211_wds_wext_giwap(struct net_device *dev, - struct iw_request_info *info, - struct sockaddr *addr, char *extra) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) - return -EINVAL; - - addr->sa_family = ARPHRD_ETHER; - memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN); - - return 0; -} - static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, struct iw_param *rate, char *extra) @@ -1371,8 +1324,6 @@ static int cfg80211_wext_siwap(struct net_device *dev, return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); - case NL80211_IFTYPE_WDS: - return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra); default: return -EOPNOTSUPP; } @@ -1389,8 +1340,6 @@ static int cfg80211_wext_giwap(struct net_device *dev, return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); - case NL80211_IFTYPE_WDS: - return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra); default: return -EOPNOTSUPP; } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 0bbb283f23c9..d41fffb2507b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -200,22 +200,6 @@ static void x25_remove_socket(struct sock *sk) } /* - * Kill all bound sockets on a dropped device. - */ -static void x25_kill_by_device(struct net_device *dev) -{ - struct sock *s; - - write_lock_bh(&x25_list_lock); - - sk_for_each(s, &x25_list) - if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev) - x25_disconnect(s, ENETUNREACH, 0, 0); - - write_unlock_bh(&x25_list_lock); -} - -/* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, @@ -233,21 +217,31 @@ static int x25_device_event(struct notifier_block *this, unsigned long event, #endif ) { switch (event) { - case NETDEV_UP: + case NETDEV_REGISTER: + case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; - case NETDEV_GOING_DOWN: + case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { - x25_terminate_link(nb); + x25_link_terminated(nb); x25_neigh_put(nb); } - break; - case NETDEV_DOWN: - x25_kill_by_device(dev); x25_route_device_down(dev); + break; + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_UNREGISTER: x25_link_device_down(dev); break; + case NETDEV_CHANGE: + if (!netif_carrier_ok(dev)) { + nb = x25_get_neigh(dev); + if (nb) { + x25_link_terminated(nb); + x25_neigh_put(nb); + } + } + break; } } @@ -681,7 +675,8 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) { + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } @@ -775,7 +770,8 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; @@ -825,7 +821,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; rc = 0; out_put_neigh: - if (rc) { + if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; @@ -1050,6 +1046,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; + x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index fdae054b7dc1..f92073f3cb11 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -74,16 +74,43 @@ void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb, switch (frametype) { case X25_RESTART_REQUEST: - confirm = !x25_t20timer_pending(nb); - x25_stop_t20timer(nb); - nb->state = X25_LINK_STATE_3; - if (confirm) + switch (nb->state) { + case X25_LINK_STATE_2: + confirm = !x25_t20timer_pending(nb); + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + if (confirm) + x25_transmit_restart_confirmation(nb); + break; + case X25_LINK_STATE_3: + /* clear existing virtual calls */ + x25_kill_by_neigh(nb); + x25_transmit_restart_confirmation(nb); + break; + } break; case X25_RESTART_CONFIRMATION: - x25_stop_t20timer(nb); - nb->state = X25_LINK_STATE_3; + switch (nb->state) { + case X25_LINK_STATE_2: + if (x25_t20timer_pending(nb)) { + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + } else { + x25_transmit_restart_request(nb); + x25_start_t20timer(nb); + } + break; + case X25_LINK_STATE_3: + /* clear existing virtual calls */ + x25_kill_by_neigh(nb); + + x25_transmit_restart_request(nb); + nb->state = X25_LINK_STATE_2; + x25_start_t20timer(nb); + break; + } break; case X25_DIAGNOSTIC: @@ -214,8 +241,6 @@ void x25_link_established(struct x25_neigh *nb) { switch (nb->state) { case X25_LINK_STATE_0: - nb->state = X25_LINK_STATE_2; - break; case X25_LINK_STATE_1: x25_transmit_restart_request(nb); nb->state = X25_LINK_STATE_2; @@ -232,6 +257,9 @@ void x25_link_established(struct x25_neigh *nb) void x25_link_terminated(struct x25_neigh *nb) { nb->state = X25_LINK_STATE_0; + skb_queue_purge(&nb->queue); + x25_stop_t20timer(nb); + /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */ x25_kill_by_neigh(nb); } @@ -277,9 +305,6 @@ void x25_link_device_up(struct net_device *dev) */ static void __x25_remove_neigh(struct x25_neigh *nb) { - skb_queue_purge(&nb->queue); - x25_stop_t20timer(nb); - if (nb->node.next) { list_del(&nb->node); x25_neigh_put(nb); diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 00e46c9a5280..ec2a39e9b3e6 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -115,9 +115,6 @@ void x25_route_device_down(struct net_device *dev) __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); - - /* Remove any related forwarding */ - x25_clear_forward_by_dev(dev); } /* diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 56d052bc65cb..56a28a686988 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -66,18 +66,31 @@ static void xdp_umem_release(struct xdp_umem *umem) kfree(umem); } +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); } -void xdp_put_umem(struct xdp_umem *umem) +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) { if (!umem) return; - if (refcount_dec_and_test(&umem->users)) - xdp_umem_release(umem); + if (refcount_dec_and_test(&umem->users)) { + if (defer_cleanup) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } else { + xdp_umem_release(umem); + } + } } static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 181fdda2f2a8..aa9fe2780410 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -9,7 +9,7 @@ #include <net/xdp_sock_drv.h> void xdp_get_umem(struct xdp_umem *umem); -void xdp_put_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b71a32eeae65..56c46e5f57bc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,6 +23,7 @@ #include <linux/netdevice.h> #include <linux/rculist.h> #include <net/xdp_sock_drv.h> +#include <net/busy_poll.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -232,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + sk_mark_napi_id_once_xdp(&xs->sk, xdp); len = xdp->data_end - xdp->data; return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? @@ -332,6 +334,63 @@ out: } EXPORT_SYMBOL(xsk_tx_peek_desc); +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + struct xdp_sock *xs; + u32 nb_pkts; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, descs, max_entries); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. + */ + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); + if (!nb_pkts) + goto out; + + xskq_cons_release_n(xs->tx, nb_pkts); + __xskq_cons_release(xs->tx); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; @@ -411,11 +470,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; - /* Hinder dev_direct_xmit from freeing the packet and - * therefore completing it in the destructor - */ - refcount_inc(&skb->users); - err = dev_direct_xmit(skb, xs->queue_id); + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; @@ -429,12 +484,10 @@ static int xsk_generic_xmit(struct sock *sk) /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ - kfree_skb(skb); err = -EBUSY; goto out; } - consume_skb(skb); sent_frame = true; } @@ -460,18 +513,65 @@ static int __xsk_sendmsg(struct sock *sk) return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); } +static bool xsk_no_wakeup(struct sock *sk) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + /* Prefer busy-polling, skip the wakeup. */ + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; +#else + return false; +#endif +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; + + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + pool = xs->pool; + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) + return __xsk_sendmsg(sk); + return 0; +} + +static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + bool need_wait = !(flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->rx)) + return -ENOBUFS; if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(need_wait)) return -EOPNOTSUPP; - return __xsk_sendmsg(sk); + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_RX); + return 0; } static __poll_t xsk_poll(struct file *file, struct socket *sock, @@ -548,7 +648,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { - WARN_ON(xsk_map_inc(node->map)); + bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } @@ -578,7 +678,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); - xsk_map_put(map); + bpf_map_put(&map->map); } } @@ -1134,7 +1234,7 @@ static const struct proto_ops xsk_proto_ops = { .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, + .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; @@ -1146,7 +1246,8 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xp_put_pool(xs->pool); + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem, !xs->pool); sk_refcnt_debug_dec(sk); } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index b9e896cee5bb..edcf249ad1f1 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 64c9e55d4d4e..556d82d03687 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -144,14 +144,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_NEED_WAKEUP) { + if (flags & XDP_USE_NEED_WAKEUP) pool->uses_need_wakeup = true; - /* Tx needs to be explicitly woken up the first time. - * Also for supporting drivers that do not implement this - * feature. They will always have to call sendto(). - */ - pool->cached_need_wakeup = XDP_WAKEUP_TX; - } + /* Tx needs to be explicitly woken up the first time. Also + * for supporting drivers that do not implement this + * feature. They will always have to call sendto() or poll(). + */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; dev_hold(netdev); @@ -185,8 +184,10 @@ err_unreg_xsk: err_unreg_pool: if (!force_zc) err = 0; /* fallback to copy mode */ - if (err) + if (err) { xsk_clear_pool_at_qid(netdev, queue_id); + dev_put(netdev); + } return err; } @@ -242,7 +243,7 @@ static void xp_release_deferred(struct work_struct *work) pool->cq = NULL; } - xdp_put_umem(pool->umem); + xdp_put_umem(pool->umem, false); xp_destroy(pool); } @@ -251,15 +252,18 @@ void xp_get_pool(struct xsk_buff_pool *pool) refcount_inc(&pool->users); } -void xp_put_pool(struct xsk_buff_pool *pool) +bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) - return; + return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); + return true; } + + return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..b936c46b1e16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -18,9 +18,11 @@ struct xdp_ring { /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ - u32 pad ____cacheline_aligned_in_smp; + u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ @@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, + struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 cached_cons = q->cached_cons, nb_entries = 0; + + while (cached_cons != q->cached_prod && nb_entries < max) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + descs[nb_entries] = ring->desc[idx]; + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { + /* Skip the entry */ + cached_cons++; + continue; + } + + nb_entries++; + cached_cons++; + } + + return nb_entries; +} + /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) @@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q) __xskq_cons_peek(q); } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; - if (entries >= cnt) - return true; + if (entries >= max) + return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; - return entries >= cnt; + return entries >= max ? max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) @@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } +static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 entries = xskq_cons_nb_entries(q, max); + + return xskq_cons_read_desc_batch(q, descs, pool, entries); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ static inline void xskq_cons_release(struct xsk_queue *q) { - /* To improve performance, only update local state here. - * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries() and whenever - * Rx or Tx processing are completed in the NAPI loop. - */ q->cached_cons++; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -266,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) /* Functions for producers */ -static inline bool xskq_prod_is_full(struct xsk_queue *q) +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (free_entries) - return false; + if (free_entries >= max) + return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return !free_entries; + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -302,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 max) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 nb_entries, i, cached_prod; + + nb_entries = xskq_prod_nb_free(q, max); + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; + + return nb_entries; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 49da2b8ace8b..113fd9017203 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -11,32 +11,17 @@ #include "xsk.h" -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock **map_entry) { struct xsk_map_node *node; - int err; - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + node = bpf_map_kzalloc(&map->map, sizeof(*node), + GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } + bpf_map_inc(&map->map); node->map = map; node->map_entry = map_entry; @@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, static void xsk_map_node_free(struct xsk_map_node *node) { - xsk_map_put(node->map); + bpf_map_put(&node->map->map); kfree(node); } @@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - struct bpf_map_memory mem; - int err, numa_node; struct xsk_map *m; + int numa_node; u64 size; if (!capable(CAP_NET_ADMIN)) @@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); + if (!m) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); return &m->map; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 37456d022cfa..be6351e3f3cd 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -760,9 +760,9 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); -static void xfrm_trans_reinject(unsigned long data) +static void xfrm_trans_reinject(struct tasklet_struct *t) { - struct xfrm_trans_tasklet *trans = (void *)data; + struct xfrm_trans_tasklet *trans = from_tasklet(trans, t, tasklet); struct sk_buff_head queue; struct sk_buff *skb; @@ -818,7 +818,6 @@ void __init xfrm_input_init(void) trans = &per_cpu(xfrm_trans_tasklet, i); __skb_queue_head_init(&trans->queue); - tasklet_init(&trans->tasklet, xfrm_trans_reinject, - (unsigned long)trans); + tasklet_setup(&trans->tasklet, xfrm_trans_reinject); } } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index aa4cdcf69d47..697cdcfbb5e1 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -319,12 +319,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += length; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); + dev_sw_netstats_tx_add(dev, 1, length); } else { stats->tx_errors++; stats->tx_aborted_errors++; @@ -538,15 +533,6 @@ static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) return err; } -static void xfrmi_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *s) -{ - dev_fetch_sw_netstats(s, dev->tstats); - - s->rx_dropped = dev->stats.rx_dropped; - s->tx_dropped = dev->stats.tx_dropped; -} - static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); @@ -554,12 +540,11 @@ static int xfrmi_get_iflink(const struct net_device *dev) return xi->p.link; } - static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, - .ndo_get_stats64 = xfrmi_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; @@ -803,14 +788,14 @@ static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, - .priority = -1, + .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, - .priority = -1, + .priority = 2, }; #endif @@ -848,14 +833,14 @@ static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, - .priority = -1, + .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, - .priority = -1, + .priority = 2, }; #endif diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bbd4643d7e82..a77da7aae6fe 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2004,6 +2004,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) int err = -ENOENT; __be32 minspi = htonl(low); __be32 maxspi = htonl(high); + __be32 newspi = 0; u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); @@ -2022,21 +2023,22 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) xfrm_state_put(x0); goto unlock; } - x->id.spi = minspi; + newspi = minspi; } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { spi = low + prandom_u32()%(high-low+1); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { - x->id.spi = htonl(spi); + newspi = htonl(spi); break; } xfrm_state_put(x0); } } - if (x->id.spi) { + if (newspi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); + x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); spin_unlock_bh(&net->xfrm.xfrm_state_lock); |