Diffstat (limited to 'net/openvswitch')
-rw-r--r--  net/openvswitch/actions.c            | 176
-rw-r--r--  net/openvswitch/conntrack.c          | 326
-rw-r--r--  net/openvswitch/conntrack.h          |  14
-rw-r--r--  net/openvswitch/datapath.c           |  34
-rw-r--r--  net/openvswitch/datapath.h           |   2
-rw-r--r--  net/openvswitch/flow.c               | 153
-rw-r--r--  net/openvswitch/flow.h               |  77
-rw-r--r--  net/openvswitch/flow_netlink.c       | 274
-rw-r--r--  net/openvswitch/flow_netlink.h       |   7
-rw-r--r--  net/openvswitch/vport-internal_dev.c |  16
-rw-r--r--  net/openvswitch/vport-netdev.c       |  10
-rw-r--r--  net/openvswitch/vport.c              |  48
-rw-r--r--  net/openvswitch/vport.h              |   3
13 files changed, 806 insertions(+), 334 deletions(-)
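For orientation before the unified diff: the patch threads a new per-flow-key mac_proto byte through actions.c, flow.c and flow.h so that layer 3 (Ethernet-less) packets can be switched, and reuses the top bit of that byte as the flow-key-invalid marker that was previously encoded as eth.type == 0. The sketch below is assembled from the definitions visible in the flow.h and actions.c hunks further down (enum sw_flow_mac_proto, SW_FLOW_KEY_INVALID, invalidate_flow_key(), is_flow_key_valid(), ovs_key_mac_proto()); struct sw_flow_key_stub and key_mac_proto() are illustrative stand-ins, not names from the patch, and this is not buildable kernel code.

/* Minimal, self-contained sketch of the mac_proto encoding used by the patch.
 * The struct is a stub; only the bit handling is shown.
 */
#include <stdbool.h>

enum sw_flow_mac_proto {
	MAC_PROTO_NONE = 0,		/* layer 3 packet, no Ethernet header */
	MAC_PROTO_ETHERNET,		/* ordinary Ethernet frame */
};
#define SW_FLOW_KEY_INVALID	0x80	/* high bit marks a stale flow key */

struct sw_flow_key_stub {		/* stand-in for struct sw_flow_key */
	unsigned char mac_proto;	/* MAC_PROTO_* plus the invalid bit */
};

/* After an action rewrites packet headers, the cached key no longer matches
 * the packet, so only the invalid bit is set; the MAC protocol is preserved.
 */
static void invalidate_flow_key(struct sw_flow_key_stub *key)
{
	key->mac_proto |= SW_FLOW_KEY_INVALID;
}

static bool is_flow_key_valid(const struct sw_flow_key_stub *key)
{
	return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}

/* Readers of the protocol value always mask the flag off, mirroring
 * ovs_key_mac_proto() in the flow.h hunk below.
 */
static unsigned char key_mac_proto(const struct sw_flow_key_stub *key)
{
	return key->mac_proto & ~SW_FLOW_KEY_INVALID;
}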
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4e03f64709bc..c82301ce3fff 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -62,9 +62,11 @@ struct ovs_frag_data { struct vport *vport; struct ovs_skb_cb cb; __be16 inner_protocol; - __u16 vlan_tci; + u16 network_offset; /* valid only for MPLS */ + u16 vlan_tci; __be16 vlan_proto; unsigned int l2_len; + u8 mac_proto; u8 l2_data[MAX_L2_LEN]; }; @@ -136,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, static void invalidate_flow_key(struct sw_flow_key *key) { - key->eth.type = htons(0); + key->mac_proto |= SW_FLOW_KEY_INVALID; } static bool is_flow_key_valid(const struct sw_flow_key *key) { - return !!key->eth.type; + return !(key->mac_proto & SW_FLOW_KEY_INVALID); } static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, @@ -185,7 +187,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); - update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); + if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) + update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); skb->protocol = mpls->mpls_ethertype; invalidate_flow_key(key); @@ -195,7 +198,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, const __be16 ethertype) { - struct ethhdr *hdr; int err; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); @@ -211,11 +213,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_reset_mac_header(skb); skb_set_network_header(skb, skb->mac_len); - /* mpls_hdr() is used to locate the ethertype field correctly in the - * presence of VLAN tags. - */ - hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); - update_ethertype(skb, hdr, ethertype); + if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) { + struct ethhdr *hdr; + + /* mpls_hdr() is used to locate the ethertype field correctly in the + * presence of VLAN tags. + */ + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); + update_ethertype(skb, hdr, ethertype); + } if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -311,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +/* pop_eth does not support VLAN packets as this action is never called + * for them. 
+ */ +static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) +{ + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_eth *ethh) +{ + struct ethhdr *hdr; + + /* Add the new Ethernet header */ + if (skb_cow_head(skb, ETH_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + hdr = eth_hdr(skb); + ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); + ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); + hdr->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, hdr, ETH_HLEN); + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -666,7 +713,13 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); - ovs_vport_send(vport, skb); + if (eth_p_mpls(skb->protocol)) { + skb->inner_network_header = skb->network_header; + skb_set_network_header(skb, data->network_offset); + skb_reset_mac_len(skb); + } + + ovs_vport_send(vport, skb, data->mac_proto); return 0; } @@ -684,7 +737,8 @@ static struct dst_ops ovs_dst_ops = { /* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is * ovs_vport_output(), which is called once per fragmented packet. */ -static void prepare_frag(struct vport *vport, struct sk_buff *skb) +static void prepare_frag(struct vport *vport, struct sk_buff *skb, + u16 orig_network_offset, u8 mac_proto) { unsigned int hlen = skb_network_offset(skb); struct ovs_frag_data *data; @@ -694,8 +748,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb) data->vport = vport; data->cb = *OVS_CB(skb); data->inner_protocol = skb->inner_protocol; + data->network_offset = orig_network_offset; data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; + data->mac_proto = mac_proto; data->l2_len = hlen; memcpy(&data->l2_data, skb->data, hlen); @@ -704,18 +760,27 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb) } static void ovs_fragment(struct net *net, struct vport *vport, - struct sk_buff *skb, u16 mru, __be16 ethertype) + struct sk_buff *skb, u16 mru, + struct sw_flow_key *key) { + u16 orig_network_offset = 0; + + if (eth_p_mpls(skb->protocol)) { + orig_network_offset = skb_network_offset(skb); + skb->network_header = skb->inner_network_header; + } + if (skb_network_offset(skb) > MAX_L2_LEN) { OVS_NLERR(1, "L2 header too long to fragment"); goto err; } - if (ethertype == htons(ETH_P_IP)) { + if (key->eth.type == htons(ETH_P_IP)) { struct dst_entry ovs_dst; unsigned long orig_dst; - prepare_frag(vport, skb); + prepare_frag(vport, skb, orig_network_offset, + ovs_key_mac_proto(key)); dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); ovs_dst.dev = vport->dev; @@ -726,16 +791,16 @@ static void ovs_fragment(struct net *net, struct vport *vport, ip_do_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); - } else if (ethertype == htons(ETH_P_IPV6)) { + } else if (key->eth.type == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); unsigned long orig_dst; 
struct rt6_info ovs_rt; - if (!v6ops) { + if (!v6ops) goto err; - } - prepare_frag(vport, skb); + prepare_frag(vport, skb, orig_network_offset, + ovs_key_mac_proto(key)); memset(&ovs_rt, 0, sizeof(ovs_rt)); dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -749,7 +814,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, refdst_drop(orig_dst); } else { WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", - ovs_vport_name(vport), ntohs(ethertype), mru, + ovs_vport_name(vport), ntohs(key->eth.type), mru, vport->dev->mtu); goto err; } @@ -769,26 +834,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, u32 cutlen = OVS_CB(skb)->cutlen; if (unlikely(cutlen > 0)) { - if (skb->len - cutlen > ETH_HLEN) + if (skb->len - cutlen > ovs_mac_header_len(key)) pskb_trim(skb, skb->len - cutlen); else - pskb_trim(skb, ETH_HLEN); + pskb_trim(skb, ovs_mac_header_len(key)); } - if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { - ovs_vport_send(vport, skb); + if (likely(!mru || + (skb->len <= mru + vport->dev->hard_header_len))) { + ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); } else if (mru <= vport->dev->mtu) { struct net *net = read_pnet(&dp->net); - __be16 ethertype = key->eth.type; - if (!is_flow_key_valid(key)) { - if (eth_p_mpls(skb->protocol)) - ethertype = skb->inner_protocol; - else - ethertype = vlan_get_protocol(skb); - } - - ovs_fragment(net, vport, skb, mru, ethertype); + ovs_fragment(net, vport, skb, mru, key); } else { kfree_skb(skb); } @@ -1015,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_CT_LABELS: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: err = -EINVAL; break; } @@ -1082,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len) { - /* Every output action needs a separate clone of 'skb', but the common - * case is just a single output action, so that doing a clone and - * then freeing the original skbuff is wasteful. So the following code - * is slightly obscure just to avoid that. - */ - int prev_port = -1; const struct nlattr *a; int rem; @@ -1095,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (unlikely(prev_port != -1)) { - struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); - - if (out_skb) - do_output(dp, out_skb, prev_port, key); + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: { + int port = nla_get_u32(a); + struct sk_buff *clone; + + /* Every output action needs a separate clone + * of 'skb', In case the output action is the + * last action, cloning can be avoided. + */ + if (nla_is_last(a, rem)) { + do_output(dp, skb, port, key); + /* 'skb' has been used for output. + */ + return 0; + } + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) + do_output(dp, clone, port, key); OVS_CB(skb)->cutlen = 0; - prev_port = -1; - } - - switch (nla_type(a)) { - case OVS_ACTION_ATTR_OUTPUT: - prev_port = nla_get_u32(a); break; + } case OVS_ACTION_ATTR_TRUNC: { struct ovs_action_trunc *trunc = nla_data(a); @@ -1182,6 +1244,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (err) return err == -EINPROGRESS ? 
0 : err; break; + + case OVS_ACTION_ATTR_PUSH_ETH: + err = push_eth(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_ETH: + err = pop_eth(skb, key); + break; } if (unlikely(err)) { @@ -1190,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } } - if (prev_port != -1) - do_output(dp, skb, prev_port, key); - else - consume_skb(skb); - + consume_skb(skb); return 0; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index fecefa2dc94e..7b2c2fce408a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -65,6 +65,7 @@ struct ovs_conntrack_info { struct nf_conn *ct; u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ + u8 force : 1; u16 family; struct md_mark mark; struct md_labels labels; @@ -73,6 +74,8 @@ struct ovs_conntrack_info { #endif }; +static bool labels_nonzero(const struct ovs_key_ct_labels *labels); + static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct) #endif } +/* Guard against conntrack labels max size shrinking below 128 bits. */ +#if NF_CT_LABELS_MAX_SIZE < 16 +#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes +#endif + static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; - if (cl) { - size_t len = sizeof(cl->bits); + if (cl) + memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); + else + memset(labels, 0, OVS_CT_LABELS_LEN); +} - if (len > OVS_CT_LABELS_LEN) - len = OVS_CT_LABELS_LEN; - else if (len < OVS_CT_LABELS_LEN) - memset(labels, 0, OVS_CT_LABELS_LEN); - memcpy(labels, cl->bits, len); +static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, + const struct nf_conntrack_tuple *orig, + u8 icmp_proto) +{ + key->ct_orig_proto = orig->dst.protonum; + if (orig->dst.protonum == icmp_proto) { + key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); + key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); } else { - memset(labels, 0, OVS_CT_LABELS_LEN); + key->ct.orig_tp.src = orig->src.u.all; + key->ct.orig_tp.dst = orig->dst.u.all; } } @@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) { - key->ct.state = state; - key->ct.zone = zone->id; + key->ct_state = state; + key->ct_zone = zone->id; key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_labels(ct, &key->ct.labels); + + if (ct) { + const struct nf_conntrack_tuple *orig; + + /* Use the master if we have one. */ + if (ct->master) + ct = ct->master; + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* IP version must match with the master connection. */ + if (key->eth.type == htons(ETH_P_IP) && + nf_ct_l3num(ct) == NFPROTO_IPV4) { + key->ipv4.ct_orig.src = orig->src.u3.ip; + key->ipv4.ct_orig.dst = orig->dst.u3.ip; + __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); + return; + } else if (key->eth.type == htons(ETH_P_IPV6) && + !sw_flow_key_is_nd(key) && + nf_ct_l3num(ct) == NFPROTO_IPV6) { + key->ipv6.ct_orig.src = orig->src.u3.in6; + key->ipv6.ct_orig.dst = orig->dst.u3.in6; + __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); + return; + } + } + /* Clear 'ct_orig_proto' to mark the non-existence of conntrack + * original direction key fields. + */ + key->ct_orig_proto = 0; } -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has +/* Update 'key' based on skb->_nfct. 
If 'post_ct' is true, then OVS has * previously sent the packet to conntrack via the ct action. If * 'keep_nat_flags' is true, the existing NAT flags retained, else they are * initialized from the connection status. @@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb, if (ct->master) state |= OVS_CS_F_RELATED; if (keep_nat_flags) { - state |= key->ct.state & OVS_CS_F_NAT_MASK; + state |= key->ct_state & OVS_CS_F_NAT_MASK; } else { if (ct->status & IPS_SRC_NAT) state |= OVS_CS_F_SRC_NAT; @@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) ovs_ct_update_key(skb, NULL, key, false, false); } -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +#define IN6_ADDR_INITIALIZER(ADDR) \ + { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ + (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } + +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { - if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && - nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), - &key->ct.labels)) + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), + &output->ct.labels)) return -EMSGSIZE; + if (swkey->ct_orig_proto) { + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ct_tuple_ipv4 orig = { + output->ipv4.ct_orig.src, + output->ipv4.ct_orig.dst, + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct_orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, + sizeof(orig), &orig)) + return -EMSGSIZE; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ct_tuple_ipv6 orig = { + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct_orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, + sizeof(orig), &orig)) + return -EMSGSIZE; + } + } + return 0; } -static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, +static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, u32 ct_mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; u32 new_mark; - /* The connection could be invalid, in which case set_mark is no-op. 
*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; - new_mark = ct_mark | (ct->mark & ~(mask)); if (ct->mark != new_mark) { ct->mark = new_mark; - nf_conntrack_event_cache(IPCT_MARK, ct); + if (nf_ct_is_confirmed(ct)) + nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; } @@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, #endif } -static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ct_labels *labels, - const struct ovs_key_ct_labels *mask) +static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; - struct nf_conn *ct; - int err; - - /* The connection could be invalid, in which case set_label is no-op.*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; cl = nf_ct_labels_find(ct); if (!cl) { nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) + + return cl; +} + +/* Initialize labels for a new, yet to be committed conntrack entry. Note that + * since the new connection is not yet confirmed, and thus no-one else has + * access to it's labels, we simply write them over. + */ +static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl, *master_cl; + bool have_mask = labels_nonzero(mask); + + /* Inherit master's labels to the related connection? */ + master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL; + + if (!master_cl && !have_mask) + return 0; /* Nothing to do. */ + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) return -ENOSPC; - err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, - OVS_CT_LABELS_LEN / sizeof(u32)); + /* Inherit the master's labels, if any. */ + if (master_cl) + *cl = *master_cl; + + if (have_mask) { + u32 *dst = (u32 *)cl->bits; + int i; + + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + dst[i] = (dst[i] & ~mask->ct_labels_32[i]) | + (labels->ct_labels_32[i] + & mask->ct_labels_32[i]); + } + + /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the + * IPCT_LABEL bit it set in the event cache. + */ + nf_conntrack_event_cache(IPCT_LABEL, ct); + + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + + return 0; +} + +static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl; + int err; + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) + return -ENOSPC; + + err = nf_connlabels_replace(ct, labels->ct_labels_32, + mask->ct_labels_32, + OVS_CT_LABELS_LEN_32); if (err) return err; - ovs_ct_get_labels(ct, &key->ct.labels); + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + return 0; } @@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) { @@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) /* Find an existing connection which this packet belongs to without * re-attributing statistics or modifying the connection state. This allows an - * skb->nfct lost due to an upcall to be recovered during actions execution. 
+ * skb->_nfct lost due to an upcall to be recovered during actions execution. * * Must be called with rcu_read_lock. * - * On success, populates skb->nfct and skb->nfctinfo, and returns the - * connection. Returns NULL if there is no existing entry. + * On success, populates skb->_nfct and returns the connection. Returns NULL + * if there is no existing entry. */ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, - u8 l3num, struct sk_buff *skb) + u8 l3num, struct sk_buff *skb, bool natted) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, return NULL; } + /* Must invert the tuple if skb has been transformed by NAT. */ + if (natted) { + struct nf_conntrack_tuple inverse; + + if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { + pr_debug("ovs_ct_find_existing: Inversion failed!\n"); + return NULL; + } + tuple = inverse; + } + /* look for tuple match */ h = nf_conntrack_find_get(net, zone, &tuple); if (!h) @@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, ct = nf_ct_tuplehash_to_ctrack(h); - skb->nfct = &ct->ct_general; - skb->nfctinfo = ovs_ct_get_info(h); + /* Inverted packet tuple matches the reverse direction conntrack tuple, + * select the other tuplehash to get the right 'ctinfo' bits for this + * packet. + */ + if (natted) + h = &ct->tuplehash[!h->tuple.dst.dir]; + + nf_ct_set(skb, ct, ovs_ct_get_info(h)); return ct; } -/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool skb_nfct_cached(struct net *net, const struct sw_flow_key *key, const struct ovs_conntrack_info *info, @@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net, ct = nf_ct_get(skb, &ctinfo); /* If no ct, check if we have evidence that an existing conntrack entry - * might be found for this skb. This happens when we lose a skb->nfct + * might be found for this skb. This happens when we lose a skb->_nfct * due to an upcall. If the connection was not confirmed, it is not * cached and needs to be run through conntrack again. */ - if (!ct && key->ct.state & OVS_CS_F_TRACKED && - !(key->ct.state & OVS_CS_F_INVALID) && - key->ct.zone == info->zone.id) - ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); + if (!ct && key->ct_state & OVS_CS_F_TRACKED && + !(key->ct_state & OVS_CS_F_INVALID) && + key->ct_zone == info->zone.id) { + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, + !!(key->ct_state + & OVS_CS_F_NAT_MASK)); + if (ct) + nf_ct_get(skb, &ctinfo); + } if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net, if (help && rcu_access_pointer(help->helper) != info->helper) return false; } + /* Force conntrack entry direction to the current packet? */ + if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + /* Delete the conntrack entry if confirmed, else just release + * the reference. 
+ */ + if (nf_ct_is_confirmed(ct)) + nf_ct_delete(ct, 0, 0); + + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, 0); + return false; + } return true; } @@ -514,7 +665,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, int hooknum, nh_off, err = NF_ACCEPT; nh_off = skb_network_offset(skb); - skb_pull(skb, nh_off); + skb_pull_rcsum(skb, nh_off); /* See HOOK2MANIP(). */ if (maniptype == NF_NAT_MANIP_SRC) @@ -579,6 +730,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: skb_push(skb, nh_off); + skb_postpush_rcsum(skb, skb->data, nh_off); return err; } @@ -590,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key, if (maniptype == NF_NAT_MANIP_SRC) { __be16 src; - key->ct.state |= OVS_CS_F_SRC_NAT; + key->ct_state |= OVS_CS_F_SRC_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.src = ip_hdr(skb)->saddr; else if (key->eth.type == htons(ETH_P_IPV6)) @@ -612,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key, } else { __be16 dst; - key->ct.state |= OVS_CS_F_DST_NAT; + key->ct_state |= OVS_CS_F_DST_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.dst = ip_hdr(skb)->daddr; else if (key->eth.type == htons(ETH_P_IPV6)) @@ -699,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if * not done already. Update key with new CT state after passing the packet * through conntrack. - * Note that if the packet is deemed invalid by conntrack, skb->nfct will be + * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be * set to NULL and 0 will be returned. */ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, @@ -721,19 +873,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb->nfct) - nf_conntrack_put(skb->nfct); + if (skb_nfct(skb)) + nf_conntrack_put(skb_nfct(skb)); nf_conntrack_get(&tmpl->ct_general); - skb->nfct = &tmpl->ct_general; - skb->nfctinfo = IP_CT_NEW; + nf_ct_set(skb, tmpl, IP_CT_NEW); } - /* Repeat if requested, see nf_iterate(). */ - do { - err = nf_conntrack_in(net, info->family, - NF_INET_PRE_ROUTING, skb); - } while (err == NF_REPEAT); - + err = nf_conntrack_in(net, info->family, + NF_INET_PRE_ROUTING, skb); if (err != NF_ACCEPT) return -ENOENT; @@ -741,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * NAT after the nf_conntrack_in() call. We can actually clear * the whole state, as it will be re-initialized below. */ - key->ct.state = 0; + key->ct_state = 0; /* Update the key, but keep the NAT flags. */ ovs_ct_update_key(skb, info, key, true, true); @@ -757,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * * NAT will be done only if the CT action has NAT, and only * once per packet (per zone), as guarded by the NAT bits in - * the key->ct.state. + * the key->ct_state. 
*/ - if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && + if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) && (nf_ct_is_confirmed(ct) || info->commit) && ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { return -EINVAL; @@ -823,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, if (err) return err; - ct = (struct nf_conn *)skb->nfct; + ct = (struct nf_conn *)skb_nfct(skb); if (ct) nf_ct_deliver_cached_events(ct); } @@ -835,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels) { size_t i; - for (i = 0; i < sizeof(*labels); i++) - if (labels->ct_labels[i]) + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + if (labels->ct_labels_32[i]) return true; return false; @@ -847,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; int err; err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; + /* The connection could be invalid, in which case this is a no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + /* Apply changes before confirming the connection so that the initial * conntrack NEW netlink event carries the values given in the CT * action. */ if (info->mark.mask) { - err = ovs_ct_set_mark(skb, key, info->mark.value, + err = ovs_ct_set_mark(ct, key, info->mark.value, info->mark.mask); if (err) return err; } - if (labels_nonzero(&info->labels.mask)) { - err = ovs_ct_set_labels(skb, key, &info->labels.value, + if (!nf_ct_is_confirmed(ct)) { + err = ovs_ct_init_labels(ct, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } else if (labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) return err; @@ -890,7 +1049,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, /* The conntrack module expects to be working at L3. */ nh_ofs = skb_network_offset(skb); - skb_pull(skb, nh_ofs); + skb_pull_rcsum(skb, nh_ofs); if (key->ip.frag != OVS_FRAG_TYPE_NONE) { err = handle_fragments(net, key, info->zone.id, skb); @@ -904,6 +1063,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_lookup(net, key, info, skb); skb_push(skb, nh_ofs); + skb_postpush_rcsum(skb, skb->data, nh_ofs); if (err) kfree_skb(skb); return err; @@ -1065,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr, static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), @@ -1104,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } switch (type) { + case OVS_CT_ATTR_FORCE_COMMIT: + info->force = true; + /* fall through. */ case OVS_CT_ATTR_COMMIT: info->commit = true; break; @@ -1330,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (!start) return -EMSGSIZE; - if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + if (ct_info->commit && nla_put_flag(skb, ct_info->force + ? 
OVS_CT_ATTR_FORCE_COMMIT + : OVS_CT_ATTR_COMMIT)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 8f6230bd6183..bc7efd1867ab 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ @@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - key->ct.state = 0; - key->ct.zone = 0; + key->ct_state = 0; + key->ct_zone = 0; key->ct.mark = 0; memset(&key->ct.labels, 0, sizeof(key->ct.labels)); + /* Clear 'ct_orig_proto' to mark the non-existence of original + * direction key fields. + */ + key->ct_orig_proto = 0; } -static inline int ovs_ct_put_key(const struct sw_flow_key *key, +static inline int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { return 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4d67ea856067..9c62b6325f7a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -58,8 +58,7 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" -int ovs_net_id __read_mostly; -EXPORT_SYMBOL_GPL(ovs_net_id); +unsigned int ovs_net_id __read_mostly; static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; @@ -131,7 +130,6 @@ int lockdep_ovsl_is_held(void) else return 1; } -EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); #endif static struct vport *new_vport(const struct vport_parms *); @@ -562,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct datapath *dp; - struct ethhdr *eth; struct vport *input_vport; u16 mru = 0; int len; @@ -583,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); - skb_reset_mac_header(packet); - eth = eth_hdr(packet); - - /* Normally, setting the skb 'protocol' field would be handled by a - * call to eth_type_trans(), but it assumes there's a sending - * device, which we may not have. 
*/ - if (eth_proto_is_802_3(eth->h_proto)) - packet->protocol = eth->h_proto; - else - packet->protocol = htons(ETH_P_802_2); - /* Set packet's mru */ if (a[OVS_PACKET_ATTR_MRU]) { mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); @@ -672,8 +658,7 @@ static const struct genl_ops dp_packet_genl_ops[] = { } }; -static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, +static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -682,6 +667,7 @@ static struct genl_family dp_packet_genl_family = { .parallel_ops = true, .ops = dp_packet_genl_ops, .n_ops = ARRAY_SIZE(dp_packet_genl_ops), + .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, @@ -1437,8 +1423,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { }, }; -static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, +static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1449,6 +1434,7 @@ static struct genl_family dp_flow_genl_family = { .n_ops = ARRAY_SIZE(dp_flow_genl_ops), .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1823,8 +1809,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }, }; -static struct genl_family dp_datapath_genl_family = { - .id = GENL_ID_GENERATE, +static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -1835,6 +1820,7 @@ static struct genl_family dp_datapath_genl_family = { .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ @@ -2245,8 +2231,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { }, }; -struct genl_family dp_vport_genl_family = { - .id = GENL_ID_GENERATE, +struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, @@ -2257,6 +2242,7 @@ struct genl_family dp_vport_genl_family = { .n_ops = ARRAY_SIZE(dp_vport_genl_ops), .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { @@ -2274,7 +2260,7 @@ static void dp_unregister_genl(int n_families) genl_unregister_family(dp_genl_families[i]); } -static int dp_register_genl(void) +static int __init dp_register_genl(void) { int err; int i; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ab85c1cae255..1c6e9377436d 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -144,7 +144,7 @@ struct ovs_net { bool xt_label; }; -extern int ovs_net_id; +extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 22087062bd10..3f76cb765e5b 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb) * Returns 0 if it encounters a non-vlan or incomplete packet. * Returns 1 after successfully parsing vlan tag. 
*/ -static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, + bool untag_vlan) { struct vlan_head *vh = (struct vlan_head *)skb->data; @@ -330,31 +331,47 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); key_vh->tpid = vh->tpid; - __skb_pull(skb, sizeof(struct vlan_head)); + if (unlikely(untag_vlan)) { + int offset = skb->data - skb_mac_header(skb); + u16 tci; + int err; + + __skb_push(skb, offset); + err = __skb_vlan_pop(skb, &tci); + __skb_pull(skb, offset); + if (err) + return err; + __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci); + } else { + __skb_pull(skb, sizeof(struct vlan_head)); + } return 1; } -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +static void clear_vlan(struct sw_flow_key *key) { - int res; - key->eth.vlan.tci = 0; key->eth.vlan.tpid = 0; key->eth.cvlan.tci = 0; key->eth.cvlan.tpid = 0; +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int res; if (skb_vlan_tag_present(skb)) { key->eth.vlan.tci = htons(skb->vlan_tci); key->eth.vlan.tpid = skb->vlan_proto; } else { /* Parse outer vlan tag in the non-accelerated case. */ - res = parse_vlan_tag(skb, &key->eth.vlan); + res = parse_vlan_tag(skb, &key->eth.vlan, true); if (res <= 0) return res; } /* Parse inner vlan tag. */ - res = parse_vlan_tag(skb, &key->eth.cvlan); + res = parse_vlan_tag(skb, &key->eth.cvlan, false); if (res <= 0) return res; @@ -483,17 +500,20 @@ invalid: * * Returns 0 if successful, otherwise a negative errno value. * - * Initializes @skb header pointers as follows: + * Initializes @skb header fields as follows: * - * - skb->mac_header: the Ethernet header. + * - skb->mac_header: the L2 header. * - * - skb->network_header: just past the Ethernet header, or just past the - * VLAN header, to the first byte of the Ethernet payload. + * - skb->network_header: just past the L2 header, or just past the + * VLAN header, to the first byte of the L2 payload. * * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 * on output, then just past the IP header, if one is present and * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. + * + * - skb->protocol: the type of the data starting at skb->network_header. + * Equals to key->eth.type. */ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { @@ -505,28 +525,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) skb_reset_mac_header(skb); - /* Link layer. We are guaranteed to have at least the 14 byte Ethernet - * header in the linear data area. - */ - eth = eth_hdr(skb); - ether_addr_copy(key->eth.src, eth->h_source); - ether_addr_copy(key->eth.dst, eth->h_dest); + /* Link layer. */ + clear_vlan(key); + if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { + if (unlikely(eth_type_vlan(skb->protocol))) + return -EINVAL; - __skb_pull(skb, 2 * ETH_ALEN); - /* We are going to push all headers that we pull, so no need to - * update skb->csum here. - */ + skb_reset_network_header(skb); + } else { + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + __skb_pull(skb, 2 * ETH_ALEN); + /* We are going to push all headers that we pull, so no need to + * update skb->csum here. 
+ */ - key->eth.type = parse_ethertype(skb); - if (unlikely(key->eth.type == htons(0))) - return -ENOMEM; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + skb->protocol = parse_ethertype(skb); + if (unlikely(skb->protocol == htons(0))) + return -ENOMEM; - skb_reset_network_header(skb); + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + } skb_reset_mac_len(skb); - __skb_push(skb, skb->data - skb_mac_header(skb)); + key->eth.type = skb->protocol; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { @@ -718,12 +745,34 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) { - return key_extract(skb, key); + int res; + + res = key_extract(skb, key); + if (!res) + key->mac_proto &= ~SW_FLOW_KEY_INVALID; + + return res; +} + +static int key_extract_mac_proto(struct sk_buff *skb) +{ + switch (skb->dev->type) { + case ARPHRD_ETHER: + return MAC_PROTO_ETHERNET; + case ARPHRD_NONE: + if (skb->protocol == htons(ETH_P_TEB)) + return MAC_PROTO_ETHERNET; + return MAC_PROTO_NONE; + } + WARN_ON_ONCE(1); + return -EINVAL; } int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { + int res, err; + /* Extract metadata from packet. */ if (tun_info) { key->tun_proto = ip_tunnel_info_af(tun_info); @@ -749,23 +798,61 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; - ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; + res = key_extract_mac_proto(skb); + if (res < 0) + return res; + key->mac_proto = res; key->recirc_id = 0; - return key_extract(skb, key); + err = key_extract(skb, key); + if (!err) + ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ + return err; } int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; int err; + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; + /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(net, attr, key, log); + err = ovs_nla_get_flow_metadata(net, a, attrs, key, log); if (err) return err; - return key_extract(skb, key); + /* key_extract assumes that skb->protocol is set-up for + * layer 3 packets which is the case for other callers, + * in particular packets received from the network stack. + * Here the correct value can be set from the metadata + * extracted above. + * For L2 packet key eth type would be zero. skb protocol + * would be set to correct value later during key-extact. + */ + + skb->protocol = key->eth.type; + err = key_extract(skb, key); + if (err) + return err; + + /* Check that we have conntrack original direction tuple metadata only + * for packets for which it makes sense. Otherwise the key may be + * corrupted due to overlapping key fields. 
+ */ + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) && + key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) && + (key->eth.type != htons(ETH_P_IPV6) || + sw_flow_key_is_nd(key))) + return -EINVAL; + + return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index ae783f5c6695..a9bc1c875965 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -37,6 +37,12 @@ struct sk_buff; +enum sw_flow_mac_proto { + MAC_PROTO_NONE = 0, + MAC_PROTO_ETHERNET, +}; +#define SW_FLOW_KEY_INVALID 0x80 + /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -68,6 +74,7 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */ u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. */ @@ -78,6 +85,11 @@ struct sw_flow_key { struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; + /* Filling a hole of two bytes. */ + u8 ct_state; + u8 ct_orig_proto; /* CT original direction tuple IP + * protocol. + */ union { struct { __be32 top_lse; /* top label stack entry */ @@ -89,6 +101,7 @@ struct sw_flow_key { u8 frag; /* One of OVS_FRAG_TYPE_*. */ } ip; }; + u16 ct_zone; /* Conntrack zone. */ struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ @@ -100,10 +113,16 @@ struct sw_flow_key { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; + union { + struct { + __be32 src; + __be32 dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + }; } ipv4; struct { struct { @@ -111,23 +130,40 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ - struct { - struct in6_addr target; /* ND target address. */ - u8 sll[ETH_ALEN]; /* ND source link layer address. */ - u8 tll[ETH_ALEN]; /* ND target link layer address. */ - } nd; + union { + struct { + struct in6_addr src; + struct in6_addr dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + }; } ipv6; }; struct { - /* Connection tracking fields. */ - u16 zone; + /* Connection tracking fields not packed above. */ + struct { + __be16 src; /* CT orig tuple tp src port. */ + __be16 dst; /* CT orig tuple tp dst port. */ + } orig_tp; u32 mark; - u8 state; struct ovs_key_ct_labels labels; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. 
*/ +static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) +{ + return key->eth.type == htons(ETH_P_IPV6) && + key->ip.proto == NEXTHDR_ICMP && + key->tp.dst == 0 && + (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); +} + struct sw_flow_key_range { unsigned short int start; unsigned short int end; @@ -206,6 +242,21 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; +static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key) +{ + return key->mac_proto & ~SW_FLOW_KEY_INVALID; +} + +static inline u16 __ovs_mac_header_len(u8 mac_proto) +{ + return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0; +} + +static inline u16 ovs_mac_header_len(const struct sw_flow_key *key) +{ + return __ovs_mac_header_len(ovs_key_mac_proto(key)); +} + static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) { return sfid->ufid_len; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ae25ded82b3b..1105a838bab8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -123,13 +123,15 @@ static void update_range(struct sw_flow_match *match, static bool match_validate(const struct sw_flow_match *match, u64 key_attrs, u64 mask_attrs, bool log) { - u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 key_expected = 0; u64 mask_allowed = key_attrs; /* At most allow all key attributes */ /* The following mask attributes allowed only if they * pass the validation tests. */ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) | (1 << OVS_KEY_ATTR_TCP) | (1 << OVS_KEY_ATTR_TCP_FLAGS) | (1 << OVS_KEY_ATTR_UDP) @@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IPV6)) { key_expected |= 1 << OVS_KEY_ATTR_IPV6; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match, htons(NDISC_NEIGHBOUR_SOLICITATION) || match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { key_expected |= 1 << OVS_KEY_ATTR_ND; + /* Original direction conntrack tuple + * uses the same space as the ND fields + * in the key, so both are not allowed + * at the same time. + */ + mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); if (match->mask && (match->mask->key.tp.src == htons(0xff))) mask_allowed |= 1 << OVS_KEY_ATTR_ND; } @@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. 
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr, return __parse_flow_nlattrs(attr, a, attrsp, log, true); } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp, - bool log) +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log) { return __parse_flow_nlattrs(attr, a, attrsp, log, false); } @@ -588,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, nla_get_in6_addr(a), is_mask); ipv6 = true; break; @@ -649,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_VXLAN_OPT; opts_type = type; break; + case OVS_TUNNEL_KEY_ATTR_PAD: + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -969,10 +987,33 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match, return 0; } +static int parse_eth_type_from_nlattrs(struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. 
*/ + eth_type = htons(0xffff); + } else if (!eth_proto_is_802_3(eth_type)) { + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + return 0; +} + static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask, bool log) { + u8 mac_proto = MAC_PROTO_ETHERNET; + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -1033,14 +1074,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return -EINVAL; } - SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); - SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && @@ -1059,6 +1100,49 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, sizeof(*cl), is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) { + const struct ovs_key_ct_tuple_ipv4 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]); + + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4); + } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) { + const struct ovs_key_ct_tuple_ipv6 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]); + + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src, + sizeof(match->key->ipv6.ct_orig.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst, + sizeof(match->key->ipv6.ct_orig.dst), + is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); + } + + /* For layer 3 packets the Ethernet type is provided + * and treated as metadata but no MAC addresses are provided. + */ + if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) && + (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE))) + mac_proto = MAC_PROTO_NONE; + + /* Always exact match mac_proto */ + SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask); + + if (mac_proto == MAC_PROTO_NONE) + return parse_eth_type_from_nlattrs(match, attrs, a, is_mask, + log); + return 0; } @@ -1081,33 +1165,26 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, SW_FLOW_KEY_MEMCPY(match, eth.dst, eth_key->eth_dst, ETH_ALEN, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); - } - if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - /* VLAN attribute is always parsed before getting here since it - * may occur multiple times. 
- */ - OVS_NLERR(log, "VLAN attribute unexpected."); - return -EINVAL; - } - - if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - __be16 eth_type; - - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (is_mask) { - /* Always exact match EtherType. */ - eth_type = htons(0xffff); - } else if (!eth_proto_is_802_3(eth_type)) { - OVS_NLERR(log, "EtherType %x is less than min %x", - ntohs(eth_type), ETH_P_802_3_MIN); + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. + */ + OVS_NLERR(log, "VLAN attribute unexpected."); return -EINVAL; } - SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask, + log); + if (err) + return err; + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + } else if (!match->key->eth.type) { + OVS_NLERR(log, "Either Ethernet header or EtherType is required."); + return -EINVAL; } if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { @@ -1462,9 +1539,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @key: Receives extracted in_port, priority, tun_key and skb_mark. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. + * @net: Network namespace. + * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack + * metadata. + * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink + * attributes. + * @attrs: Bit mask for the netlink attributes included in @a. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. @@ -1473,25 +1553,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * take the same form accepted by flow_from_nlattrs(), but only enough of it to * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. + * + * This must be called before the packet key fields are filled in 'key'. 
*/ -int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, - struct sw_flow_key *key, - bool log) +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; struct sw_flow_match match; - u64 attrs = 0; - int err; - - err = parse_flow_nlattrs(attr, a, &attrs, log); - if (err) - return -EINVAL; memset(&match, 0, sizeof(match)); match.key = key; + key->ct_state = 0; + key->ct_zone = 0; + key->ct_orig_proto = 0; memset(&key->ct, 0, sizeof(key->ct)); + memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig)); + memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig)); + key->phy.in_port = DP_MAX_PORTS; return metadata_from_nlattrs(net, &match, &attrs, a, false, log); @@ -1553,45 +1634,47 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; - if (ovs_ct_put_key(output, skb)) + if (ovs_ct_put_key(swkey, output, skb)) goto nla_put_failure; - nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); - if (!nla) - goto nla_put_failure; - - eth_key = nla_data(nla); - ether_addr_copy(eth_key->eth_src, output->eth.src); - ether_addr_copy(eth_key->eth_dst, output->eth.dst); - - if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { - if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) + if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) goto nla_put_failure; - encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.vlan.tci) - goto unencap; - if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { - if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + eth_key = nla_data(nla); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); + + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; - in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.cvlan.tci) + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.vlan.tci) goto unencap; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } } - } - if (swkey->eth.type == htons(ETH_P_802_2)) { - /* - * Ethertype 802.2 is represented in the netlink with omitted - * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and - * 0xffff in the mask attribute. Ethertype can also - * be wildcarded. - */ - if (is_mask && output->eth.type) - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, - output->eth.type)) - goto nla_put_failure; - goto unencap; + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. 
+ */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } } if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) @@ -2126,8 +2209,8 @@ static bool validate_masked(u8 *data, int len) static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, - struct sw_flow_actions **sfa, - bool *skip_copy, __be16 eth_type, bool masked, bool log) + struct sw_flow_actions **sfa, bool *skip_copy, + u8 mac_proto, __be16 eth_type, bool masked, bool log) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -2157,7 +2240,11 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_CT_LABELS: + break; + case OVS_KEY_ATTR_ETHERNET: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; break; case OVS_KEY_ATTR_TUNNEL: @@ -2324,6 +2411,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) { + u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; int rem, err; @@ -2346,6 +2434,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), + [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), + [OVS_ACTION_ATTR_POP_ETH] = 0, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2394,10 +2484,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; vlan = nla_data(a); if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; @@ -2447,14 +2541,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, - &skip_copy, eth_type, false, log); + &skip_copy, mac_proto, eth_type, + false, log); if (err) return err; break; case OVS_ACTION_ATTR_SET_MASKED: err = validate_set(a, key, sfa, - &skip_copy, eth_type, true, log); + &skip_copy, mac_proto, eth_type, + true, log); if (err) return err; break; @@ -2474,6 +2570,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_PUSH_ETH: + /* Disallow pushing an Ethernet header if one + * is already present */ + if (mac_proto != MAC_PROTO_NONE) + return -EINVAL; + mac_proto = MAC_PROTO_NONE; + break; + + case OVS_ACTION_ATTR_POP_ETH: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + if (vlan_tci & htons(VLAN_TAG_PRESENT)) + return -EINVAL; + mac_proto = MAC_PROTO_ETHERNET; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 45f9769e5aac..929c665ac3aa 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, - struct sw_flow_key *, bool log); +int parse_flow_nlattrs(const 
struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log); +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index e7da29021b38..89193a634da4 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -89,15 +89,6 @@ static const struct ethtool_ops internal_dev_ethtool_ops = { .get_link = ethtool_op_get_link, }; -static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) -{ - if (new_mtu < 68) - return -EINVAL; - - netdev->mtu = new_mtu; - return 0; -} - static void internal_dev_destructor(struct net_device *dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); @@ -106,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } -static struct rtnl_link_stats64 * +static void internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; @@ -134,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_bytes += local_stats.tx_bytes; stats->tx_packets += local_stats.tx_packets; } - - return stats; } static void internal_set_rx_headroom(struct net_device *dev, int new_hr) @@ -148,7 +137,6 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, .ndo_set_rx_headroom = internal_set_rx_headroom, }; @@ -161,6 +149,8 @@ static void do_setup(struct net_device *netdev) { ether_setup(netdev); + netdev->max_mtu = ETH_MAX_MTU; + netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 4e3972344aa6..0389398fa4ab 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -57,8 +57,10 @@ static void netdev_port_receive(struct sk_buff *skb) if (unlikely(!skb)) return; - skb_push(skb, ETH_HLEN); - skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + if (skb->dev->type == ARPHRD_ETHER) { + skb_push(skb, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + } ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: @@ -97,7 +99,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) } if (vport->dev->flags & IFF_LOOPBACK || - vport->dev->type != ARPHRD_ETHER || + (vport->dev->type != ARPHRD_ETHER && + vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; @@ -162,7 +165,6 @@ void ovs_netdev_detach_dev(struct vport *vport) netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } -EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); static void netdev_destroy(struct vport *vport) { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 7387418ac514..b6c8524032a0 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -463,27 +463,11 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, ovs_dp_process_packet(skb, &key); return 0; } -EXPORT_SYMBOL_GPL(ovs_vport_receive); -static void free_vport_rcu(struct rcu_head *rcu) +static unsigned int 
packet_length(const struct sk_buff *skb, + struct net_device *dev) { - struct vport *vport = container_of(rcu, struct vport, rcu); - - ovs_vport_free(vport); -} - -void ovs_vport_deferred_free(struct vport *vport) -{ - if (!vport) - return; - - call_rcu(&vport->rcu, free_vport_rcu); -} -EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); - -static unsigned int packet_length(const struct sk_buff *skb) -{ - unsigned int length = skb->len - ETH_HLEN; + unsigned int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) @@ -497,14 +481,34 @@ static unsigned int packet_length(const struct sk_buff *skb) return length; } -void ovs_vport_send(struct vport *vport, struct sk_buff *skb) +void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { int mtu = vport->dev->mtu; - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + switch (vport->dev->type) { + case ARPHRD_NONE: + if (mac_proto == MAC_PROTO_ETHERNET) { + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + skb->protocol = htons(ETH_P_TEB); + } else if (mac_proto != MAC_PROTO_NONE) { + WARN_ON_ONCE(1); + goto drop; + } + break; + case ARPHRD_ETHER: + if (mac_proto != MAC_PROTO_ETHERNET) + goto drop; + break; + default: + goto drop; + } + + if (unlikely(packet_length(skb, vport->dev) > mtu && + !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", vport->dev->name, - packet_length(skb), mtu); + packet_length(skb, vport->dev), mtu); vport->dev->stats.tx_errors++; goto drop; } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index f01f28a567ad..cda66c26ad08 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -149,7 +149,6 @@ struct vport_ops { struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); -void ovs_vport_deferred_free(struct vport *vport); #define VPORT_ALIGN 8 @@ -198,6 +197,6 @@ int __ovs_vport_ops_register(struct vport_ops *ops); }) void ovs_vport_ops_unregister(struct vport_ops *ops); -void ovs_vport_send(struct vport *vport, struct sk_buff *skb); +void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto); #endif /* vport.h */ |
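
A minimal userspace sketch (editorial illustration, not part of the patch) of the layer-2/layer-3 flow-key decision and the EtherType floor check implemented by parse_eth_type_from_nlattrs() and metadata_from_nlattrs() above: a key that carries an EtherType but no Ethernet-header attribute is treated as MAC_PROTO_NONE, mask EtherTypes are forced to exact-match 0xffff, and non-mask EtherTypes below ETH_P_802_3_MIN are rejected. The attribute bit positions and the enum below are local stand-ins, not the kernel's uapi definitions.

/* Sketch of the L2/L3 flow-key decision and EtherType validation.
 * Attribute bit positions are stand-ins for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ATTR_ETHERNET   (1ULL << 4)   /* stand-in for OVS_KEY_ATTR_ETHERNET  */
#define ATTR_ETHERTYPE  (1ULL << 6)   /* stand-in for OVS_KEY_ATTR_ETHERTYPE */
#define ETH_P_802_3_MIN 0x0600

enum mac_proto { MAC_PROTO_NONE, MAC_PROTO_ETHERNET };

/* A flow with an EtherType but no Ethernet header attribute is layer 3. */
static enum mac_proto mac_proto_from_attrs(uint64_t attrs)
{
	if (!(attrs & ATTR_ETHERNET) && (attrs & ATTR_ETHERTYPE))
		return MAC_PROTO_NONE;   /* EtherType is metadata only */
	return MAC_PROTO_ETHERNET;
}

/* Normalize a mask to exact-match, or validate a real EtherType. */
static bool eth_type_ok(uint16_t *eth_type_host, bool is_mask)
{
	if (is_mask) {
		*eth_type_host = 0xffff; /* always exact match EtherType */
		return true;
	}
	return *eth_type_host >= ETH_P_802_3_MIN;
}

int main(void)
{
	uint64_t l3_flow = ATTR_ETHERTYPE;
	uint16_t type = 0x0800;          /* IPv4 */

	printf("L3 flow -> MAC_PROTO_NONE: %s\n",
	       mac_proto_from_attrs(l3_flow) == MAC_PROTO_NONE ? "yes" : "no");
	printf("EtherType 0x%04x accepted: %s\n", (unsigned)type,
	       eth_type_ok(&type, false) ? "yes" : "no");
	return 0;
}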
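
A hedged sketch of the MTU policy implied by removing internal_dev_change_mtu() and setting netdev->max_mtu above: the networking core's min_mtu/max_mtu range check (summarized here, not quoted) performs the validation, so raising max_mtu to ETH_MAX_MTU is what allows large MTUs on internal ports again. struct fake_netdev and set_mtu() are illustrative stand-ins.

/* Sketch of a core-style MTU range check.  "before" models the Ethernet
 * defaults without the patch's max_mtu assignment; "after" models the
 * patched internal device.
 */
#include <stdio.h>

#define EINVAL        22
#define ETH_MIN_MTU   68       /* default min_mtu for an Ethernet netdev  */
#define ETH_DATA_LEN  1500     /* default max_mtu for an Ethernet netdev  */
#define ETH_MAX_MTU   0xFFFFU  /* max_mtu after the patch                 */

struct fake_netdev {
	unsigned int min_mtu;
	unsigned int max_mtu;
	unsigned int mtu;
};

static int set_mtu(struct fake_netdev *dev, unsigned int new_mtu)
{
	if (new_mtu < dev->min_mtu || new_mtu > dev->max_mtu)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

int main(void)
{
	struct fake_netdev before = { ETH_MIN_MTU, ETH_DATA_LEN, 1500 };
	struct fake_netdev after  = { ETH_MIN_MTU, ETH_MAX_MTU,  1500 };

	printf("set mtu 9000 (default max):  %d\n", set_mtu(&before, 9000));
	printf("set mtu 9000 (ETH_MAX_MTU):  %d\n", set_mtu(&after, 9000));
	return 0;
}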
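
A sketch of the transmit-side dispatch that ovs_vport_send() gains once vports may be ARPHRD_NONE (layer 3) devices: an Ethernet payload sent out a layer 3 device is forwarded whole with protocol ETH_P_TEB, a layer 3 payload goes out as is, and an Ethernet device only accepts Ethernet payloads. The real code also rewrites skb offsets; this model only returns the decision, and the verdict enum is a local stand-in.

/* Decision table for sending a packet out a vport, by device type and
 * the packet's MAC protocol.
 */
#include <stdio.h>

#define ARPHRD_ETHER 1
#define ARPHRD_NONE  0xFFFE
#define ETH_P_TEB    0x6558   /* "transparent Ethernet bridging" */

enum mac_proto  { MAC_PROTO_NONE, MAC_PROTO_ETHERNET };
enum tx_verdict { TX_SEND, TX_SEND_AS_TEB, TX_DROP };

static enum tx_verdict vport_tx_verdict(unsigned short dev_type,
					enum mac_proto mac_proto)
{
	switch (dev_type) {
	case ARPHRD_NONE:
		if (mac_proto == MAC_PROTO_ETHERNET)
			return TX_SEND_AS_TEB; /* frame carried as ETH_P_TEB */
		if (mac_proto == MAC_PROTO_NONE)
			return TX_SEND;
		return TX_DROP;                /* unexpected mac_proto */
	case ARPHRD_ETHER:
		return mac_proto == MAC_PROTO_ETHERNET ? TX_SEND : TX_DROP;
	default:
		return TX_DROP;
	}
}

int main(void)
{
	printf("ARPHRD_NONE  + ETHERNET -> %d (expect TEB)\n",
	       vport_tx_verdict(ARPHRD_NONE, MAC_PROTO_ETHERNET));
	printf("ARPHRD_ETHER + NONE     -> %d (expect drop)\n",
	       vport_tx_verdict(ARPHRD_ETHER, MAC_PROTO_NONE));
	return 0;
}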
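
And a sketch of the over-MTU test after packet_length() switches from a fixed ETH_HLEN to the egress device's hard_header_len (which may be 0 for an ARPHRD_NONE device), keeping the single-VLAN allowance for a tag that still sits in the packet payload. The plain parameters stand in for the sk_buff and net_device fields.

/* Over-MTU check: L2 header length comes from the device, and one VLAN
 * header is excused when the tag is carried in the payload rather than
 * in skb metadata (additional 802.1ad tags are not subtracted).
 */
#include <stdbool.h>
#include <stdio.h>

#define VLAN_HLEN 4

static unsigned int payload_length(unsigned int skb_len,
				   unsigned int hard_header_len,
				   bool vlan_tag_in_payload)
{
	unsigned int length = skb_len - hard_header_len;

	if (vlan_tag_in_payload)
		length -= VLAN_HLEN;

	return length;
}

static bool over_mtu(unsigned int skb_len, unsigned int hard_header_len,
		     bool vlan_tag_in_payload, unsigned int mtu, bool is_gso)
{
	return payload_length(skb_len, hard_header_len,
			      vlan_tag_in_payload) > mtu && !is_gso;
}

int main(void)
{
	/* 1518-byte tagged Ethernet frame against a 1500-byte MTU: sent. */
	printf("tagged 1518, mtu 1500:   %s\n",
	       over_mtu(1518, 14, true, 1500, false) ? "drop" : "send");
	/* 1600-byte untagged frame against the same MTU: dropped. */
	printf("untagged 1600, mtu 1500: %s\n",
	       over_mtu(1600, 14, false, 1500, false) ? "drop" : "send");
	return 0;
}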