diff options
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- | net/openvswitch/actions.c | 139 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 97 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 67 | ||||
-rw-r--r-- | net/openvswitch/datapath.h | 5 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 118 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 12 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.c | 325 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.h | 3 | ||||
-rw-r--r-- | net/openvswitch/flow_table.c | 25 | ||||
-rw-r--r-- | net/openvswitch/vport-geneve.c | 9 | ||||
-rw-r--r-- | net/openvswitch/vport-gre.c | 11 | ||||
-rw-r--r-- | net/openvswitch/vport-internal_dev.c | 4 | ||||
-rw-r--r-- | net/openvswitch/vport-vxlan.c | 9 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 8 |
14 files changed, 569 insertions, 263 deletions
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 879185fe183f..4e03f64709bc 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -71,6 +71,8 @@ struct ovs_frag_data { static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); #define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) struct action_fifo { int head; int tail; @@ -78,7 +80,12 @@ struct action_fifo { struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; }; +struct recirc_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + static struct action_fifo __percpu *action_fifos; +static struct recirc_keys __percpu *recirc_keys; static DEFINE_PER_CPU(int, exec_actions_level); static void action_fifo_init(struct action_fifo *fifo) @@ -137,11 +144,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key) return !!key->eth.type; } +static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~(hdr->h_proto), ethertype }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + hdr->h_proto = ethertype; +} + static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { - __be32 *new_mpls_lse; - struct ethhdr *hdr; + struct mpls_shim_hdr *new_mpls_lse; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) @@ -150,21 +169,23 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (skb_cow_head(skb, MPLS_HLEN) < 0) return -ENOMEM; + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_protocol(skb, skb->protocol); + } + skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), skb->mac_len); skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); - new_mpls_lse = (__be32 *)skb_mpls_header(skb); - *new_mpls_lse = mpls->mpls_lse; + new_mpls_lse = mpls_hdr(skb); + new_mpls_lse->label_stack_entry = mpls->mpls_lse; skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); - hdr = eth_hdr(skb); - hdr->h_proto = mpls->mpls_ethertype; - - if (!skb->inner_protocol) - skb_set_inner_protocol(skb, skb->protocol); + update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); skb->protocol = mpls->mpls_ethertype; invalidate_flow_key(key); @@ -181,19 +202,20 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (unlikely(err)) return err; - skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), skb->mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); - /* skb_mpls_header() is used to locate the ethertype - * field correctly in the presence of VLAN tags. + /* mpls_hdr() is used to locate the ethertype field correctly in the + * presence of VLAN tags. */ - hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); - hdr->h_proto = ethertype; + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); + update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -204,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { - __be32 *stack; + struct mpls_shim_hdr *stack; __be32 lse; int err; @@ -212,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (unlikely(err)) return err; - stack = (__be32 *)skb_mpls_header(skb); - lse = OVS_MASKED(*stack, *mpls_lse, *mask); + stack = mpls_hdr(skb); + lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(*stack), lse }; + __be32 diff[] = { ~(stack->label_stack_entry), lse }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } - *stack = lse; + stack->label_stack_entry = lse; flow_key->mpls.top_lse = lse; return 0; } @@ -231,20 +253,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = 0; + } else { + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + } return err; } static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = vlan->vlan_tci; + } else { + key->eth.vlan.tci = vlan->vlan_tci; + key->eth.vlan.tpid = vlan->vlan_tpid; + } return skb_vlan_push(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } @@ -740,6 +766,14 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, if (likely(vport)) { u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + + if (unlikely(cutlen > 0)) { + if (skb->len - cutlen > ETH_HLEN) + pskb_trim(skb, skb->len - cutlen); + else + pskb_trim(skb, ETH_HLEN); + } if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { ovs_vport_send(vport, skb); @@ -765,7 +799,8 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len) + const struct nlattr *actions, int actions_len, + uint32_t cutlen) { struct dp_upcall_info upcall; const struct nlattr *a; @@ -812,7 +847,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } /* End of switch. */ } - return ovs_dp_upcall(dp, skb, key, &upcall); + return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } static int sample(struct datapath *dp, struct sk_buff *skb, @@ -822,6 +857,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, const struct nlattr *acts_list = NULL; const struct nlattr *a; int rem; + u32 cutlen = 0; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -848,13 +884,24 @@ static int sample(struct datapath *dp, struct sk_buff *skb, return 0; /* The only known usage of sample action is having a single user-space + * action, or having a truncate action followed by a single user-space * action. Treat this usage as a special case. * The output_userspace() should clone the skb to be sent to the * user space. This skb will be consumed by its caller. */ + if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + cutlen = skb->len - trunc->max_len; + + a = nla_next(a, &rem); + } + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a, actions, actions_len); + return output_userspace(dp, skb, key, a, actions, + actions_len, cutlen); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -980,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a, int rem) { struct deferred_action *da; + int level; if (!is_flow_key_valid(key)) { int err; @@ -1003,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, return 0; } + level = this_cpu_read(exec_actions_level); + if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { + struct recirc_keys *rks = this_cpu_ptr(recirc_keys); + struct sw_flow_key *recirc_key = &rks->key[level - 1]; + + *recirc_key = *key; + recirc_key->recirc_id = nla_get_u32(a); + ovs_dp_process_packet(skb, recirc_key); + + return 0; + } + da = add_deferred_actions(skb, key, NULL); if (da) { da->pkt_key.recirc_id = nla_get_u32(a); @@ -1041,6 +1101,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (out_skb) do_output(dp, out_skb, prev_port, key); + OVS_CB(skb)->cutlen = 0; prev_port = -1; } @@ -1049,8 +1110,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, prev_port = nla_get_u32(a); break; + case OVS_ACTION_ATTR_TRUNC: { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + OVS_CB(skb)->cutlen = skb->len - trunc->max_len; + break; + } + case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a, attr, len); + output_userspace(dp, skb, key, a, attr, + len, OVS_CB(skb)->cutlen); + OVS_CB(skb)->cutlen = 0; break; case OVS_ACTION_ATTR_HASH: @@ -1158,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - static const int ovs_recursion_limit = 5; int err, level; level = __this_cpu_inc_return(exec_actions_level); - if (unlikely(level > ovs_recursion_limit)) { + if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); kfree_skb(skb); @@ -1187,10 +1257,17 @@ int action_fifos_init(void) if (!action_fifos) return -ENOMEM; + recirc_keys = alloc_percpu(struct recirc_keys); + if (!recirc_keys) { + free_percpu(action_fifos); + return -ENOMEM; + } + return 0; } void action_fifos_exit(void) { free_percpu(action_fifos); + free_percpu(recirc_keys); } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3d5feede962d..31045ef44a82 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct, struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { - size_t len = cl->words * sizeof(long); + size_t len = sizeof(cl->bits); if (len > OVS_CT_LABELS_LEN) len = OVS_CT_LABELS_LEN; @@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) return -ENOSPC; err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, @@ -433,7 +433,6 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; - enum ip_conntrack_info ctinfo; struct nf_conn *ct; unsigned int dataoff; u8 protonum; @@ -458,13 +457,8 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, ct = nf_ct_tuplehash_to_ctrack(h); - ctinfo = ovs_ct_get_info(h); - if (ctinfo == IP_CT_NEW) { - /* This should not happen. */ - WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct); - } skb->nfct = &ct->ct_general; - skb->nfctinfo = ctinfo; + skb->nfctinfo = ovs_ct_get_info(h); return ct; } @@ -818,12 +812,33 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, */ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else - return __ovs_ct_lookup(net, key, info, skb); + } else { + struct nf_conn *ct; + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ct = (struct nf_conn *)skb->nfct; + if (ct) + nf_ct_deliver_cached_events(ct); + } return 0; } +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) + return true; + + return false; +} + /* Lookup connection and confirm if unconfirmed. */ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, @@ -834,24 +849,32 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; - /* This is a no-op if the connection has already been confirmed. */ + + /* Apply changes before confirming the connection so that the initial + * conntrack NEW netlink event carries the values given in the CT + * action. + */ + if (info->mark.mask) { + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); + if (err) + return err; + } + if (labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(skb, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } + /* This will take care of sending queued events even if the connection + * is already confirmed. + */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; return 0; } -static bool labels_nonzero(const struct ovs_key_ct_labels *labels) -{ - size_t i; - - for (i = 0; i < sizeof(*labels); i++) - if (labels->ct_labels[i]) - return true; - - return false; -} - /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ @@ -876,19 +899,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); - if (err) - goto err; - if (info->mark.mask) { - err = ovs_ct_set_mark(skb, key, info->mark.value, - info->mark.mask); - if (err) - goto err; - } - if (labels_nonzero(&info->labels.mask)) - err = ovs_ct_set_labels(skb, key, &info->labels.value, - &info->labels.mask); -err: skb_push(skb, nh_ofs); if (err) kfree_skb(skb); @@ -1145,6 +1156,20 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } } +#ifdef CONFIG_NF_CONNTRACK_MARK + if (!info->commit && info->mark.mask) { + OVS_NLERR(log, + "Setting conntrack mark requires 'commit' flag."); + return -EINVAL; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (!info->commit && labels_nonzero(&info->labels.mask)) { + OVS_NLERR(log, + "Setting conntrack labels requires 'commit' flag."); + return -EINVAL; + } +#endif if (rem > 0) { OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); return -EINVAL; @@ -1342,7 +1367,7 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) if (ct_info->helper) module_put(ct_info->helper->me); if (ct_info->ct) - nf_ct_put(ct_info->ct); + nf_ct_tmpl_free(ct_info->ct); } void ovs_ct_init(struct net *net) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 856bd8dba676..4d67ea856067 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -137,10 +137,12 @@ EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); /* Must be called with rcu_read_lock. */ static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) @@ -275,7 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; - error = ovs_dp_upcall(dp, skb, key, &upcall); + error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) kfree_skb(skb); else @@ -300,7 +302,8 @@ out: int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct dp_stats_percpu *stats; int err; @@ -311,9 +314,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, } if (!skb_is_gso(skb)) - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else - err = queue_gso_packets(dp, skb, key, upcall_info); + err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); if (err) goto err; @@ -331,7 +334,8 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; @@ -360,7 +364,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; @@ -383,7 +387,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -416,7 +421,8 @@ static void pad_packet(struct datapath *dp, struct sk_buff *skb) static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; @@ -461,7 +467,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen); + len = upcall_msg_size(upcall_info, hlen - cutlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; @@ -509,15 +515,25 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, pad_packet(dp, user_skb); } + /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ + if (cutlen > 0) { + if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, + skb->len)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } - nla->nla_len = nla_attr_size(skb->len); + nla->nla_len = nla_attr_size(skb->len - cutlen); - err = skb_zerocopy(user_skb, skb, skb->len, hlen); + err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; @@ -912,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct sw_flow_key key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -940,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &new_flow->key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, true, &mask); - /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &key, log); + &new_flow->key, log); if (error) goto err_kfree_flow; + /* unmasked key is needed to match when ufid is not used. */ + if (ovs_identifier_is_key(&new_flow->id)) + match.key = new_flow->id.unmasked_key; + + ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -980,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1105,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &key, true, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); } else if (!ufid_present) { @@ -1222,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { @@ -1281,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) @@ -2421,3 +2440,7 @@ module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 427e39a045cf..ab85c1cae255 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -100,11 +100,13 @@ struct datapath { * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not + * @cutlen: The number of bytes from the packet end to be removed. * fragmented. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -194,7 +196,8 @@ extern struct genl_family dp_vport_genl_family; void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct sw_flow_key *, const struct dp_upcall_info *); + const struct sw_flow_key *, const struct dp_upcall_info *, + uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0ea128eeeab2..c8c82e109c68 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> +#include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> @@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); + int cpu = smp_processor_id(); int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - stats = rcu_dereference(flow->stats[node]); + stats = rcu_dereference(flow->stats[cpu]); - /* Check if already have node-specific stats. */ + /* Check if already have CPU-specific stats. */ if (likely(stats)) { spin_lock(&stats->lock); /* Mark if we write on the pre-allocated stats. */ - if (node == 0 && unlikely(flow->stats_last_writer != node)) - flow->stats_last_writer = node; + if (cpu == 0 && unlikely(flow->stats_last_writer != cpu)) + flow->stats_last_writer = cpu; } else { stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ spin_lock(&stats->lock); - /* If the current NUMA-node is the only writer on the + /* If the current CPU is the only writer on the * pre-allocated stats keep using them. */ - if (unlikely(flow->stats_last_writer != node)) { + if (unlikely(flow->stats_last_writer != cpu)) { /* A previous locker may have already allocated the - * stats, so we need to check again. If node-specific + * stats, so we need to check again. If CPU-specific * stats were already allocated, we update the pre- * allocated stats as we have already locked them. */ - if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_access_pointer(flow->stats[node]))) { - /* Try to allocate node-specific stats. */ + if (likely(flow->stats_last_writer != -1) && + likely(!rcu_access_pointer(flow->stats[cpu]))) { + /* Try to allocate CPU-specific stats. */ struct flow_stats *new_stats; new_stats = @@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats->tcp_flags = tcp_flags; spin_lock_init(&new_stats->lock); - rcu_assign_pointer(flow->stats[node], + rcu_assign_pointer(flow->stats[cpu], new_stats); goto unlock; } } - flow->stats_last_writer = node; + flow->stats_last_writer = cpu; } } @@ -136,14 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int node; + int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - for_each_node(node) { - struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { /* Local CPU may write on non-local stats, so we must @@ -163,10 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex. */ void ovs_flow_stats_clear(struct sw_flow *flow) { - int node; + int cpu; - for_each_node(node) { - struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { spin_lock_bh(&stats->lock); @@ -302,24 +306,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +/** + * Parse vlan tag from vlan header. + * Returns ERROR on memory error. + * Returns 0 if it encounters a non-vlan or incomplete packet. + * Returns 1 after successfully parsing vlan tag. + */ +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) { - struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ - __be16 tci; - }; - struct qtag_prefix *qp; + struct vlan_head *vh = (struct vlan_head *)skb->data; + + if (likely(!eth_type_vlan(vh->tpid))) + return 0; - if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) return 0; - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + - sizeof(__be16)))) + if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + + sizeof(__be16)))) return -ENOMEM; - qp = (struct qtag_prefix *) skb->data; - key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + vh = (struct vlan_head *)skb->data; + key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tpid = vh->tpid; + + __skb_pull(skb, sizeof(struct vlan_head)); + return 1; +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int res; + + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + key->eth.cvlan.tci = 0; + key->eth.cvlan.tpid = 0; + + if (likely(skb_vlan_tag_present(skb))) { + key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tpid = skb->vlan_proto; + } else { + /* Parse outer vlan tag in the non-accelerated case. */ + res = parse_vlan_tag(skb, &key->eth.vlan); + if (res <= 0) + return res; + } + + /* Parse inner vlan tag. */ + res = parse_vlan_tag(skb, &key->eth.cvlan); + if (res <= 0) + return res; return 0; } @@ -480,12 +517,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. */ - key->eth.tci = 0; - if (skb_vlan_tag_present(skb)) - key->eth.tci = htons(skb->vlan_tci); - else if (eth->h_proto == htons(ETH_P_8021Q)) - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; key->eth.type = parse_ethertype(skb); if (unlikely(key->eth.type == htons(0))) @@ -600,12 +633,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) } else if (eth_p_mpls(key->eth.type)) { size_t stack_len = MPLS_HLEN; - /* In the presence of an MPLS label stack the end of the L2 - * header and the beginning of the L3 header differ. - * - * Advance network_header to the beginning of the L3 - * header. mac_len corresponds to the end of the L2 header. - */ + skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; @@ -613,12 +641,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(error)) return 0; - memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); if (stack_len == MPLS_HLEN) memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); - skb_set_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + stack_len); if (lse & htonl(MPLS_LS_S_MASK)) break; @@ -734,8 +762,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, { int err; - memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); - /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 03378e75a67c..ae783f5c6695 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -50,6 +50,11 @@ struct ovs_tunnel_info { struct metadata_dst *tun_dst; }; +struct vlan_head { + __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -69,7 +74,8 @@ struct sw_flow_key { struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + struct vlan_head vlan; + struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; union { @@ -172,14 +178,14 @@ struct sw_flow { struct hlist_node node[2]; u32 hash; } flow_table, ufid_table; - int stats_last_writer; /* NUMA-node id of the last writer on + int stats_last_writer; /* CPU id of the last writer on * 'stats[0]'. */ struct sw_flow_key key; struct sw_flow_id id; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + struct flow_stats __rcu *stats[]; /* One for each CPU. First one * is allocated at flow creation time, * the rest are allocated on demand * while holding the 'stats[0].lock'. diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 0bb650f4f219..ae25ded82b3b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, ip_tunnel_info_af(tun_info)); } +static int encode_vlan_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *a[], + bool is_mask, bool inner) +{ + __be16 tci = 0; + __be16 tpid = 0; + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (likely(!inner)) { + SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); + } else { + SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); + } + return 0; +} + +static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { + /* Not a VLAN. */ + return 0; + } + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (tci) { + OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { + /* Corner case for truncated VLAN header. */ + OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + } + + return 1; +} + +static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + __be16 tpid = 0; + bool encap_valid = !!(match->key->eth.vlan.tci & + htons(VLAN_TAG_PRESENT)); + bool i_encap_valid = !!(match->key->eth.cvlan.tci & + htons(VLAN_TAG_PRESENT)); + + if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { + /* Not a VLAN. */ + return 0; + } + + if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { + OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (tpid != htons(0xffff)) { + OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", + (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); + return -EINVAL; + } + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + return 1; +} + +static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, bool inner, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + const struct nlattr *encap; + + if (!is_mask) + err = validate_vlan_from_nlattrs(match, *key_attrs, inner, + a, log); + else + err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, + a, log); + if (err <= 0) + return err; + + err = encode_vlan_from_nlattrs(match, a, is_mask, inner); + if (err) + return err; + + *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + + encap = a[OVS_KEY_ATTR_ENCAP]; + + if (!is_mask) + err = parse_flow_nlattrs(encap, a, key_attrs, log); + else + err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); + + return err; +} + +static int parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + bool encap_valid = false; + + err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, + is_mask, log); + if (err) + return err; + + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + if (encap_valid) { + err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, + is_mask, log); + if (err) + return err; + } + + return 0; +} + static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask, bool log) @@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, } if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); - else - OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. + */ + OVS_NLERR(log, "VLAN attribute unexpected."); + return -EINVAL; } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { @@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; struct nlattr *newmask = NULL; u64 key_attrs = 0; u64 mask_attrs = 0; - bool encap_valid = false; int err; err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR(log, "Invalid Vlan frame."); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs, log); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) { - OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); - return -EINVAL; - } - } else { - OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); - return -EINVAL; - } - } + err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); + if (err) + return err; err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) @@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, goto free_newmask; /* Always match on tci. */ - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); - err = -EINVAL; - goto free_newmask; - } + SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, - &mask_attrs, log); - if (err) - goto free_newmask; - } else { - OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", - ntohs(eth_type)); - err = -EINVAL; - goto free_newmask; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", - ntohs(tci)); - err = -EINVAL; - goto free_newmask; - } - } + err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); + if (err) + goto free_newmask; err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, log); @@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } +static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, + bool is_mask) +{ + __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff); + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) + return -EMSGSIZE; + return 0; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; + struct nlattr *nla; + struct nlattr *encap = NULL; + struct nlattr *in_encap = NULL; if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) goto nla_put_failure; @@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(eth_key->eth_src, output->eth.src); ether_addr_copy(eth_key->eth_dst, output->eth.dst); - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) + if (!swkey->eth.vlan.tci) goto unencap; - } else - encap = NULL; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } + } if (swkey->eth.type == htons(ETH_P_802_2)) { /* @@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; + if (eth_type_vlan(swkey->eth.type)) { + /* There are 3 VLAN tags, we don't know anything about the rest + * of the packet, so truncate here. + */ + WARN_ON_ONCE(!(encap && in_encap)); + goto unencap; + } + if (swkey->eth.type == htons(ETH_P_IP)) { struct ovs_key_ipv4 *ipv4_key; @@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, } unencap: + if (in_encap) + nla_nest_end(skb, in_encap); if (encap) nla_nest_end(skb, encap); @@ -1882,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, + bool reset_key, struct sw_flow_mask *mask) { memset(match, 0, sizeof(*match)); match->key = key; match->mask = mask; - memset(key, 0, sizeof(*key)); + if (reset_key) + memset(key, 0, sizeof(*key)); if (mask) { memset(&mask->key, 0, sizeof(mask->key)); @@ -1935,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct nlattr *a; int err = 0, start, opts_type; - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -2229,6 +2345,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2255,6 +2372,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_TRUNC: { + const struct ovs_action_trunc *trunc = nla_data(a); + + if (trunc->max_len < ETH_HLEN) + return -EINVAL; + break; + } + case OVS_ACTION_ATTR_HASH: { const struct ovs_action_hash *act_hash = nla_data(a); @@ -2274,7 +2399,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_PUSH_VLAN: vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; @@ -2379,7 +2504,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, - key->eth.tci, log); + key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 47dd142eca1c..45f9769e5aac 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void); size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); + struct sw_flow_key *key, bool reset_key, + struct sw_flow_mask *mask); int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d073fff82fdb..ea7a8073fa02 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -32,6 +32,7 @@ #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> +#include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> @@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; struct flow_stats *stats; - int node; - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); - flow->sf_acts = NULL; - flow->mask = NULL; - flow->id.unmasked_key = NULL; - flow->id.ufid_len = 0; - flow->stats_last_writer = NUMA_NO_NODE; + flow->stats_last_writer = -1; /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, @@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); - for_each_node(node) - if (node != 0) - RCU_INIT_POINTER(flow->stats[node], NULL); - return flow; err: kmem_cache_free(flow_cache, flow); @@ -142,16 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) static void flow_free(struct sw_flow *flow) { - int node; + int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); - for_each_node(node) - if (flow->stats[node]) + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) + if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, - (struct flow_stats __force *)flow->stats[node]); + (struct flow_stats __force *)flow->stats[cpu]); kmem_cache_free(flow_cache, flow); } @@ -756,7 +749,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (nr_node_ids + + (nr_cpu_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 1a1fcec88695..5aaf3babfc3f 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,14 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + rtnl_unlock(); return vport; error: diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 7f8897f33a67..0e72d95b0e8f 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -54,6 +54,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct net_device *dev; struct vport *vport; + int err; vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); if (IS_ERR(vport)) @@ -67,9 +68,15 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); - rtnl_unlock(); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_PTR(err); + } + rtnl_unlock(); return vport; } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 2ee48e447b72..95c36147a6e1 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -140,7 +140,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) static void internal_set_rx_headroom(struct net_device *dev, int new_hr) { - dev->needed_headroom = new_hr; + dev->needed_headroom = new_hr < 0 ? 0 : new_hr; } static const struct net_device_ops internal_dev_netdev_ops = { @@ -195,7 +195,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) } vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, do_setup); + parms->name, NET_NAME_USER, do_setup); if (!vport->dev) { err = -ENOMEM; goto error_free_vport; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5eb7694348b5..7eb955e453e6 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -130,7 +130,14 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + rtnl_delete_link(dev); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + rtnl_unlock(); return vport; error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 31cbc8c5c7db..8f198437c724 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -444,6 +444,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; + OVS_CB(skb)->cutlen = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; @@ -484,9 +485,14 @@ static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; - if (skb->protocol == htons(ETH_P_8021Q)) + if (skb_vlan_tagged(skb)) length -= VLAN_HLEN; + /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow + * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none + * account for 802.1ad. e.g. is_skb_forwardable(). + */ + return length; } |