summaryrefslogtreecommitdiff
path: root/net/openvswitch
diff options
context:
space:
mode:
Diffstat (limited to 'net/openvswitch')
-rw-r--r--net/openvswitch/actions.c176
-rw-r--r--net/openvswitch/conntrack.c326
-rw-r--r--net/openvswitch/conntrack.h14
-rw-r--r--net/openvswitch/datapath.c34
-rw-r--r--net/openvswitch/datapath.h2
-rw-r--r--net/openvswitch/flow.c153
-rw-r--r--net/openvswitch/flow.h77
-rw-r--r--net/openvswitch/flow_netlink.c274
-rw-r--r--net/openvswitch/flow_netlink.h7
-rw-r--r--net/openvswitch/vport-internal_dev.c16
-rw-r--r--net/openvswitch/vport-netdev.c10
-rw-r--r--net/openvswitch/vport.c48
-rw-r--r--net/openvswitch/vport.h3
13 files changed, 806 insertions, 334 deletions
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 4e03f64709bc..c82301ce3fff 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -62,9 +62,11 @@ struct ovs_frag_data {
struct vport *vport;
struct ovs_skb_cb cb;
__be16 inner_protocol;
- __u16 vlan_tci;
+ u16 network_offset; /* valid only for MPLS */
+ u16 vlan_tci;
__be16 vlan_proto;
unsigned int l2_len;
+ u8 mac_proto;
u8 l2_data[MAX_L2_LEN];
};
@@ -136,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
static void invalidate_flow_key(struct sw_flow_key *key)
{
- key->eth.type = htons(0);
+ key->mac_proto |= SW_FLOW_KEY_INVALID;
}
static bool is_flow_key_valid(const struct sw_flow_key *key)
{
- return !!key->eth.type;
+ return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}
static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
@@ -185,7 +187,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
skb->protocol = mpls->mpls_ethertype;
invalidate_flow_key(key);
@@ -195,7 +198,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const __be16 ethertype)
{
- struct ethhdr *hdr;
int err;
err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
@@ -211,11 +213,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
- /* mpls_hdr() is used to locate the ethertype field correctly in the
- * presence of VLAN tags.
- */
- hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
- update_ethertype(skb, hdr, ethertype);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
+ struct ethhdr *hdr;
+
+ /* mpls_hdr() is used to locate the ethertype field correctly in the
+ * presence of VLAN tags.
+ */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ update_ethertype(skb, hdr, ethertype);
+ }
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
@@ -311,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
+/* pop_eth does not support VLAN packets as this action is never called
+ * for them.
+ */
+static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_NONE;
+ invalidate_flow_key(key);
+ return 0;
+}
+
+static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_action_push_eth *ethh)
+{
+ struct ethhdr *hdr;
+
+ /* Add the new Ethernet header */
+ if (skb_cow_head(skb, ETH_HLEN) < 0)
+ return -ENOMEM;
+
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ hdr = eth_hdr(skb);
+ ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
+ ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
+ hdr->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, hdr, ETH_HLEN);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_ETHERNET;
+ invalidate_flow_key(key);
+ return 0;
+}
+
static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
__be32 addr, __be32 new_addr)
{
@@ -666,7 +713,13 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
skb_postpush_rcsum(skb, skb->data, data->l2_len);
skb_reset_mac_header(skb);
- ovs_vport_send(vport, skb);
+ if (eth_p_mpls(skb->protocol)) {
+ skb->inner_network_header = skb->network_header;
+ skb_set_network_header(skb, data->network_offset);
+ skb_reset_mac_len(skb);
+ }
+
+ ovs_vport_send(vport, skb, data->mac_proto);
return 0;
}
@@ -684,7 +737,8 @@ static struct dst_ops ovs_dst_ops = {
/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
* ovs_vport_output(), which is called once per fragmented packet.
*/
-static void prepare_frag(struct vport *vport, struct sk_buff *skb)
+static void prepare_frag(struct vport *vport, struct sk_buff *skb,
+ u16 orig_network_offset, u8 mac_proto)
{
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
@@ -694,8 +748,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
data->vport = vport;
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
+ data->network_offset = orig_network_offset;
data->vlan_tci = skb->vlan_tci;
data->vlan_proto = skb->vlan_proto;
+ data->mac_proto = mac_proto;
data->l2_len = hlen;
memcpy(&data->l2_data, skb->data, hlen);
@@ -704,18 +760,27 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
}
static void ovs_fragment(struct net *net, struct vport *vport,
- struct sk_buff *skb, u16 mru, __be16 ethertype)
+ struct sk_buff *skb, u16 mru,
+ struct sw_flow_key *key)
{
+ u16 orig_network_offset = 0;
+
+ if (eth_p_mpls(skb->protocol)) {
+ orig_network_offset = skb_network_offset(skb);
+ skb->network_header = skb->inner_network_header;
+ }
+
if (skb_network_offset(skb) > MAX_L2_LEN) {
OVS_NLERR(1, "L2 header too long to fragment");
goto err;
}
- if (ethertype == htons(ETH_P_IP)) {
+ if (key->eth.type == htons(ETH_P_IP)) {
struct dst_entry ovs_dst;
unsigned long orig_dst;
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_dst.dev = vport->dev;
@@ -726,16 +791,16 @@ static void ovs_fragment(struct net *net, struct vport *vport,
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
- } else if (ethertype == htons(ETH_P_IPV6)) {
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops) {
+ if (!v6ops)
goto err;
- }
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -749,7 +814,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
- ovs_vport_name(vport), ntohs(ethertype), mru,
+ ovs_vport_name(vport), ntohs(key->eth.type), mru,
vport->dev->mtu);
goto err;
}
@@ -769,26 +834,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
u32 cutlen = OVS_CB(skb)->cutlen;
if (unlikely(cutlen > 0)) {
- if (skb->len - cutlen > ETH_HLEN)
+ if (skb->len - cutlen > ovs_mac_header_len(key))
pskb_trim(skb, skb->len - cutlen);
else
- pskb_trim(skb, ETH_HLEN);
+ pskb_trim(skb, ovs_mac_header_len(key));
}
- if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
- ovs_vport_send(vport, skb);
+ if (likely(!mru ||
+ (skb->len <= mru + vport->dev->hard_header_len))) {
+ ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
} else if (mru <= vport->dev->mtu) {
struct net *net = read_pnet(&dp->net);
- __be16 ethertype = key->eth.type;
- if (!is_flow_key_valid(key)) {
- if (eth_p_mpls(skb->protocol))
- ethertype = skb->inner_protocol;
- else
- ethertype = vlan_get_protocol(skb);
- }
-
- ovs_fragment(net, vport, skb, mru, ethertype);
+ ovs_fragment(net, vport, skb, mru, key);
} else {
kfree_skb(skb);
}
@@ -1015,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_CT_ZONE:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
err = -EINVAL;
break;
}
@@ -1082,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len)
{
- /* Every output action needs a separate clone of 'skb', but the common
- * case is just a single output action, so that doing a clone and
- * then freeing the original skbuff is wasteful. So the following code
- * is slightly obscure just to avoid that.
- */
- int prev_port = -1;
const struct nlattr *a;
int rem;
@@ -1095,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
a = nla_next(a, &rem)) {
int err = 0;
- if (unlikely(prev_port != -1)) {
- struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
- if (out_skb)
- do_output(dp, out_skb, prev_port, key);
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT: {
+ int port = nla_get_u32(a);
+ struct sk_buff *clone;
+
+ /* Every output action needs a separate clone
+ * of 'skb', In case the output action is the
+ * last action, cloning can be avoided.
+ */
+ if (nla_is_last(a, rem)) {
+ do_output(dp, skb, port, key);
+ /* 'skb' has been used for output.
+ */
+ return 0;
+ }
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ do_output(dp, clone, port, key);
OVS_CB(skb)->cutlen = 0;
- prev_port = -1;
- }
-
- switch (nla_type(a)) {
- case OVS_ACTION_ATTR_OUTPUT:
- prev_port = nla_get_u32(a);
break;
+ }
case OVS_ACTION_ATTR_TRUNC: {
struct ovs_action_trunc *trunc = nla_data(a);
@@ -1182,6 +1244,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
if (err)
return err == -EINPROGRESS ? 0 : err;
break;
+
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ err = push_eth(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ err = pop_eth(skb, key);
+ break;
}
if (unlikely(err)) {
@@ -1190,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}
}
- if (prev_port != -1)
- do_output(dp, skb, prev_port, key);
- else
- consume_skb(skb);
-
+ consume_skb(skb);
return 0;
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index fecefa2dc94e..7b2c2fce408a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
struct nf_conn *ct;
u8 commit : 1;
u8 nat : 3; /* enum ovs_ct_nat */
+ u8 force : 1;
u16 family;
struct md_mark mark;
struct md_labels labels;
@@ -73,6 +74,8 @@ struct ovs_conntrack_info {
#endif
};
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
+
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
#endif
}
+/* Guard against conntrack labels max size shrinking below 128 bits. */
+#if NF_CT_LABELS_MAX_SIZE < 16
+#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
+#endif
+
static void ovs_ct_get_labels(const struct nf_conn *ct,
struct ovs_key_ct_labels *labels)
{
struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
- if (cl) {
- size_t len = sizeof(cl->bits);
+ if (cl)
+ memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
+ else
+ memset(labels, 0, OVS_CT_LABELS_LEN);
+}
- if (len > OVS_CT_LABELS_LEN)
- len = OVS_CT_LABELS_LEN;
- else if (len < OVS_CT_LABELS_LEN)
- memset(labels, 0, OVS_CT_LABELS_LEN);
- memcpy(labels, cl->bits, len);
+static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
+ const struct nf_conntrack_tuple *orig,
+ u8 icmp_proto)
+{
+ key->ct_orig_proto = orig->dst.protonum;
+ if (orig->dst.protonum == icmp_proto) {
+ key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
+ key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
} else {
- memset(labels, 0, OVS_CT_LABELS_LEN);
+ key->ct.orig_tp.src = orig->src.u.all;
+ key->ct.orig_tp.dst = orig->dst.u.all;
}
}
@@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
const struct nf_conntrack_zone *zone,
const struct nf_conn *ct)
{
- key->ct.state = state;
- key->ct.zone = zone->id;
+ key->ct_state = state;
+ key->ct_zone = zone->id;
key->ct.mark = ovs_ct_get_mark(ct);
ovs_ct_get_labels(ct, &key->ct.labels);
+
+ if (ct) {
+ const struct nf_conntrack_tuple *orig;
+
+ /* Use the master if we have one. */
+ if (ct->master)
+ ct = ct->master;
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+ /* IP version must match with the master connection. */
+ if (key->eth.type == htons(ETH_P_IP) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4) {
+ key->ipv4.ct_orig.src = orig->src.u3.ip;
+ key->ipv4.ct_orig.dst = orig->dst.u3.ip;
+ __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
+ return;
+ } else if (key->eth.type == htons(ETH_P_IPV6) &&
+ !sw_flow_key_is_nd(key) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV6) {
+ key->ipv6.ct_orig.src = orig->src.u3.in6;
+ key->ipv6.ct_orig.dst = orig->dst.u3.in6;
+ __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
+ return;
+ }
+ }
+ /* Clear 'ct_orig_proto' to mark the non-existence of conntrack
+ * original direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
* previously sent the packet to conntrack via the ct action. If
* 'keep_nat_flags' is true, the existing NAT flags retained, else they are
* initialized from the connection status.
@@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
if (ct->master)
state |= OVS_CS_F_RELATED;
if (keep_nat_flags) {
- state |= key->ct.state & OVS_CS_F_NAT_MASK;
+ state |= key->ct_state & OVS_CS_F_NAT_MASK;
} else {
if (ct->status & IPS_SRC_NAT)
state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
ovs_ct_update_key(skb, NULL, key, false, false);
}
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+#define IN6_ADDR_INITIALIZER(ADDR) \
+ { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
+ (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
+
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
{
- if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
- nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+ nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
- nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+ nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
- nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
- &key->ct.labels))
+ nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
+ &output->ct.labels))
return -EMSGSIZE;
+ if (swkey->ct_orig_proto) {
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ct_tuple_ipv4 orig = {
+ output->ipv4.ct_orig.src,
+ output->ipv4.ct_orig.dst,
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ct_tuple_ipv6 orig = {
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ }
+ }
+
return 0;
}
-static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
u32 ct_mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
u32 new_mark;
- /* The connection could be invalid, in which case set_mark is no-op. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
-
new_mark = ct_mark | (ct->mark & ~(mask));
if (ct->mark != new_mark) {
ct->mark = new_mark;
- nf_conntrack_event_cache(IPCT_MARK, ct);
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
key->ct.mark = new_mark;
}
@@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
#endif
}
-static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
- const struct ovs_key_ct_labels *labels,
- const struct ovs_key_ct_labels *mask)
+static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
- enum ip_conntrack_info ctinfo;
struct nf_conn_labels *cl;
- struct nf_conn *ct;
- int err;
-
- /* The connection could be invalid, in which case set_label is no-op.*/
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
cl = nf_ct_labels_find(ct);
if (!cl) {
nf_ct_labels_ext_add(ct);
cl = nf_ct_labels_find(ct);
}
- if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
+
+ return cl;
+}
+
+/* Initialize labels for a new, yet to be committed conntrack entry. Note that
+ * since the new connection is not yet confirmed, and thus no-one else has
+ * access to it's labels, we simply write them over.
+ */
+static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl, *master_cl;
+ bool have_mask = labels_nonzero(mask);
+
+ /* Inherit master's labels to the related connection? */
+ master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+ if (!master_cl && !have_mask)
+ return 0; /* Nothing to do. */
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
return -ENOSPC;
- err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
- OVS_CT_LABELS_LEN / sizeof(u32));
+ /* Inherit the master's labels, if any. */
+ if (master_cl)
+ *cl = *master_cl;
+
+ if (have_mask) {
+ u32 *dst = (u32 *)cl->bits;
+ int i;
+
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+ (labels->ct_labels_32[i]
+ & mask->ct_labels_32[i]);
+ }
+
+ /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+ * IPCT_LABEL bit it set in the event cache.
+ */
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
+ return 0;
+}
+
+static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl;
+ int err;
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
+ return -ENOSPC;
+
+ err = nf_connlabels_replace(ct, labels->ct_labels_32,
+ mask->ct_labels_32,
+ OVS_CT_LABELS_LEN_32);
if (err)
return err;
- ovs_ct_get_labels(ct, &key->ct.labels);
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
return 0;
}
@@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- skb_orphan(skb);
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err) {
@@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
/* Find an existing connection which this packet belongs to without
* re-attributing statistics or modifying the connection state. This allows an
- * skb->nfct lost due to an upcall to be recovered during actions execution.
+ * skb->_nfct lost due to an upcall to be recovered during actions execution.
*
* Must be called with rcu_read_lock.
*
- * On success, populates skb->nfct and skb->nfctinfo, and returns the
- * connection. Returns NULL if there is no existing entry.
+ * On success, populates skb->_nfct and returns the connection. Returns NULL
+ * if there is no existing entry.
*/
static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
- u8 l3num, struct sk_buff *skb)
+ u8 l3num, struct sk_buff *skb, bool natted)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
return NULL;
}
+ /* Must invert the tuple if skb has been transformed by NAT. */
+ if (natted) {
+ struct nf_conntrack_tuple inverse;
+
+ if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+ pr_debug("ovs_ct_find_existing: Inversion failed!\n");
+ return NULL;
+ }
+ tuple = inverse;
+ }
+
/* look for tuple match */
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
@@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
ct = nf_ct_tuplehash_to_ctrack(h);
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = ovs_ct_get_info(h);
+ /* Inverted packet tuple matches the reverse direction conntrack tuple,
+ * select the other tuplehash to get the right 'ctinfo' bits for this
+ * packet.
+ */
+ if (natted)
+ h = &ct->tuplehash[!h->tuple.dst.dir];
+
+ nf_ct_set(skb, ct, ovs_ct_get_info(h));
return ct;
}
-/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(struct net *net,
const struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net,
ct = nf_ct_get(skb, &ctinfo);
/* If no ct, check if we have evidence that an existing conntrack entry
- * might be found for this skb. This happens when we lose a skb->nfct
+ * might be found for this skb. This happens when we lose a skb->_nfct
* due to an upcall. If the connection was not confirmed, it is not
* cached and needs to be run through conntrack again.
*/
- if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
- !(key->ct.state & OVS_CS_F_INVALID) &&
- key->ct.zone == info->zone.id)
- ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
+ if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
+ !(key->ct_state & OVS_CS_F_INVALID) &&
+ key->ct_zone == info->zone.id) {
+ ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+ !!(key->ct_state
+ & OVS_CS_F_NAT_MASK));
+ if (ct)
+ nf_ct_get(skb, &ctinfo);
+ }
if (!ct)
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net,
if (help && rcu_access_pointer(help->helper) != info->helper)
return false;
}
+ /* Force conntrack entry direction to the current packet? */
+ if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ /* Delete the conntrack entry if confirmed, else just release
+ * the reference.
+ */
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_delete(ct, 0, 0);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, 0);
+ return false;
+ }
return true;
}
@@ -514,7 +665,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
int hooknum, nh_off, err = NF_ACCEPT;
nh_off = skb_network_offset(skb);
- skb_pull(skb, nh_off);
+ skb_pull_rcsum(skb, nh_off);
/* See HOOK2MANIP(). */
if (maniptype == NF_NAT_MANIP_SRC)
@@ -579,6 +730,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
push:
skb_push(skb, nh_off);
+ skb_postpush_rcsum(skb, skb->data, nh_off);
return err;
}
@@ -590,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
if (maniptype == NF_NAT_MANIP_SRC) {
__be16 src;
- key->ct.state |= OVS_CS_F_SRC_NAT;
+ key->ct_state |= OVS_CS_F_SRC_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.src = ip_hdr(skb)->saddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -612,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
} else {
__be16 dst;
- key->ct.state |= OVS_CS_F_DST_NAT;
+ key->ct_state |= OVS_CS_F_DST_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.dst = ip_hdr(skb)->daddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -699,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
* not done already. Update key with new CT state after passing the packet
* through conntrack.
- * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
* set to NULL and 0 will be returned.
*/
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -721,19 +873,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
/* Associate skb with specified zone. */
if (tmpl) {
- if (skb->nfct)
- nf_conntrack_put(skb->nfct);
+ if (skb_nfct(skb))
+ nf_conntrack_put(skb_nfct(skb));
nf_conntrack_get(&tmpl->ct_general);
- skb->nfct = &tmpl->ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
}
- /* Repeat if requested, see nf_iterate(). */
- do {
- err = nf_conntrack_in(net, info->family,
- NF_INET_PRE_ROUTING, skb);
- } while (err == NF_REPEAT);
-
+ err = nf_conntrack_in(net, info->family,
+ NF_INET_PRE_ROUTING, skb);
if (err != NF_ACCEPT)
return -ENOENT;
@@ -741,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* NAT after the nf_conntrack_in() call. We can actually clear
* the whole state, as it will be re-initialized below.
*/
- key->ct.state = 0;
+ key->ct_state = 0;
/* Update the key, but keep the NAT flags. */
ovs_ct_update_key(skb, info, key, true, true);
@@ -757,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*
* NAT will be done only if the CT action has NAT, and only
* once per packet (per zone), as guarded by the NAT bits in
- * the key->ct.state.
+ * the key->ct_state.
*/
- if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+ if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
(nf_ct_is_confirmed(ct) || info->commit) &&
ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
return -EINVAL;
@@ -823,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
if (err)
return err;
- ct = (struct nf_conn *)skb->nfct;
+ ct = (struct nf_conn *)skb_nfct(skb);
if (ct)
nf_ct_deliver_cached_events(ct);
}
@@ -835,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
{
size_t i;
- for (i = 0; i < sizeof(*labels); i++)
- if (labels->ct_labels[i])
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ if (labels->ct_labels_32[i])
return true;
return false;
@@ -847,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
int err;
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
+ /* The connection could be invalid, in which case this is a no-op.*/
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0;
+
/* Apply changes before confirming the connection so that the initial
* conntrack NEW netlink event carries the values given in the CT
* action.
*/
if (info->mark.mask) {
- err = ovs_ct_set_mark(skb, key, info->mark.value,
+ err = ovs_ct_set_mark(ct, key, info->mark.value,
info->mark.mask);
if (err)
return err;
}
- if (labels_nonzero(&info->labels.mask)) {
- err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ if (!nf_ct_is_confirmed(ct)) {
+ err = ovs_ct_init_labels(ct, key, &info->labels.value,
+ &info->labels.mask);
+ if (err)
+ return err;
+ } else if (labels_nonzero(&info->labels.mask)) {
+ err = ovs_ct_set_labels(ct, key, &info->labels.value,
&info->labels.mask);
if (err)
return err;
@@ -890,7 +1049,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
- skb_pull(skb, nh_ofs);
+ skb_pull_rcsum(skb, nh_ofs);
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
@@ -904,6 +1063,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_lookup(net, key, info, skb);
skb_push(skb, nh_ofs);
+ skb_postpush_rcsum(skb, skb->data, nh_ofs);
if (err)
kfree_skb(skb);
return err;
@@ -1065,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr,
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
+ [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 },
[OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
.maxlen = sizeof(u16) },
[OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
@@ -1104,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
}
switch (type) {
+ case OVS_CT_ATTR_FORCE_COMMIT:
+ info->force = true;
+ /* fall through. */
case OVS_CT_ATTR_COMMIT:
info->commit = true;
break;
@@ -1330,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
if (!start)
return -EMSGSIZE;
- if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+ if (ct_info->commit && nla_put_flag(skb, ct_info->force
+ ? OVS_CT_ATTR_FORCE_COMMIT
+ : OVS_CT_ATTR_COMMIT))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
const struct ovs_conntrack_info *);
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb);
void ovs_ct_free_action(const struct nlattr *a);
#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
static inline void ovs_ct_fill_key(const struct sk_buff *skb,
struct sw_flow_key *key)
{
- key->ct.state = 0;
- key->ct.zone = 0;
+ key->ct_state = 0;
+ key->ct_zone = 0;
key->ct.mark = 0;
memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+ /* Clear 'ct_orig_proto' to mark the non-existence of original
+ * direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output,
struct sk_buff *skb)
{
return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d67ea856067..9c62b6325f7a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -58,8 +58,7 @@
#include "vport-internal_dev.h"
#include "vport-netdev.h"
-int ovs_net_id __read_mostly;
-EXPORT_SYMBOL_GPL(ovs_net_id);
+unsigned int ovs_net_id __read_mostly;
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
@@ -131,7 +130,6 @@ int lockdep_ovsl_is_held(void)
else
return 1;
}
-EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif
static struct vport *new_vport(const struct vport_parms *);
@@ -562,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
- struct ethhdr *eth;
struct vport *input_vport;
u16 mru = 0;
int len;
@@ -583,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
- skb_reset_mac_header(packet);
- eth = eth_hdr(packet);
-
- /* Normally, setting the skb 'protocol' field would be handled by a
- * call to eth_type_trans(), but it assumes there's a sending
- * device, which we may not have. */
- if (eth_proto_is_802_3(eth->h_proto))
- packet->protocol = eth->h_proto;
- else
- packet->protocol = htons(ETH_P_802_2);
-
/* Set packet's mru */
if (a[OVS_PACKET_ATTR_MRU]) {
mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
@@ -672,8 +658,7 @@ static const struct genl_ops dp_packet_genl_ops[] = {
}
};
-static struct genl_family dp_packet_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_packet_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
@@ -682,6 +667,7 @@ static struct genl_family dp_packet_genl_family = {
.parallel_ops = true,
.ops = dp_packet_genl_ops,
.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .module = THIS_MODULE,
};
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
@@ -1437,8 +1423,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
},
};
-static struct genl_family dp_flow_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_flow_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
@@ -1449,6 +1434,7 @@ static struct genl_family dp_flow_genl_family = {
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static size_t ovs_dp_cmd_msg_size(void)
@@ -1823,8 +1809,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
},
};
-static struct genl_family dp_datapath_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_datapath_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
@@ -1835,6 +1820,7 @@ static struct genl_family dp_datapath_genl_family = {
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
/* Called with ovs_mutex or RCU read lock. */
@@ -2245,8 +2231,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
},
};
-struct genl_family dp_vport_genl_family = {
- .id = GENL_ID_GENERATE,
+struct genl_family dp_vport_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
@@ -2257,6 +2242,7 @@ struct genl_family dp_vport_genl_family = {
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static struct genl_family * const dp_genl_families[] = {
@@ -2274,7 +2260,7 @@ static void dp_unregister_genl(int n_families)
genl_unregister_family(dp_genl_families[i]);
}
-static int dp_register_genl(void)
+static int __init dp_register_genl(void)
{
int err;
int i;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ab85c1cae255..1c6e9377436d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,7 +144,7 @@ struct ovs_net {
bool xt_label;
};
-extern int ovs_net_id;
+extern unsigned int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 22087062bd10..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
* Returns 0 if it encounters a non-vlan or incomplete packet.
* Returns 1 after successfully parsing vlan tag.
*/
-static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
+ bool untag_vlan)
{
struct vlan_head *vh = (struct vlan_head *)skb->data;
@@ -330,31 +331,47 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
key_vh->tpid = vh->tpid;
- __skb_pull(skb, sizeof(struct vlan_head));
+ if (unlikely(untag_vlan)) {
+ int offset = skb->data - skb_mac_header(skb);
+ u16 tci;
+ int err;
+
+ __skb_push(skb, offset);
+ err = __skb_vlan_pop(skb, &tci);
+ __skb_pull(skb, offset);
+ if (err)
+ return err;
+ __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
+ } else {
+ __skb_pull(skb, sizeof(struct vlan_head));
+ }
return 1;
}
-static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+static void clear_vlan(struct sw_flow_key *key)
{
- int res;
-
key->eth.vlan.tci = 0;
key->eth.vlan.tpid = 0;
key->eth.cvlan.tci = 0;
key->eth.cvlan.tpid = 0;
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int res;
if (skb_vlan_tag_present(skb)) {
key->eth.vlan.tci = htons(skb->vlan_tci);
key->eth.vlan.tpid = skb->vlan_proto;
} else {
/* Parse outer vlan tag in the non-accelerated case. */
- res = parse_vlan_tag(skb, &key->eth.vlan);
+ res = parse_vlan_tag(skb, &key->eth.vlan, true);
if (res <= 0)
return res;
}
/* Parse inner vlan tag. */
- res = parse_vlan_tag(skb, &key->eth.cvlan);
+ res = parse_vlan_tag(skb, &key->eth.cvlan, false);
if (res <= 0)
return res;
@@ -483,17 +500,20 @@ invalid:
*
* Returns 0 if successful, otherwise a negative errno value.
*
- * Initializes @skb header pointers as follows:
+ * Initializes @skb header fields as follows:
*
- * - skb->mac_header: the Ethernet header.
+ * - skb->mac_header: the L2 header.
*
- * - skb->network_header: just past the Ethernet header, or just past the
- * VLAN header, to the first byte of the Ethernet payload.
+ * - skb->network_header: just past the L2 header, or just past the
+ * VLAN header, to the first byte of the L2 payload.
*
* - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
* on output, then just past the IP header, if one is present and
* of a correct length, otherwise the same as skb->network_header.
* For other key->eth.type values it is left untouched.
+ *
+ * - skb->protocol: the type of the data starting at skb->network_header.
+ * Equals to key->eth.type.
*/
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -505,28 +525,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
skb_reset_mac_header(skb);
- /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
- * header in the linear data area.
- */
- eth = eth_hdr(skb);
- ether_addr_copy(key->eth.src, eth->h_source);
- ether_addr_copy(key->eth.dst, eth->h_dest);
+ /* Link layer. */
+ clear_vlan(key);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
+ if (unlikely(eth_type_vlan(skb->protocol)))
+ return -EINVAL;
- __skb_pull(skb, 2 * ETH_ALEN);
- /* We are going to push all headers that we pull, so no need to
- * update skb->csum here.
- */
+ skb_reset_network_header(skb);
+ } else {
+ eth = eth_hdr(skb);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
- if (unlikely(parse_vlan(skb, key)))
- return -ENOMEM;
+ __skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
- key->eth.type = parse_ethertype(skb);
- if (unlikely(key->eth.type == htons(0)))
- return -ENOMEM;
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
+
+ skb->protocol = parse_ethertype(skb);
+ if (unlikely(skb->protocol == htons(0)))
+ return -ENOMEM;
- skb_reset_network_header(skb);
+ skb_reset_network_header(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+ }
skb_reset_mac_len(skb);
- __skb_push(skb, skb->data - skb_mac_header(skb));
+ key->eth.type = skb->protocol;
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
@@ -718,12 +745,34 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
- return key_extract(skb, key);
+ int res;
+
+ res = key_extract(skb, key);
+ if (!res)
+ key->mac_proto &= ~SW_FLOW_KEY_INVALID;
+
+ return res;
+}
+
+static int key_extract_mac_proto(struct sk_buff *skb)
+{
+ switch (skb->dev->type) {
+ case ARPHRD_ETHER:
+ return MAC_PROTO_ETHERNET;
+ case ARPHRD_NONE:
+ if (skb->protocol == htons(ETH_P_TEB))
+ return MAC_PROTO_ETHERNET;
+ return MAC_PROTO_NONE;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
+ int res, err;
+
/* Extract metadata from packet. */
if (tun_info) {
key->tun_proto = ip_tunnel_info_af(tun_info);
@@ -749,23 +798,61 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->phy.priority = skb->priority;
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
key->phy.skb_mark = skb->mark;
- ovs_ct_fill_key(skb, key);
key->ovs_flow_hash = 0;
+ res = key_extract_mac_proto(skb);
+ if (res < 0)
+ return res;
+ key->mac_proto = res;
key->recirc_id = 0;
- return key_extract(skb, key);
+ err = key_extract(skb, key);
+ if (!err)
+ ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
+ return err;
}
int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
struct sk_buff *skb,
struct sw_flow_key *key, bool log)
{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
int err;
+ err = parse_flow_nlattrs(attr, a, &attrs, log);
+ if (err)
+ return -EINVAL;
+
/* Extract metadata from netlink attributes. */
- err = ovs_nla_get_flow_metadata(net, attr, key, log);
+ err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
if (err)
return err;
- return key_extract(skb, key);
+ /* key_extract assumes that skb->protocol is set-up for
+ * layer 3 packets which is the case for other callers,
+ * in particular packets received from the network stack.
+ * Here the correct value can be set from the metadata
+ * extracted above.
+ * For L2 packet key eth type would be zero. skb protocol
+ * would be set to correct value later during key-extact.
+ */
+
+ skb->protocol = key->eth.type;
+ err = key_extract(skb, key);
+ if (err)
+ return err;
+
+ /* Check that we have conntrack original direction tuple metadata only
+ * for packets for which it makes sense. Otherwise the key may be
+ * corrupted due to overlapping key fields.
+ */
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
+ key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
+ (key->eth.type != htons(ETH_P_IPV6) ||
+ sw_flow_key_is_nd(key)))
+ return -EINVAL;
+
+ return 0;
}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index ae783f5c6695..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -37,6 +37,12 @@
struct sk_buff;
+enum sw_flow_mac_proto {
+ MAC_PROTO_NONE = 0,
+ MAC_PROTO_ETHERNET,
+};
+#define SW_FLOW_KEY_INVALID 0x80
+
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -68,6 +74,7 @@ struct sw_flow_key {
u32 skb_mark; /* SKB mark. */
u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
} __packed phy; /* Safe when right after 'tun_key'. */
+ u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */
u8 tun_proto; /* Protocol of encapsulating tunnel. */
u32 ovs_flow_hash; /* Datapath computed hash value. */
u32 recirc_id; /* Recirculation ID. */
@@ -78,6 +85,11 @@ struct sw_flow_key {
struct vlan_head cvlan;
__be16 type; /* Ethernet frame type. */
} eth;
+ /* Filling a hole of two bytes. */
+ u8 ct_state;
+ u8 ct_orig_proto; /* CT original direction tuple IP
+ * protocol.
+ */
union {
struct {
__be32 top_lse; /* top label stack entry */
@@ -89,6 +101,7 @@ struct sw_flow_key {
u8 frag; /* One of OVS_FRAG_TYPE_*. */
} ip;
};
+ u16 ct_zone; /* Conntrack zone. */
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
@@ -100,10 +113,16 @@ struct sw_flow_key {
__be32 src; /* IP source address. */
__be32 dst; /* IP destination address. */
} addr;
- struct {
- u8 sha[ETH_ALEN]; /* ARP source hardware address. */
- u8 tha[ETH_ALEN]; /* ARP target hardware address. */
- } arp;
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
+ };
} ipv4;
struct {
struct {
@@ -111,23 +130,40 @@ struct sw_flow_key {
struct in6_addr dst; /* IPv6 destination address. */
} addr;
__be32 label; /* IPv6 flow label. */
- struct {
- struct in6_addr target; /* ND target address. */
- u8 sll[ETH_ALEN]; /* ND source link layer address. */
- u8 tll[ETH_ALEN]; /* ND target link layer address. */
- } nd;
+ union {
+ struct {
+ struct in6_addr src;
+ struct in6_addr dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ struct in6_addr target; /* ND target address. */
+ u8 sll[ETH_ALEN]; /* ND source link layer address. */
+ u8 tll[ETH_ALEN]; /* ND target link layer address. */
+ } nd;
+ };
} ipv6;
};
struct {
- /* Connection tracking fields. */
- u16 zone;
+ /* Connection tracking fields not packed above. */
+ struct {
+ __be16 src; /* CT orig tuple tp src port. */
+ __be16 dst; /* CT orig tuple tp dst port. */
+ } orig_tp;
u32 mark;
- u8 state;
struct ovs_key_ct_labels labels;
} ct;
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
+{
+ return key->eth.type == htons(ETH_P_IPV6) &&
+ key->ip.proto == NEXTHDR_ICMP &&
+ key->tp.dst == 0 &&
+ (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
+}
+
struct sw_flow_key_range {
unsigned short int start;
unsigned short int end;
@@ -206,6 +242,21 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
+static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key)
+{
+ return key->mac_proto & ~SW_FLOW_KEY_INVALID;
+}
+
+static inline u16 __ovs_mac_header_len(u8 mac_proto)
+{
+ return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0;
+}
+
+static inline u16 ovs_mac_header_len(const struct sw_flow_key *key)
+{
+ return __ovs_mac_header_len(ovs_key_mac_proto(key));
+}
+
static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid)
{
return sfid->ufid_len;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ae25ded82b3b..1105a838bab8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -123,13 +123,15 @@ static void update_range(struct sw_flow_match *match,
static bool match_validate(const struct sw_flow_match *match,
u64 key_attrs, u64 mask_attrs, bool log)
{
- u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 key_expected = 0;
u64 mask_allowed = key_attrs; /* At most allow all key attributes */
/* The following mask attributes allowed only if they
* pass the validation tests. */
mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
| (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
| (1 << OVS_KEY_ATTR_TCP)
| (1 << OVS_KEY_ATTR_TCP_FLAGS)
| (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IP)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV4;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IPV6)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV6;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
htons(NDISC_NEIGHBOUR_SOLICITATION) ||
match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
key_expected |= 1 << OVS_KEY_ATTR_ND;
+ /* Original direction conntrack tuple
+ * uses the same space as the ND fields
+ * in the key, so both are not allowed
+ * at the same time.
+ */
+ mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
if (match->mask && (match->mask->key.tp.src == htons(0xff)))
mask_allowed |= 1 << OVS_KEY_ATTR_ND;
}
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
* updating this function.
*/
- BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
+ BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
+ nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
+ nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
+ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
[OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv4) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv6) },
};
static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
return __parse_flow_nlattrs(attr, a, attrsp, log, true);
}
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp,
- bool log)
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log)
{
return __parse_flow_nlattrs(attr, a, attrsp, log, false);
}
@@ -588,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
ipv4 = true;
break;
case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
nla_get_in6_addr(a), is_mask);
ipv6 = true;
break;
@@ -649,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_VXLAN_OPT;
opts_type = type;
break;
+ case OVS_TUNNEL_KEY_ATTR_PAD:
+ break;
default:
OVS_NLERR(log, "Unknown IP tunnel attribute %d",
type);
@@ -969,10 +987,33 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
return 0;
}
+static int parse_eth_type_from_nlattrs(struct sw_flow_match *match,
+ u64 *attrs, const struct nlattr **a,
+ bool is_mask, bool log)
+{
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (!eth_proto_is_802_3(eth_type)) {
+ OVS_NLERR(log, "EtherType %x is less than min %x",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ return 0;
+}
+
static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
u64 *attrs, const struct nlattr **a,
bool is_mask, bool log)
{
+ u8 mac_proto = MAC_PROTO_ETHERNET;
+
if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
@@ -1033,14 +1074,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
- SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1059,6 +1100,49 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
sizeof(*cl), is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
}
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
+ const struct ovs_key_ct_tuple_ipv4 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
+
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
+ }
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
+ const struct ovs_key_ct_tuple_ipv6 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
+
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
+ sizeof(match->key->ipv6.ct_orig.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
+ sizeof(match->key->ipv6.ct_orig.dst),
+ is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
+ }
+
+ /* For layer 3 packets the Ethernet type is provided
+ * and treated as metadata but no MAC addresses are provided.
+ */
+ if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) &&
+ (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)))
+ mac_proto = MAC_PROTO_NONE;
+
+ /* Always exact match mac_proto */
+ SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask);
+
+ if (mac_proto == MAC_PROTO_NONE)
+ return parse_eth_type_from_nlattrs(match, attrs, a, is_mask,
+ log);
+
return 0;
}
@@ -1081,33 +1165,26 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
SW_FLOW_KEY_MEMCPY(match, eth.dst,
eth_key->eth_dst, ETH_ALEN, is_mask);
attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
- }
- if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- /* VLAN attribute is always parsed before getting here since it
- * may occur multiple times.
- */
- OVS_NLERR(log, "VLAN attribute unexpected.");
- return -EINVAL;
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- __be16 eth_type;
-
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (is_mask) {
- /* Always exact match EtherType. */
- eth_type = htons(0xffff);
- } else if (!eth_proto_is_802_3(eth_type)) {
- OVS_NLERR(log, "EtherType %x is less than min %x",
- ntohs(eth_type), ETH_P_802_3_MIN);
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ /* VLAN attribute is always parsed before getting here since it
+ * may occur multiple times.
+ */
+ OVS_NLERR(log, "VLAN attribute unexpected.");
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask,
+ log);
+ if (err)
+ return err;
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+ } else if (!match->key->eth.type) {
+ OVS_NLERR(log, "Either Ethernet header or EtherType is required.");
+ return -EINVAL;
}
if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
@@ -1462,9 +1539,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
/**
* ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
- * @key: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
+ * @net: Network namespace.
+ * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
+ * metadata.
+ * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
+ * attributes.
+ * @attrs: Bit mask for the netlink attributes included in @a.
* @log: Boolean to allow kernel error logging. Normally true, but when
* probing for feature compatibility this should be passed in as false to
* suppress unnecessary error logging.
@@ -1473,25 +1553,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
* take the same form accepted by flow_from_nlattrs(), but only enough of it to
* get the metadata, that is, the parts of the flow key that cannot be
* extracted from the packet itself.
+ *
+ * This must be called before the packet key fields are filled in 'key'.
*/
-int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
- struct sw_flow_key *key,
- bool log)
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log)
{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
struct sw_flow_match match;
- u64 attrs = 0;
- int err;
-
- err = parse_flow_nlattrs(attr, a, &attrs, log);
- if (err)
- return -EINVAL;
memset(&match, 0, sizeof(match));
match.key = key;
+ key->ct_state = 0;
+ key->ct_zone = 0;
+ key->ct_orig_proto = 0;
memset(&key->ct, 0, sizeof(key->ct));
+ memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
+ memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
+
key->phy.in_port = DP_MAX_PORTS;
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1553,45 +1634,47 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
goto nla_put_failure;
- if (ovs_ct_put_key(output, skb))
+ if (ovs_ct_put_key(swkey, output, skb))
goto nla_put_failure;
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
-
- eth_key = nla_data(nla);
- ether_addr_copy(eth_key->eth_src, output->eth.src);
- ether_addr_copy(eth_key->eth_dst, output->eth.dst);
-
- if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
- if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
+ if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
goto nla_put_failure;
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.vlan.tci)
- goto unencap;
- if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
- if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ eth_key = nla_data(nla);
+ ether_addr_copy(eth_key->eth_src, output->eth.src);
+ ether_addr_copy(eth_key->eth_dst, output->eth.dst);
+
+ if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
goto nla_put_failure;
- in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.cvlan.tci)
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.vlan.tci)
goto unencap;
+
+ if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ goto nla_put_failure;
+ in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.cvlan.tci)
+ goto unencap;
+ }
}
- }
- if (swkey->eth.type == htons(ETH_P_802_2)) {
- /*
- * Ethertype 802.2 is represented in the netlink with omitted
- * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
- * 0xffff in the mask attribute. Ethertype can also
- * be wildcarded.
- */
- if (is_mask && output->eth.type)
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
- output->eth.type))
- goto nla_put_failure;
- goto unencap;
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
}
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
@@ -2126,8 +2209,8 @@ static bool validate_masked(u8 *data, int len)
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key,
- struct sw_flow_actions **sfa,
- bool *skip_copy, __be16 eth_type, bool masked, bool log)
+ struct sw_flow_actions **sfa, bool *skip_copy,
+ u8 mac_proto, __be16 eth_type, bool masked, bool log)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
@@ -2157,7 +2240,11 @@ static int validate_set(const struct nlattr *a,
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ break;
+
case OVS_KEY_ATTR_ETHERNET:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
break;
case OVS_KEY_ATTR_TUNNEL:
@@ -2324,6 +2411,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
int depth, struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
{
+ u8 mac_proto = ovs_key_mac_proto(key);
const struct nlattr *a;
int rem, err;
@@ -2346,6 +2434,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
[OVS_ACTION_ATTR_CT] = (u32)-1,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
+ [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
+ [OVS_ACTION_ATTR_POP_ETH] = 0,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2394,10 +2484,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
}
case OVS_ACTION_ATTR_POP_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan_tci = htons(0);
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan = nla_data(a);
if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
@@ -2447,14 +2541,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, false, log);
+ &skip_copy, mac_proto, eth_type,
+ false, log);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SET_MASKED:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, true, log);
+ &skip_copy, mac_proto, eth_type,
+ true, log);
if (err)
return err;
break;
@@ -2474,6 +2570,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
skip_copy = true;
break;
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ /* Disallow pushing an Ethernet header if one
+ * is already present */
+ if (mac_proto != MAC_PROTO_NONE)
+ return -EINVAL;
+ mac_proto = MAC_PROTO_NONE;
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
+ if (vlan_tci & htons(VLAN_TAG_PRESENT))
+ return -EINVAL;
+ mac_proto = MAC_PROTO_ETHERNET;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
- struct sw_flow_key *, bool log);
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log);
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log);
int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index e7da29021b38..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -89,15 +89,6 @@ static const struct ethtool_ops internal_dev_ethtool_ops = {
.get_link = ethtool_op_get_link,
};
-static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
-{
- if (new_mtu < 68)
- return -EINVAL;
-
- netdev->mtu = new_mtu;
- return 0;
-}
-
static void internal_dev_destructor(struct net_device *dev)
{
struct vport *vport = ovs_internal_dev_get_vport(dev);
@@ -106,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
free_netdev(dev);
}
-static struct rtnl_link_stats64 *
+static void
internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
int i;
@@ -134,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_bytes += local_stats.tx_bytes;
stats->tx_packets += local_stats.tx_packets;
}
-
- return stats;
}
static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -148,7 +137,6 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
- .ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
.ndo_set_rx_headroom = internal_set_rx_headroom,
};
@@ -161,6 +149,8 @@ static void do_setup(struct net_device *netdev)
{
ether_setup(netdev);
+ netdev->max_mtu = ETH_MAX_MTU;
+
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 4e3972344aa6..0389398fa4ab 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -57,8 +57,10 @@ static void netdev_port_receive(struct sk_buff *skb)
if (unlikely(!skb))
return;
- skb_push(skb, ETH_HLEN);
- skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ if (skb->dev->type == ARPHRD_ETHER) {
+ skb_push(skb, ETH_HLEN);
+ skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ }
ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
error:
@@ -97,7 +99,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
}
if (vport->dev->flags & IFF_LOOPBACK ||
- vport->dev->type != ARPHRD_ETHER ||
+ (vport->dev->type != ARPHRD_ETHER &&
+ vport->dev->type != ARPHRD_NONE) ||
ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
@@ -162,7 +165,6 @@ void ovs_netdev_detach_dev(struct vport *vport)
netdev_master_upper_dev_get(vport->dev));
dev_set_promiscuity(vport->dev, -1);
}
-EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 7387418ac514..b6c8524032a0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -463,27 +463,11 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
ovs_dp_process_packet(skb, &key);
return 0;
}
-EXPORT_SYMBOL_GPL(ovs_vport_receive);
-static void free_vport_rcu(struct rcu_head *rcu)
+static unsigned int packet_length(const struct sk_buff *skb,
+ struct net_device *dev)
{
- struct vport *vport = container_of(rcu, struct vport, rcu);
-
- ovs_vport_free(vport);
-}
-
-void ovs_vport_deferred_free(struct vport *vport)
-{
- if (!vport)
- return;
-
- call_rcu(&vport->rcu, free_vport_rcu);
-}
-EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-
-static unsigned int packet_length(const struct sk_buff *skb)
-{
- unsigned int length = skb->len - ETH_HLEN;
+ unsigned int length = skb->len - dev->hard_header_len;
if (!skb_vlan_tag_present(skb) &&
eth_type_vlan(skb->protocol))
@@ -497,14 +481,34 @@ static unsigned int packet_length(const struct sk_buff *skb)
return length;
}
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
{
int mtu = vport->dev->mtu;
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ switch (vport->dev->type) {
+ case ARPHRD_NONE:
+ if (mac_proto == MAC_PROTO_ETHERNET) {
+ skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
+ skb->protocol = htons(ETH_P_TEB);
+ } else if (mac_proto != MAC_PROTO_NONE) {
+ WARN_ON_ONCE(1);
+ goto drop;
+ }
+ break;
+ case ARPHRD_ETHER:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ goto drop;
+ break;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(packet_length(skb, vport->dev) > mtu &&
+ !skb_is_gso(skb))) {
net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
vport->dev->name,
- packet_length(skb), mtu);
+ packet_length(skb, vport->dev), mtu);
vport->dev->stats.tx_errors++;
goto drop;
}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index f01f28a567ad..cda66c26ad08 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -149,7 +149,6 @@ struct vport_ops {
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
-void ovs_vport_deferred_free(struct vport *vport);
#define VPORT_ALIGN 8
@@ -198,6 +197,6 @@ int __ovs_vport_ops_register(struct vport_ops *ops);
})
void ovs_vport_ops_unregister(struct vport_ops *ops);
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto);
#endif /* vport.h */