diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 1 | ||||
-rw-r--r-- | net/core/dev.c | 189 | ||||
-rw-r--r-- | net/core/dst.c | 120 | ||||
-rw-r--r-- | net/core/ethtool.c | 2 | ||||
-rw-r--r-- | net/core/fib_rules.c | 48 | ||||
-rw-r--r-- | net/core/filter.c | 299 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 284 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 249 | ||||
-rw-r--r-- | net/core/neighbour.c | 59 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 54 | ||||
-rw-r--r-- | net/core/net-traces.c | 1 | ||||
-rw-r--r-- | net/core/netpoll.c | 31 | ||||
-rw-r--r-- | net/core/pktgen.c | 5 | ||||
-rw-r--r-- | net/core/ptp_classifier.c | 16 | ||||
-rw-r--r-- | net/core/request_sock.c | 88 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 112 | ||||
-rw-r--r-- | net/core/skbuff.c | 13 | ||||
-rw-r--r-- | net/core/sock.c | 89 | ||||
-rw-r--r-- | net/core/sock_diag.c | 17 | ||||
-rw-r--r-- | net/core/timestamping.c | 6 | ||||
-rw-r--r-- | net/core/tso.c | 18 | ||||
-rw-r--r-- | net/core/utils.c | 58 |
22 files changed, 1364 insertions, 395 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index fec0856dd6c0..086b01fbe1bd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/dev.c b/net/core/dev.c index a8e4dd430285..8ce3f74cd6b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -99,6 +99,7 @@ #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dst.h> +#include <net/dst_metadata.h> #include <net/pkt_sched.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -682,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev) EXPORT_SYMBOL(dev_get_iflink); /** + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. + * + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. + */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info; + + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; + + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; + + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + +/** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find @@ -2915,9 +2942,11 @@ EXPORT_SYMBOL(xmit_recursion); /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); @@ -2972,6 +3001,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) new_index = skb_tx_hash(dev, skb); if (queue_index != new_index && sk && + sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -3061,6 +3091,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -3133,11 +3173,11 @@ out: return rc; } -int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb) +int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } -EXPORT_SYMBOL(dev_queue_xmit_sk); +EXPORT_SYMBOL(dev_queue_xmit); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) { @@ -3645,19 +3685,27 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - qdisc_bstats_update_cpu(cl->q, skb); + qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res)) { + switch (tc_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_drop_cpu(cl->q); + qdisc_qstats_cpu_drop(cl->q); case TC_ACT_STOLEN: case TC_ACT_QUEUED: kfree_skb(skb); return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + skb_do_redirect(skb); + return NULL; default: break; } @@ -3972,13 +4020,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb) * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ -int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) +int netif_receive_skb(struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); } -EXPORT_SYMBOL(netif_receive_skb_sk); +EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending * Called with irqs disabled. @@ -4703,6 +4751,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); @@ -4845,8 +4895,7 @@ struct netdev_adjacent { struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, - struct net_device *adj_dev, +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; @@ -4872,7 +4921,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); + return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4985,7 +5034,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) @@ -5040,7 +5089,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { @@ -5134,7 +5183,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, struct netdev_adjacent *adj; int ret; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr++; @@ -5190,7 +5239,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, { struct netdev_adjacent *adj; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("tried to remove device %s from %s\n", @@ -5301,6 +5350,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *private) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -5310,15 +5360,25 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) + if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; + + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5357,7 +5417,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; rollback_lower_mesh: @@ -5452,9 +5513,17 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j; ASSERT_RTNL(); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower @@ -5474,7 +5543,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5580,7 +5650,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev, if (!lower_dev) return NULL; - lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; @@ -6075,6 +6145,26 @@ int dev_get_phys_port_name(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_name); /** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -6198,6 +6288,48 @@ static void rollback_registered(struct net_device *dev) list_del(&single); } +static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(&upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(upper->wanted_features & feature) + && (features & feature)) { + netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", + &feature, upper->name); + features &= ~feature; + } + } + + return features; +} + +static void netdev_sync_lower_features(struct net_device *upper, + struct net_device *lower, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(&upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(features & feature) && (lower->features & feature)) { + netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", + &feature, lower->name); + lower->wanted_features &= ~feature; + netdev_update_features(lower); + + if (unlikely(lower->features & feature)) + netdev_WARN(upper, "failed to disable %pNF on %s!\n", + &feature, lower->name); + } + } +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -6267,7 +6399,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, int __netdev_update_features(struct net_device *dev) { + struct net_device *upper, *lower; netdev_features_t features; + struct list_head *iter; int err = 0; ASSERT_RTNL(); @@ -6280,6 +6414,10 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); + /* some features can't be enabled if they're off an an upper device */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) + features = netdev_sync_upper_features(dev, upper, features); + if (dev->features == features) return 0; @@ -6296,6 +6434,12 @@ int __netdev_update_features(struct net_device *dev) return -1; } + /* some features must be disabled on lower devices when disabled + * on an upper device (think: bonding master or bridge) + */ + netdev_for_each_lower_dev(dev, lower, iter) + netdev_sync_lower_features(dev, lower, features); + if (!err) dev->features = features; @@ -6967,6 +7111,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); + if (!dev->tx_queue_len) + dev->priv_flags |= IFF_NO_QUEUE; + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) @@ -7639,7 +7786,7 @@ static int __init net_dev_init(void) open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index 002144bea935..2a1818065e12 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -20,8 +20,10 @@ #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> +#include <net/lwtunnel.h> #include <net/dst.h> +#include <net/dst_metadata.h> /* * Theory of operations: @@ -142,12 +144,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard_sk(struct sock *sk, struct sk_buff *skb) +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard_sk); +EXPORT_SYMBOL(dst_discard_out); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -158,19 +160,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = { [RTAX_MAX] = 0xdeadbeef, }; - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; dst->child = NULL; dst->dev = dev; if (dev) @@ -184,7 +177,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -192,6 +185,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif + dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; @@ -200,6 +194,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + return dst; } EXPORT_SYMBOL(dst_alloc); @@ -211,7 +224,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -248,7 +261,13 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); + + lwtstate_put(dst->lwtstate); + + if (dst->flags & DST_METADATA) + kfree(dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) { @@ -329,6 +348,69 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +static struct dst_ops md_dst_ops = { + .family = AF_UNSPEC, +}; + +static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call output on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static int dst_md_discard(struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call input on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +{ + struct dst_entry *dst; + + dst = &md_dst->dst; + dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + + dst->input = dst_md_discard; + dst->output = dst_md_discard_out; + + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -348,7 +430,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); @@ -393,7 +475,7 @@ static struct notifier_block dst_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; -void __init dst_init(void) +void __init dst_subsys_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b495ab1797fa..29edf74846fc 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1284,7 +1284,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) gstrings.len = ret; - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); if (!data) return -ENOMEM; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a12668f7d62..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> +#include <net/ip_tunnels.h> int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -43,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -59,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -186,6 +186,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -295,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -330,6 +333,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -343,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) @@ -407,6 +410,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +479,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +497,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +548,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +605,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { @@ -615,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -632,7 +650,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -648,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); diff --git a/net/core/filter.c b/net/core/filter.c index be3098fb65e4..672eefbfbe99 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,16 +47,19 @@ #include <linux/if_vlan.h> #include <linux/bpf.h> #include <net/sch_generic.h> +#include <net/cls_cgroup.h> +#include <net/dst_metadata.h> +#include <net/dst.h> /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * - * Run the filter code and then cut skb->data to correct size returned by - * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to SK_RUN_FILTER. It returns 0 if the packet should + * wrapper to BPF_PROG_RUN. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -80,7 +83,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } @@ -146,12 +149,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } -/* note that this only generates 32-bit random numbers */ -static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) -{ - return prandom_u32(); -} - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -310,7 +307,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, *insn = BPF_EMIT_CALL(__get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: - *insn = BPF_EMIT_CALL(__get_random_u32); + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); break; } break; @@ -476,9 +474,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ @@ -999,7 +997,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, int err; fp->bpf_func = NULL; - fp->jited = false; + fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { @@ -1081,16 +1079,18 @@ EXPORT_SYMBOL_GPL(bpf_prog_create); * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, - bpf_aux_classic_check_t trans) + bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; + int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) @@ -1106,12 +1106,16 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, } fp->len = fprog->len; - /* Since unattached filters are not copied back to user - * space through sk_get_filter(), we do not need to hold - * a copy here, and can spare us the work. - */ fp->orig_prog = NULL; + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ @@ -1122,6 +1126,7 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, *pfp = fp; return 0; } +EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { @@ -1346,7 +1351,7 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); int offset = (int) r2; __sum16 sum, *ptr; @@ -1401,9 +1406,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - if (unlikely(!(dev->flags & IFF_UP))) - return -EINVAL; - skb2 = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb2)) return -ENOMEM; @@ -1412,6 +1414,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) return dev_forward_skb(dev, skb2); skb2->dev = dev; + skb_sender_cpu_clear(skb2); return dev_queue_xmit(skb2); } @@ -1424,6 +1427,201 @@ const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; +struct redirect_info { + u32 ifindex; + u32 flags; +}; + +static DEFINE_PER_CPU(struct redirect_info, redirect_info); +static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + ri->ifindex = ifindex; + ri->flags = flags; + return TC_ACT_REDIRECT; +} + +int skb_do_redirect(struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + kfree_skb(skb); + return -EINVAL; + } + + if (BPF_IS_REDIRECT_INGRESS(ri->flags)) + return dev_forward_skb(dev, skb); + + skb->dev = dev; + skb_sender_cpu_clear(skb); + return dev_queue_xmit(skb); +} + +const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return task_get_classid((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + const struct dst_entry *dst; + + dst = skb_dst((struct sk_buff *) (unsigned long) r1); + if (dst) + return dst->tclassid; +#endif + return 0; +} + +static const struct bpf_func_proto bpf_get_route_realm_proto = { + .func = bpf_get_route_realm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + +static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) + return -EINVAL; + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + + return 0; +} + +const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct metadata_dst __percpu *md_dst; + +static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; + struct metadata_dst *md = this_cpu_ptr(md_dst); + struct ip_tunnel_info *info; + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + + return 0; +} + +const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +{ + if (!md_dst) { + /* race is not possible, since it's called from + * verifier that is holding verifier mutex + */ + md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + if (!md_dst) + return NULL; + } + return &bpf_skb_set_tunnel_key_proto; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1443,7 +1641,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); default: return NULL; } @@ -1461,6 +1660,20 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_key_proto(); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; default: return sk_filter_func_proto(func_id); } @@ -1486,6 +1699,9 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { + if (off == offsetof(struct __sk_buff, tc_classid)) + return false; + if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... @@ -1502,10 +1718,14 @@ static bool sk_filter_is_valid_access(int off, int size, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type) { + if (off == offsetof(struct __sk_buff, tc_classid)) + return type == BPF_WRITE ? true : false; + if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... offsetof(struct __sk_buff, cb[4]): break; @@ -1518,7 +1738,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1547,8 +1768,12 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, priority): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, priority)); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): @@ -1569,6 +1794,13 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct net_device, ifindex)); break; + case offsetof(struct __sk_buff, hash): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, hash)); + break; + case offsetof(struct __sk_buff, mark): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); @@ -1598,6 +1830,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + prog->cb_access = 1; ctx_off -= offsetof(struct __sk_buff, cb[0]); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, data); @@ -1607,6 +1840,14 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); break; + case offsetof(struct __sk_buff, tc_classid): + ctx_off -= offsetof(struct __sk_buff, tc_classid); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + WARN_ON(type != BPF_WRITE); + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + break; + case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); @@ -1701,9 +1942,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; /* We're copying the filter that has been originally attached, - * so no conversion/decode needed anymore. + * so no conversion/decode needed anymore. eBPF programs that + * have no original program cannot be dumped through this. */ + ret = -EACCES; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; ret = fprog->len; if (!len) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2a834c6179b9..d79699c9d1b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,14 +19,14 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } -static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } @@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); - BUG_ON(skb_flow_dissector_uses_key(flow_dissector, - key->key_id)); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); - skb_flow_dissector_set_key(flow_dissector, key->key_id); + dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_CONTROL)); - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); @@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen) + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -130,6 +131,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_keyid *key_keyid; u8 ip_proto = 0; + bool ret = false; if (!data) { data = skb->data; @@ -152,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; @@ -171,15 +173,13 @@ again: ip: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) - return false; + goto out_bad; nhoff += iph->ihl * 4; ip_proto = iph->protocol; - if (ip_is_fragment(iph)) - ip_proto = 0; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; key_addrs = skb_flow_dissector_target(flow_dissector, @@ -187,6 +187,22 @@ ip: memcpy(&key_addrs->v4addrs, &iph->saddr, sizeof(key_addrs->v4addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + goto out_good; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + goto out_good; + } + } + + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_IPV6): { @@ -197,13 +213,13 @@ ip: ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) - return false; + goto out_bad; ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, @@ -216,15 +232,20 @@ ipv6: flow_label = ip6_flowlabel(iph); if (flow_label) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + goto out_good; } + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_8021AD): @@ -234,10 +255,10 @@ ipv6: vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) - return false; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_VLANID, target_container); @@ -256,7 +277,7 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { @@ -265,7 +286,7 @@ ipv6: case htons(PPP_IPV6): goto ipv6; default: - return false; + goto out_bad; } } case htons(ETH_P_TIPC): { @@ -275,19 +296,17 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; - key_basic->n_proto = proto; - key_control->thoff = (u16)nhoff; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); key_addrs->tipcaddrs.srcnode = hdr->srcnode; key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } - return true; + goto out_good; } case htons(ETH_P_MPLS_UC): @@ -297,12 +316,12 @@ mpls: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); @@ -310,21 +329,17 @@ mpls: htonl(MPLS_LS_LABEL_MASK); } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - - return true; + goto out_good; } - return true; + goto out_good; } case htons(ETH_P_FCOE): key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: - return false; + goto out_bad; } ip_proto_again: @@ -337,7 +352,7 @@ ip_proto_again: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; /* * Only look inside GRE if version zero and no * routing @@ -357,10 +372,10 @@ ip_proto_again: data, hlen, &_keyid); if (!keyid) - return false; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); @@ -378,10 +393,15 @@ ip_proto_again: sizeof(_eth), data, hlen, &_eth); if (!eth) - return false; + goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); } + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto again; } case NEXTHDR_HOP: @@ -395,18 +415,53 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) - return false; + goto out_bad; ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; goto ip_proto_again; } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) + goto out_bad; + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + ip_proto = fh->nexthdr; + goto ip_proto_again; + } + } + goto out_good; + } case IPPROTO_IPIP: proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ip; case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ipv6; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); @@ -415,12 +470,8 @@ ip_proto_again: break; } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); @@ -428,7 +479,15 @@ ip_proto_again: data, hlen); } - return true; +out_good: + ret = true; + +out_bad: + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return ret; } EXPORT_SYMBOL(__skb_flow_dissect); @@ -438,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) { return jhash2(words, length, keyval); } -static inline void *flow_keys_hash_start(struct flow_keys *flow) +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) { + const void *p = flow; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (void *)flow + FLOW_KEYS_HASH_OFFSET; + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); } -static inline size_t flow_keys_hash_length(struct flow_keys *flow) +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); @@ -539,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) __flow_hash_consistentify(keys); - hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + hash = __flow_hash_words(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -557,8 +619,8 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys)) - return 0; + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } @@ -602,17 +664,11 @@ EXPORT_SYMBOL(make_flow_keys_digest); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, hashrnd); - if (!hash) - return; - if (keys.ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; + __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), + flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); @@ -624,6 +680,51 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys.ports.src = fl6->fl6_sport; + keys.ports.dst = fl6->fl6_dport; + keys.keyid.keyid = fl6->fl6_gre_key; + keys.tags.flow_label = (__force u32)fl6->flowlabel; + keys.basic.ip_proto = fl6->flowi6_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi6); + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + keys.addrs.v4addrs.src = fl4->saddr; + keys.addrs.v4addrs.dst = fl4->daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys.ports.src = fl4->fl4_sport; + keys.ports.dst = fl4->fl4_dport; + keys.keyid.keyid = fl4->fl4_gre_key; + keys.basic.ip_proto = fl4->flowi4_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi4); + u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { @@ -683,12 +784,47 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect_flow_keys(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); + static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..299cfc24d888 --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,249 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/lwtunnel.h> +#include <linux/in.h> +#include <linux/init.h> +#include <linux/err.h> + +#include <net/lwtunnel.h> +#include <net/rtnetlink.h> +#include <net/ip6_fib.h> + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +static const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, family, cfg, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); + +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(net, sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_output); + +int lwtunnel_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) + ret = ops->input(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 84195dacb8b6..1aa8437ed6c4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) + entries >= tbl->gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; + } } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); @@ -1849,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; ndst.ndts_forced_gc_runs += st->forced_gc_runs; + ndst.ndts_table_fulls += st->table_fulls; } if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) @@ -2230,14 +2235,53 @@ static void neigh_update_notify(struct neighbour *neigh) __neigh_notify(neigh, RTM_NEWNEIGH, 0); } +static bool neigh_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && dev->ifindex != filter_idx) + return true; + + return false; +} + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + const struct nlmsghdr *nlh = cb->nlh; + struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; + int filter_master_idx = 0, filter_idx = 0; + unsigned int flags = NLM_F_MULTI; + int err; + + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); + if (!err) { + if (tb[NDA_IFINDEX]) + filter_idx = nla_get_u32(tb[NDA_IFINDEX]); + + if (tb[NDA_MASTER]) + filter_master_idx = nla_get_u32(tb[NDA_MASTER]); + + if (filter_idx || filter_master_idx) + flags |= NLM_F_DUMP_FILTERED; + } rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); @@ -2250,12 +2294,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; + if (neigh_ifindex_filtered(n->dev, filter_idx)) + continue; + if (neigh_master_filtered(n->dev, filter_master_idx)) + continue; if (idx < s_idx) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, - NLM_F_MULTI) < 0) { + flags) < 0) { rc = -1; goto out; } @@ -2717,12 +2765,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2739,7 +2787,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->periodic_gc_runs, st->forced_gc_runs, - st->unres_discards + st->unres_discards, + st->table_fulls ); return 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 18b34d771ed4..f88a62ab019d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -31,7 +31,6 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -404,6 +403,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -459,7 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -501,6 +513,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -675,7 +688,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rps_map *old_map, *map; cpumask_var_t mask; int err, cpu, i; - static DEFINE_SPINLOCK(rps_map_lock); + static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -708,18 +721,21 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, map = NULL; } - spin_lock(&rps_map_lock); + mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, - lockdep_is_held(&rps_map_lock)); + mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); - spin_unlock(&rps_map_lock); if (map) static_key_slow_inc(&rps_needed); - if (old_map) { - kfree_rcu(old_map, rcu); + if (old_map) static_key_slow_dec(&rps_needed); - } + + mutex_unlock(&rps_map_mutex); + + if (old_map) + kfree_rcu(old_map, rcu); + free_cpumask_var(mask); return len; } @@ -987,15 +1003,12 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + unsigned int i; + i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; @@ -1464,6 +1477,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index ba3c0120786c..adef015b2f41 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> +#include <trace/events/fib.h> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c126a878c47c..94acfc89ad97 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -140,36 +140,42 @@ static void queue_process(struct work_struct *work) * case. Further, we test the poll_owner to avoid recursion on UP * systems where the lock doesn't exist. */ -static int poll_one_napi(struct napi_struct *napi, int budget) +static void poll_one_napi(struct napi_struct *napi) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while * holding the napi->poll_lock. */ if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return budget; + return; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + return; - work = napi->poll(napi, budget); - WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); + /* We explicilty pass the polling call a budget of 0 to + * indicate that we are clearing the Tx path only. + */ + work = napi->poll(napi, 0); + WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); - - return budget - work; } -static void poll_napi(struct net_device *dev, int budget) +static void poll_napi(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(napi, budget); + poll_one_napi(napi); spin_unlock(&napi->poll_lock); } } @@ -179,7 +185,6 @@ static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); - int budget = 0; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity @@ -202,7 +207,7 @@ static void netpoll_poll_dev(struct net_device *dev) /* Process pending work on NIC */ ops->ndo_poll_controller(dev); - poll_napi(dev, budget); + poll_napi(dev); up(&ni->dev_lock); @@ -380,6 +385,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; + WARN_ON_ONCE(!irqs_disabled()); + udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1cbd209192ea..de8d5cc5eb24 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -273,7 +273,6 @@ struct pktgen_dev { /* runtime counters relating to clone_skb */ - __u64 allocated_skbs; __u32 clone_count; int last_ok; /* Was last skb sent? * Or a failed transmit of some sort? @@ -2279,7 +2278,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2788,6 +2787,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, } else { skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + skb_reserve(skb, LL_RESERVED_SPACE(dev)); return skb; } @@ -3397,7 +3397,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) return; } pkt_dev->last_pkt_size = pkt_dev->skb->len; - pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 4eab4a94a59d..703cf76aa7c2 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -58,7 +58,7 @@ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2 * ret a ; return PTP class * * ; PTP over UDP over IPv4 over 802.1Q over Ethernet @@ -73,7 +73,7 @@ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? * ldh [x + 26] ; load payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 * ret a ; return PTP class * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE * @@ -86,7 +86,7 @@ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? * ldh [66] ; load payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 + * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * @@ -98,7 +98,7 @@ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [14] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x30 ; PTP_CLASS_L2 + * or #0x40 ; PTP_CLASS_L2 * ret a ; return PTP class * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE */ @@ -150,7 +150,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000070 }, + { 0x44, 0, 0, 0x000000c0 }, { 0x16, 0, 0, 0x00000000 }, { 0x15, 0, 12, 0x00000800 }, { 0x30, 0, 0, 0x0000001b }, @@ -162,7 +162,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x0000013f }, { 0x48, 0, 0, 0x0000001a }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000050 }, + { 0x44, 0, 0, 0x00000090 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 8, 0x000086dd }, @@ -172,7 +172,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x0000013f }, { 0x28, 0, 0, 0x00000042 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000060 }, + { 0x44, 0, 0, 0x000000a0 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, @@ -181,7 +181,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x00000000 }, { 0x28, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000030 }, + { 0x44, 0, 0, 0x00000040 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, }; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index b42f0e26f89e..5d26056b6d8f 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -37,90 +37,16 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt = NULL; + spin_lock_init(&queue->rskq_lock); - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); + spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; - if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - lopt = kzalloc(lopt_size, GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); - if (!lopt) - lopt = vzalloc(lopt_size); - if (!lopt) - return -ENOMEM; - - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - spin_lock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; - lopt->max_qlen_log = ilog2(nr_table_entries); - - spin_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - spin_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* This is an error recovery path only, no locking needed */ - kvfree(queue->listen_opt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - spin_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - spin_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* make all the listen_opt local to us */ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - - if (listen_sock_qlen(lopt) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_inc(&lopt->qlen_dec); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(listen_sock_qlen(lopt) != 0)) - pr_err("qlen %u\n", listen_sock_qlen(lopt)); - kvfree(lopt); } /* @@ -174,7 +100,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; - fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dc004b1e1f85..504bd17b7456 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -96,7 +96,7 @@ int rtnl_is_locked(void) EXPORT_SYMBOL(rtnl_is_locked); #ifdef CONFIG_PROVE_LOCKING -int lockdep_rtnl_is_held(void) +bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } @@ -497,7 +497,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_unregister); -static size_t rtnl_link_get_af_size(const struct net_device *dev) +static size_t rtnl_link_get_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; @@ -509,7 +510,7 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev) if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + - af_ops->get_link_af_size(dev); + af_ops->get_link_af_size(dev, ext_filter_mask); } } @@ -678,6 +679,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; @@ -831,7 +838,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, /* IFLA_VF_STATS_BROADCAST */ nla_total_size(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ - nla_total_size(sizeof(__u64))); + nla_total_size(sizeof(__u64)) + + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else return 0; @@ -894,9 +902,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(1); /* IFLA_PROTO_DOWN */ + } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1017,7 +1027,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1082,7 +1092,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->ifalias && nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes))) + atomic_read(&dev->carrier_changes)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; if (1) { @@ -1151,6 +1162,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_link_state vf_linkstate; struct ifla_vf_rss_query_en vf_rss_query_en; struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; /* * Not all SR-IOV capable drivers support the @@ -1160,6 +1172,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, */ ivi.spoofchk = -1; ivi.rss_query_en = -1; + ivi.trusted = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero @@ -1173,7 +1186,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = - vf_rss_query_en.vf = ivi.vf; + vf_rss_query_en.vf = + vf_trust.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; @@ -1184,6 +1198,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -1201,7 +1216,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), - &vf_rss_query_en)) + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) goto nla_put_failure; memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) @@ -1263,7 +1280,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (!(af = nla_nest_start(skb, af_ops->family))) goto nla_put_failure; - err = af_ops->fill_link_af(skb, dev); + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there @@ -1319,6 +1336,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1337,6 +1355,7 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, + [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, }; static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { @@ -1576,6 +1595,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } + if (tb[IFLA_VF_TRUST]) { + struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_trust) + err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); + if (err < 0) + return err; + } + return err; } @@ -1861,6 +1890,14 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; + if (tb[IFLA_PROTO_DOWN]) { + err = dev_change_proto_down(dev, + nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) @@ -1951,16 +1988,30 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; - LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1982,13 +2033,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev) return -ENODEV; - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return 0; + return rtnl_delete_link(dev); } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) @@ -3021,6 +3066,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -3041,20 +3087,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } @@ -3411,4 +3462,3 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); } - diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7b84330e5d30..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -392,7 +392,7 @@ EXPORT_SYMBOL(napi_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The @@ -461,7 +461,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); /** * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); diff --git a/net/core/sock.c b/net/core/sock.c index 193901d09757..7529eb9463be 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -422,13 +422,25 @@ static void sock_warn_obsolete_bsdism(const char *name) } } +static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} + #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; - if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } @@ -988,6 +1000,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1578,7 +1594,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + if (sock_needs_netstamp(sk) && + newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: @@ -1639,6 +1656,28 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + atomic_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + void skb_orphan_partial(struct sk_buff *skb) { /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, @@ -1852,6 +1891,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) @@ -2078,7 +2143,7 @@ suppress_allocation: EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ @@ -2353,6 +2418,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2479,7 +2545,8 @@ void sock_enable_timestamp(struct sock *sk, int flag) * time stamping, but time stamping might have been on * already because of the other one */ - if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } @@ -2740,10 +2807,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2760,7 +2825,7 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - 0, NULL); + prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -2828,10 +2893,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index d79866c5f8bc..0c1d58d43f67 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,3 +1,5 @@ +/* License: GPL */ + #include <linux/mutex.h> #include <linux/socket.h> #include <linux/skbuff.h> @@ -90,6 +92,9 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, goto out; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); @@ -320,14 +325,4 @@ static int __init sock_diag_init(void) BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } - -static void __exit sock_diag_exit(void) -{ - unregister_pernet_subsys(&diag_net_ops); - destroy_workqueue(broadcast_wq); -} - -module_init(sock_diag_init); -module_exit(sock_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); +device_initcall(sock_diag_init); diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 43d3dd62fcc8..42689d5c468c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; + __skb_push(skb, ETH_HLEN); - type = classify(skb); + type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); diff --git a/net/core/tso.c b/net/core/tso.c index 630b30b4fb53..5dca7ce8ee9f 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -1,4 +1,5 @@ #include <linux/export.h> +#include <linux/if_vlan.h> #include <net/ip.h> #include <net/tso.h> #include <asm/unaligned.h> @@ -14,18 +15,24 @@ EXPORT_SYMBOL(tso_count_descs); void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { - struct iphdr *iph; struct tcphdr *tcph; int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); int mac_hdr_len = skb_network_offset(skb); memcpy(hdr, skb->data, hdr_len); - iph = (struct iphdr *)(hdr + mac_hdr_len); - iph->id = htons(tso->ip_id); - iph->tot_len = htons(size + hdr_len - mac_hdr_len); + if (!tso->ipv6) { + struct iphdr *iph = (void *)(hdr + mac_hdr_len); + + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tso->ip_id++; + } else { + struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); + + iph->payload_len = htons(size + tcp_hdrlen(skb)); + } tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); put_unaligned_be32(tso->tcp_seq, &tcph->seq); - tso->ip_id++; if (!is_last) { /* Clear all special flags for not last packet */ @@ -61,6 +68,7 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) tso->ip_id = ntohs(ip_hdr(skb)->id); tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); tso->next_frag_idx = 0; + tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); /* Build first data */ tso->size = skb_headlen(skb) - hdr_len; diff --git a/net/core/utils.c b/net/core/utils.c index a7732a068043..3d17ca8b4744 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -301,7 +301,7 @@ out: EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) + __be32 from, __be32 to, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace4(sum, from, to); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(inet_proto_csum_replace4); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr) + bool pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], @@ -336,51 +336,15 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace16); -struct __net_random_once_work { - struct work_struct work; - struct static_key *key; -}; - -static void __net_random_once_deferred(struct work_struct *w) -{ - struct __net_random_once_work *work = - container_of(w, struct __net_random_once_work, work); - BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); - kfree(work); -} - -static void __net_random_once_disable_jump(struct static_key *key) +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr) { - struct __net_random_once_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return; - - INIT_WORK(&w->work, __net_random_once_deferred); - w->key = key; - schedule_work(&w->work); -} - -bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *once_key) -{ - static DEFINE_SPINLOCK(lock); - unsigned long flags; - - spin_lock_irqsave(&lock, flags); - if (*done) { - spin_unlock_irqrestore(&lock, flags); - return false; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_add(diff, ~skb->csum); + } else if (pseudohdr) { + *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } - - get_random_bytes(buf, nbytes); - *done = true; - spin_unlock_irqrestore(&lock, flags); - - __net_random_once_disable_jump(once_key); - - return true; } -EXPORT_SYMBOL(__net_get_random_once); +EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); |