From 9c7dafbfab1554705f85523fead578aa1a3d338c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Aug 2012 21:52:46 +0000 Subject: net: Allow to create links with given ifindex Currently the RTM_NEWLINK results in -EOPNOTSUPP if the ifinfomsg->ifi_index is not zero. I propose to allow requesting ifindices on link creation. This is required by the checkpoint-restore to correctly restore a net namespace (i.e. -- a container). Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f91abf800161..3ca300d85271 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5579,7 +5579,12 @@ int register_netdevice(struct net_device *dev) } } - dev->ifindex = dev_new_index(net); + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) + goto err_uninit; + if (dev->iflink == -1) dev->iflink = dev->ifindex; -- cgit v1.2.3 From aa79e66eee5d525e2fcbd2a5fcb87ae3dd4aa9e9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Aug 2012 21:53:19 +0000 Subject: net: Make ifindex generation per-net namespace Strictly speaking this is only _really_ required for checkpoint-restore to make loopback device always have the same index. This change appears to be safe wrt "ifindex should be unique per-system" concept, as all the ifindex usage is either already made per net namespace of is explicitly limited with init_net only. There are two cool side effects of this. The first one -- ifindices of devices in container are always small, regardless of how many containers we've started (and re-started) so far. The second one is -- we can speed up the loopback ifidex access as shown in the next patch. v2: Place ifindex right after dev_base_seq : avoid two holes and use the same cache line, dirtied in list_netdevice()/unlist_netdevice() Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 + net/core/dev.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ae1cd6c9ba52..6dc3db3466bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -66,6 +66,7 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; unsigned int dev_base_seq; /* protected by rtnl_mutex */ + int ifindex; /* core fib_rules */ struct list_head rules_ops; diff --git a/net/core/dev.c b/net/core/dev.c index 3ca300d85271..1f06df8d10a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5221,12 +5221,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ static int dev_new_index(struct net *net) { - static int ifindex; + int ifindex = net->ifindex; for (;;) { if (++ifindex <= 0) ifindex = 1; if (!__dev_get_by_index(net, ifindex)) - return ifindex; + return net->ifindex = ifindex; } } -- cgit v1.2.3 From ee89bab14e857678f83a71ee99e575b0fdbb58d4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 9 Aug 2012 22:14:56 +0000 Subject: net: move and rename netif_notify_peers() I believe net/core/dev.c is a better place for netif_notify_peers(), because other net event notify functions also stay in this file. And rename it to netdev_notify_peers(). Cc: David S. Miller Cc: Ian Campbell Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/xen-netfront.c | 2 +- include/linux/netdevice.h | 3 +-- net/core/dev.c | 18 ++++++++++++++++++ net/sched/sch_generic.c | 18 ------------------ 6 files changed, 22 insertions(+), 23 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8c5a1c43c81d..e91111a656f7 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -400,7 +400,7 @@ static void netvsc_send_garp(struct work_struct *w) ndev_ctx = container_of(w, struct net_device_context, dwork.work); net_device = hv_get_drvdata(ndev_ctx->device_ctx); net = net_device->ndev; - netif_notify_peers(net); + netdev_notify_peers(net); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 83d2b0c34c5e..81a64c58e8ad 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -993,7 +993,7 @@ static void virtnet_config_changed_work(struct work_struct *work) goto done; if (v & VIRTIO_NET_S_ANNOUNCE) { - netif_notify_peers(vi->dev); + netdev_notify_peers(vi->dev); virtnet_ack_link_announce(vi); } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 30899901aef5..39afd37e62b3 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1731,7 +1731,7 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - netif_notify_peers(netdev); + netdev_notify_peers(netdev); break; case XenbusStateClosing: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9db4f33407f..8d4b7316c734 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2246,8 +2246,6 @@ extern void netif_carrier_on(struct net_device *dev); extern void netif_carrier_off(struct net_device *dev); -extern void netif_notify_peers(struct net_device *dev); - /** * netif_dormant_on - mark device as dormant. * @dev: network device @@ -2596,6 +2594,7 @@ extern void __dev_set_rx_mode(struct net_device *dev); extern int dev_set_promiscuity(struct net_device *dev, int inc); extern int dev_set_allmulti(struct net_device *dev, int inc); extern void netdev_state_change(struct net_device *dev); +extern void netdev_notify_peers(struct net_device *dev); extern int netdev_bonding_change(struct net_device *dev, unsigned long event); extern void netdev_features_change(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 1f06df8d10a3..5defcf940005 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,6 +1106,24 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); +/** + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void netdev_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netdev_notify_peers); + int netdev_bonding_change(struct net_device *dev, unsigned long event) { return call_netdevice_notifiers(event, dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 511323e89cec..6c4d5fe53ce8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -324,24 +324,6 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); -/** - * netif_notify_peers - notify network peers about existence of @dev - * @dev: network device - * - * Generate traffic such that interested network peers are aware of - * @dev, such as by generating a gratuitous ARP. This may be used when - * a device wants to inform the rest of the network about some sort of - * reconfiguration such as a failover event or virtual machine - * migration. - */ -void netif_notify_peers(struct net_device *dev) -{ - rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - rtnl_unlock(); -} -EXPORT_SYMBOL(netif_notify_peers); - /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. -- cgit v1.2.3 From b7bc2a5b5bd99b216c3e5fe68c7f45c684ab5745 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 9 Aug 2012 22:14:57 +0000 Subject: net: remove netdev_bonding_change() I don't see any benifits to use netdev_bonding_change() than using call_netdevice_notifiers() directly. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 20 ++++++++++---------- include/linux/netdevice.h | 2 -- net/core/dev.c | 6 ------ 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6fae5f3ec7f6..d95fbc34b229 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1120,10 +1120,10 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) write_unlock_bh(&bond->curr_slave_lock); read_unlock(&bond->lock); - netdev_bonding_change(bond->dev, NETDEV_BONDING_FAILOVER); + call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); if (should_notify_peers) - netdev_bonding_change(bond->dev, - NETDEV_NOTIFY_PEERS); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, + bond->dev); read_lock(&bond->lock); write_lock_bh(&bond->curr_slave_lock); @@ -1560,8 +1560,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) bond_dev->name, bond_dev->type, slave_dev->type); - res = netdev_bonding_change(bond_dev, - NETDEV_PRE_TYPE_CHANGE); + res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, + bond_dev); res = notifier_to_errno(res); if (res) { pr_err("%s: refused to change device type\n", @@ -1581,8 +1581,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } - netdev_bonding_change(bond_dev, - NETDEV_POST_TYPE_CHANGE); + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, + bond_dev); } } else if (bond_dev->type != slave_dev->type) { pr_err("%s ether type (%d) is different from other slaves (%d), can not enslave it.\n", @@ -1943,7 +1943,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) } block_netpoll_tx(); - netdev_bonding_change(bond_dev, NETDEV_RELEASE); + call_netdevice_notifiers(NETDEV_RELEASE, bond_dev); write_lock_bh(&bond->lock); slave = bond_get_slave_by_dev(bond, slave_dev); @@ -2586,7 +2586,7 @@ re_arm: read_unlock(&bond->lock); return; } - netdev_bonding_change(bond->dev, NETDEV_NOTIFY_PEERS); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } @@ -3205,7 +3205,7 @@ re_arm: read_unlock(&bond->lock); return; } - netdev_bonding_change(bond->dev, NETDEV_NOTIFY_PEERS); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d4b7316c734..1d6ab69c1f3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2595,8 +2595,6 @@ extern int dev_set_promiscuity(struct net_device *dev, int inc); extern int dev_set_allmulti(struct net_device *dev, int inc); extern void netdev_state_change(struct net_device *dev); extern void netdev_notify_peers(struct net_device *dev); -extern int netdev_bonding_change(struct net_device *dev, - unsigned long event); extern void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ extern void dev_load(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index 5defcf940005..ce1bccb08de5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1124,12 +1124,6 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -int netdev_bonding_change(struct net_device *dev, unsigned long event) -{ - return call_netdevice_notifiers(event, dev); -} -EXPORT_SYMBOL(netdev_bonding_change); - /** * dev_load - load a network module * @net: the applicable net namespace -- cgit v1.2.3 From d04a48b06d63b6d6e9289ca8a5e6e84ebfe39bfd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 May 2012 17:01:57 -0600 Subject: userns: Convert __dev_set_promiscuity to use kuids in audit logs Cc: Klaus Heinrich Kiwi Cc: Eric Paris Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- net/core/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0cb3fe8d8e72..026bb4a37665 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4492,8 +4492,8 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; ASSERT_RTNL(); @@ -4525,7 +4525,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), audit_get_loginuid(current), - uid, gid, + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } -- cgit v1.2.3 From 0115e8e30d6fcdd4b8faa30d3ffd90859a591f51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Aug 2012 17:19:46 +0000 Subject: net: remove delay at device dismantle I noticed extra one second delay in device dismantle, tracked down to a call to dst_dev_event() while some call_rcu() are still in RCU queues. These call_rcu() were posted by rt_free(struct rtable *rt) calls. We then wait a little (but one second) in netdev_wait_allrefs() before kicking again NETDEV_UNREGISTER. As the call_rcu() are now completed, dst_dev_event() can do the needed device swap on busy dst. To solve this problem, add a new NETDEV_UNREGISTER_FINAL, called after a rcu_barrier(), but outside of RTNL lock. Use NETDEV_UNREGISTER_FINAL with care ! Change dst_dev_event() handler to react to NETDEV_UNREGISTER_FINAL Also remove NETDEV_UNREGISTER_BATCH, as its not used anymore after IP cache removal. With help from Gao feng Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Cc: "Eric W. Biederman" Cc: Gao feng Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 22 ++++++++-------------- net/core/dst.c | 2 +- net/core/fib_rules.c | 3 ++- net/core/rtnetlink.c | 2 +- net/ipv4/devinet.c | 6 +++++- net/ipv4/fib_frontend.c | 8 ++++---- net/ipv6/addrconf.c | 6 +++++- 8 files changed, 27 insertions(+), 24 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4936f09a9333..9ad7fa8c10e0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1553,7 +1553,7 @@ struct packet_type { #define NETDEV_PRE_TYPE_CHANGE 0x000E #define NETDEV_POST_TYPE_CHANGE 0x000F #define NETDEV_POST_INIT 0x0010 -#define NETDEV_UNREGISTER_BATCH 0x0011 +#define NETDEV_UNREGISTER_FINAL 0x0011 #define NETDEV_RELEASE 0x0012 #define NETDEV_NOTIFY_PEERS 0x0013 #define NETDEV_JOIN 0x0014 diff --git a/net/core/dev.c b/net/core/dev.c index 088923fe4066..0640d2a859c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1406,7 +1406,6 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } @@ -1448,7 +1447,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb) nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } unlock: @@ -1468,7 +1466,8 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); + if (val != NETDEV_UNREGISTER_FINAL) + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -5331,10 +5330,6 @@ static void rollback_registered_many(struct list_head *head) netdev_unregister_kobject(dev); } - /* Process any work delayed until the end of the batch */ - dev = list_first_entry(head, struct net_device, unreg_list); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - synchronize_net(); list_for_each_entry(dev, head, unreg_list) @@ -5787,9 +5782,8 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users - * should have already handle it the first time */ - + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -5851,9 +5845,8 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before attempting to drain - * the device list. This usually avoids a 250ms wait. - */ + + /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); @@ -5862,6 +5855,8 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -6256,7 +6251,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* diff --git a/net/core/dst.c b/net/core/dst.c index 56d63612e1e4..f6593d238e9a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -374,7 +374,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, struct dst_entry *dst, *last = NULL; switch (event) { - case NETDEV_UNREGISTER: + case NETDEV_UNREGISTER_FINAL: case NETDEV_DOWN: mutex_lock(&dst_gc_mutex); for (dst = dst_busy_list; dst; dst = dst->next) { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ab7db83236c9..585093755c23 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -711,15 +711,16 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net *net = dev_net(dev); struct fib_rules_ops *ops; - ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: + ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: + ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34d975b0f277..c64efcff8078 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2358,7 +2358,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_PRE_TYPE_CHANGE: case NETDEV_GOING_DOWN: case NETDEV_UNREGISTER: - case NETDEV_UNREGISTER_BATCH: + case NETDEV_UNREGISTER_FINAL: case NETDEV_RELEASE: case NETDEV_JOIN: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index adf273f8ad2e..6a5e6e4b142c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1147,8 +1147,12 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev; + if (event == NETDEV_UNREGISTER_FINAL) + goto out; + + in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7f073a38c87d..fd7d9ae64f16 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1041,7 +1041,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev; struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { @@ -1050,9 +1050,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo return NOTIFY_DONE; } - if (!in_dev) + if (event == NETDEV_UNREGISTER_FINAL) return NOTIFY_DONE; + in_dev = __in_dev_get_rtnl(dev); + switch (event) { case NETDEV_UP: for_ifa(in_dev) { @@ -1071,8 +1073,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo case NETDEV_CHANGE: rt_cache_flush(dev_net(dev), 0); break; - case NETDEV_UNREGISTER_BATCH: - break; } return NOTIFY_DONE; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6bc85f7c31e3..e581009cb09e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2566,10 +2566,14 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, void *data) { struct net_device *dev = (struct net_device *) data; - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev; int run_pending = 0; int err; + if (event == NETDEV_UNREGISTER_FINAL) + return NOTIFY_DONE; + + idev = __in6_dev_get(dev); switch (event) { case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { -- cgit v1.2.3 From 748e2d9396a18c3fd3d07d47c1b41320acf1fbf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Aug 2012 21:50:59 +0000 Subject: net: reinstate rtnl in call_netdevice_notifiers() Eric Biederman pointed out that not holding RTNL while calling call_netdevice_notifiers() was racy. This patch is a direct transcription his feedback against commit 0115e8e30d6fc (net: remove delay at device dismantle) Thanks Eric ! Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Cc: "Eric W. Biederman" Cc: Gao feng Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++-- net/core/fib_rules.c | 3 +-- net/ipv4/devinet.c | 6 +----- net/ipv4/fib_frontend.c | 7 ++----- net/ipv6/addrconf.c | 6 +----- 5 files changed, 12 insertions(+), 19 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0640d2a859c6..bc857fead8c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1466,8 +1466,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - if (val != NETDEV_UNREGISTER_FINAL) - ASSERT_RTNL(); + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -5782,7 +5781,11 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + __rtnl_unlock(); rcu_barrier(); + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5855,7 +5858,9 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + rtnl_lock(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + __rtnl_unlock(); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 585093755c23..ab7db83236c9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -711,16 +711,15 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net *net = dev_net(dev); struct fib_rules_ops *ops; + ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6a5e6e4b142c..adf273f8ad2e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1147,12 +1147,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct in_device *in_dev; - - if (event == NETDEV_UNREGISTER_FINAL) - goto out; + struct in_device *in_dev = __in_dev_get_rtnl(dev); - in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index fd7d9ae64f16..acdee325d972 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,9 +1050,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo return NOTIFY_DONE; } - if (event == NETDEV_UNREGISTER_FINAL) - return NOTIFY_DONE; - in_dev = __in_dev_get_rtnl(dev); switch (event) { @@ -1064,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(net, -1); break; case NETDEV_DOWN: fib_disable_ip(dev, 0, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(net, 0); break; } return NOTIFY_DONE; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e581009cb09e..6bc85f7c31e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2566,14 +2566,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, void *data) { struct net_device *dev = (struct net_device *) data; - struct inet6_dev *idev; + struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; - if (event == NETDEV_UNREGISTER_FINAL) - return NOTIFY_DONE; - - idev = __in6_dev_get(dev); switch (event) { case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { -- cgit v1.2.3 From 8f4cccbbd92f2ad0ddbbc498ef7cee2a1c3defe9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 20 Aug 2012 22:16:51 +0100 Subject: net: Set device operstate at registration time The operstate of a device is initially IF_OPER_UNKNOWN and is updated asynchronously by linkwatch after each change of carrier state reported by the driver. The default carrier state of a net device is on, and this will never be changed on drivers that do not support carrier detection, thus the operstate remains IF_OPER_UNKNOWN. For devices that do support carrier detection, the driver must set the carrier state to off initially, then poll the hardware state when the device is opened. However, we must not activate linkwatch for a unregistered device, and commit b473001 ('net: Do not fire linkwatch events until the device is registered.') ensured that we don't. But this means that the operstate for many devices that support carrier detection remains IF_OPER_UNKNOWN when it should be IF_OPER_DOWN. The same issue exists with the dormant state. The proper initialisation sequence, avoiding a race with opening of the device, is: rtnl_lock(); rc = register_netdevice(dev); if (rc) goto out_unlock; netif_carrier_off(dev); /* or netif_dormant_on(dev) */ rtnl_unlock(); but it seems silly that this should have to be repeated in so many drivers. Further, the operstate seen immediately after opening the device may still be IF_OPER_UNKNOWN due to the asynchronous nature of linkwatch. Commit 22604c8 ('net: Fix for initial link state in 2.6.28') attempted to fix this by setting the operstate synchronously, but it was reverted as it could lead to deadlock. This initialises the operstate synchronously at registration time only. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 2 ++ net/core/link_watch.c | 8 ++++++++ 3 files changed, 11 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ad7fa8c10e0..ccac82e61604 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2227,6 +2227,7 @@ static inline void dev_hold(struct net_device *dev) * kind of lower layer not just hardware media. */ +extern void linkwatch_init_dev(struct net_device *dev); extern void linkwatch_fire_event(struct net_device *dev); extern void linkwatch_forget_dev(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index bc857fead8c8..2f25d0cac51c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5648,6 +5648,8 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); + linkwatch_init_dev(dev); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c3519c6d1b16..a01922219a23 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -76,6 +76,14 @@ static void rfc2863_policy(struct net_device *dev) } +void linkwatch_init_dev(struct net_device *dev) +{ + /* Handle pre-registration link state changes */ + if (!netif_carrier_ok(dev) || netif_dormant(dev)) + rfc2863_policy(dev); +} + + static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) -- cgit v1.2.3 From 6549dd43c04327071571edf7aea4465b16539422 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 23 Aug 2012 15:36:55 +0000 Subject: net: dev: fix the incorrect hold of net namespace's lo device When moving a net device from one net namespace to another net namespace,dev_change_net_namespace calls NETDEV_DOWN event,so the original net namespace's dst entries which beloned to this net device will be put into dst_garbage list. then dev_change_net_namespace will set this net device's net to the new net namespace. If we unregister this net device's driver, this will trigger the NETDEV_UNREGISTER_FINAL event, dst_ifdown will be called, and get this net device's dst entries from dst_garbage list, put these entries' dev to the new net namespace's lo device. It's not what we want,actually we need these dst entries hold the original net namespace's lo device,this incorrect device holding will trigger emg message like below. unregister_netdevice: waiting for lo to become free. Usage count = 1 so we should call NETDEV_UNREGISTER_FINAL event in dev_change_net_namespace too,in order to make sure dst entries already in the dst_garbage list, we need rcu_barrier before we call NETDEV_UNREGISTER_FINAL event. With help form Eric Dumazet. Signed-off-by: Gao feng Cc: Eric Dumazet Cc: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3401e2dab7cc..a5fc3e301cf2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6259,6 +6259,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* -- cgit v1.2.3 From d1a53dfd114a6c53464de116cdab88d934bfe92f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 27 Aug 2012 23:39:24 +0000 Subject: net: fix documentation of skb_needs_linearize(). skb_needs_linearize() does not check highmem DMA as it does not call illegal_highdma() anymore, so there is no need to mention highmem DMA here. (Indeed, ~NETIF_F_SG flag, which is checked in skb_needs_linearize(), can be set when illegal_highdma() returns true, and we are assured that illegal_highdma() is invoked prior to skb_needs_linearize() as skb_needs_linearize() is a static method called only once. But ~NETIF_F_SG can be set not only there in this same invocation path. It can also be set when can_checksum_protocol() returns false). see commit 02932ce9e2c136e6fab2571c8e0dd69ae8ec9853, Convert skb_need_linearize() to use precomputed features. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a5fc3e301cf2..b1e6d6385516 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2184,9 +2184,7 @@ EXPORT_SYMBOL(netif_skb_features); /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG, or if - * at least one of fragments is in highmem and device does not - * support DMA from it. + * 2. skb is fragmented and the device does not support SG. */ static inline int skb_needs_linearize(struct sk_buff *skb, int features) -- cgit v1.2.3 From b004ff4972e2a42aa4512c90cc6a9e4dc1bb36b6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Sep 2012 20:12:19 -0700 Subject: netdev_printk/dynamic_netdev_dbg: Directly call printk_emit A lot of stack is used in recursive printks with %pV. Using multiple levels of %pV (a logging function with %pV that calls another logging function with %pV) can consume more stack than necessary. Avoid excessive stack use by not calling dev_printk from netdev_printk and dynamic_netdev_dbg. Duplicate the logic and form of dev_printk instead. Make __netdev_printk static. Remove EXPORT_SYMBOL(__netdev_printk) Whitespace and brace style neatening. Signed-off-by: Joe Perches Acked-by: David S. Miller Tested-by: Jim Cromie Acked-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 3 --- lib/dynamic_debug.c | 26 +++++++++++++++++++++++--- net/core/dev.c | 24 +++++++++++++++++------- 3 files changed, 40 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59dc05f38247..5f49cc0a107e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2720,9 +2720,6 @@ static inline const char *netdev_name(const struct net_device *dev) return dev->name; } -extern int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf); - extern __printf(3, 4) int netdev_printk(const char *level, const struct net_device *dev, const char *format, ...); diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 29ff2e4cfb75..2a29f4e04bdf 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -611,20 +611,40 @@ EXPORT_SYMBOL(__dynamic_dev_dbg); #ifdef CONFIG_NET int __dynamic_netdev_dbg(struct _ddebug *descriptor, - const struct net_device *dev, const char *fmt, ...) + const struct net_device *dev, const char *fmt, ...) { struct va_format vaf; va_list args; int res; - char buf[PREFIX_SIZE]; BUG_ON(!descriptor); BUG_ON(!fmt); va_start(args, fmt); + vaf.fmt = fmt; vaf.va = &args; - res = __netdev_printk(dynamic_emit_prefix(descriptor, buf), dev, &vaf); + + if (dev && dev->dev.parent) { + char buf[PREFIX_SIZE]; + char dict[128]; + size_t dictlen; + + dictlen = create_syslog_header(dev->dev.parent, + dict, sizeof(dict)); + + res = printk_emit(0, 7, dictlen ? dict : NULL, dictlen, + "%s%s %s: %s: %pV", + dynamic_emit_prefix(descriptor, buf), + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), &vaf); + } else if (dev) { + res = printk(KERN_DEBUG "%s: %pV", netdev_name(dev), &vaf); + } else { + res = printk(KERN_DEBUG "(NULL net_device): %pV", &vaf); + } + va_end(args); return res; diff --git a/net/core/dev.c b/net/core/dev.c index d7fe32c946c1..ac890f14613a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6423,22 +6423,30 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -int __netdev_printk(const char *level, const struct net_device *dev, +static int __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { int r; - if (dev && dev->dev.parent) - r = dev_printk(level, dev->dev.parent, "%s: %pV", - netdev_name(dev), vaf); - else if (dev) + if (dev && dev->dev.parent) { + char dict[128]; + size_t dictlen = create_syslog_header(dev->dev.parent, + dict, sizeof(dict)); + + r = printk_emit(0, level[1] - '0', + dictlen ? dict : NULL, dictlen, + "%s %s: %s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), vaf); + } else if (dev) { r = printk("%s%s: %pV", level, netdev_name(dev), vaf); - else + } else { r = printk("%s(NULL net_device): %pV", level, vaf); + } return r; } -EXPORT_SYMBOL(__netdev_printk); int netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) @@ -6453,6 +6461,7 @@ int netdev_printk(const char *level, const struct net_device *dev, vaf.va = &args; r = __netdev_printk(level, dev, &vaf); + va_end(args); return r; @@ -6472,6 +6481,7 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.va = &args; \ \ r = __netdev_printk(level, dev, &vaf); \ + \ va_end(args); \ \ return r; \ -- cgit v1.2.3 From c2c5a7051c556036b7beb8f4a89eefdc91c3245b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Sep 2012 20:13:01 -0700 Subject: netdev_printk/netif_printk: Remove a superfluous logging colon netdev_printk originally called dev_printk with %pV. This style emitted the complete dev_printk header with a colon followed by the netdev_name prefix followed by a colon. Now that netdev_printk does not call dev_printk, the extra colon is superfluous. Remove it. Example: old: sky2 0000:02:00.0: eth0: Link is up at 100 Mbps, full duplex, flow control both new: sky2 0000:02:00.0 eth0: Link is up at 100 Mbps, full duplex, flow control both Signed-off-by: Joe Perches Acked-by: David S. Miller Tested-by: Jim Cromie Acked-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- lib/dynamic_debug.c | 2 +- net/core/dev.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 2a29f4e04bdf..6b3ebabacfa8 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -634,7 +634,7 @@ int __dynamic_netdev_dbg(struct _ddebug *descriptor, dict, sizeof(dict)); res = printk_emit(0, 7, dictlen ? dict : NULL, dictlen, - "%s%s %s: %s: %pV", + "%s%s %s %s: %pV", dynamic_emit_prefix(descriptor, buf), dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), diff --git a/net/core/dev.c b/net/core/dev.c index ac890f14613a..cb9d43be07e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6435,7 +6435,7 @@ static int __netdev_printk(const char *level, const struct net_device *dev, r = printk_emit(0, level[1] - '0', dictlen ? dict : NULL, dictlen, - "%s %s: %s: %pV", + "%s %s %s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), vaf); -- cgit v1.2.3 From 666f355f3805d68b6ed5f7013806f1f65abfbf03 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Sep 2012 20:14:11 -0700 Subject: device and dynamic_debug: Use dev_vprintk_emit and dev_printk_emit Convert direct calls of vprintk_emit and printk_emit to the dev_ equivalents. Make create_syslog_header static. Signed-off-by: Joe Perches Acked-by: David S. Miller Tested-by: Jim Cromie Acked-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 14 +++++--------- include/linux/device.h | 2 -- lib/dynamic_debug.c | 31 +++++++++++-------------------- net/core/dev.c | 16 ++++++---------- 4 files changed, 22 insertions(+), 41 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/base/core.c b/drivers/base/core.c index dff94c8fb350..abea76c36a4b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1865,7 +1865,8 @@ void device_shutdown(void) */ #ifdef CONFIG_PRINTK -int create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) +static int +create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) { const char *subsys; size_t pos = 0; @@ -1943,17 +1944,12 @@ EXPORT_SYMBOL(dev_printk_emit); static int __dev_printk(const char *level, const struct device *dev, struct va_format *vaf) { - char hdr[128]; - size_t hdrlen; - if (!dev) return printk("%s(NULL device *): %pV", level, vaf); - hdrlen = create_syslog_header(dev, hdr, sizeof(hdr)); - - return printk_emit(0, level[1] - '0', hdrlen ? hdr : NULL, hdrlen, - "%s %s: %pV", - dev_driver_string(dev), dev_name(dev), vaf); + return dev_printk_emit(level[1] - '0', dev, + "%s %s: %pV", + dev_driver_string(dev), dev_name(dev), vaf); } int dev_printk(const char *level, const struct device *dev, diff --git a/include/linux/device.h b/include/linux/device.h index 0d9ba0c09791..6d37e59db571 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -895,8 +895,6 @@ extern const char *dev_driver_string(const struct device *dev); #ifdef CONFIG_PRINTK -extern int create_syslog_header(const struct device *dev, - char *hdr, size_t hdrlen); extern int dev_vprintk_emit(int level, const struct device *dev, const char *fmt, va_list args); extern __printf(3, 4) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 6b3ebabacfa8..e7f7d993357a 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -591,15 +591,11 @@ int __dynamic_dev_dbg(struct _ddebug *descriptor, res = printk(KERN_DEBUG "(NULL device *): %pV", &vaf); } else { char buf[PREFIX_SIZE]; - char dict[128]; - size_t dictlen; - dictlen = create_syslog_header(dev, dict, sizeof(dict)); - - res = printk_emit(0, 7, dictlen ? dict : NULL, dictlen, - "%s%s %s: %pV", - dynamic_emit_prefix(descriptor, buf), - dev_driver_string(dev), dev_name(dev), &vaf); + res = dev_printk_emit(7, dev, "%s%s %s: %pV", + dynamic_emit_prefix(descriptor, buf), + dev_driver_string(dev), dev_name(dev), + &vaf); } va_end(args); @@ -627,18 +623,13 @@ int __dynamic_netdev_dbg(struct _ddebug *descriptor, if (dev && dev->dev.parent) { char buf[PREFIX_SIZE]; - char dict[128]; - size_t dictlen; - - dictlen = create_syslog_header(dev->dev.parent, - dict, sizeof(dict)); - - res = printk_emit(0, 7, dictlen ? dict : NULL, dictlen, - "%s%s %s %s: %pV", - dynamic_emit_prefix(descriptor, buf), - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), &vaf); + + res = dev_printk_emit(7, dev->dev.parent, + "%s%s %s %s: %pV", + dynamic_emit_prefix(descriptor, buf), + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), &vaf); } else if (dev) { res = printk(KERN_DEBUG "%s: %pV", netdev_name(dev), &vaf); } else { diff --git a/net/core/dev.c b/net/core/dev.c index cb9d43be07e7..76c0fe66fdc0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6429,16 +6429,12 @@ static int __netdev_printk(const char *level, const struct net_device *dev, int r; if (dev && dev->dev.parent) { - char dict[128]; - size_t dictlen = create_syslog_header(dev->dev.parent, - dict, sizeof(dict)); - - r = printk_emit(0, level[1] - '0', - dictlen ? dict : NULL, dictlen, - "%s %s %s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), vaf); + r = dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), vaf); } else if (dev) { r = printk("%s%s: %pV", level, netdev_name(dev), vaf); } else { -- cgit v1.2.3 From e1760bd5ffae8cb98cffb030ee8e631eba28f3d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 22:39:43 -0700 Subject: userns: Convert the audit loginuid to be a kuid Always store audit loginuids in type kuid_t. Print loginuids by converting them into uids in the appropriate user namespace, and then printing the resulting uid. Modify audit_get_loginuid to return a kuid_t. Modify audit_set_loginuid to take a kuid_t. Modify /proc//loginuid on read to convert the loginuid into the user namespace of the opener of the file. Modify /proc//loginud on write to convert the loginuid rom the user namespace of the opener of the file. Cc: Al Viro Cc: Eric Paris Cc: Paul Moore ? Cc: David Miller Signed-off-by: Eric W. Biederman --- drivers/tty/tty_audit.c | 14 ++++++++------ fs/proc/base.c | 12 ++++++++++-- include/linux/audit.h | 6 +++--- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- include/linux/tty.h | 4 ++-- include/net/netlabel.h | 2 +- include/net/xfrm.h | 23 ++++++++++++----------- kernel/audit.c | 20 ++++++++++---------- kernel/audit_watch.c | 2 +- kernel/auditfilter.c | 7 ++++--- kernel/auditsc.c | 20 +++++++++++--------- net/core/dev.c | 2 +- net/netlabel/netlabel_unlabeled.c | 2 +- net/netlabel/netlabel_user.c | 2 +- net/xfrm/xfrm_policy.c | 8 ++++---- net/xfrm/xfrm_state.c | 6 +++--- net/xfrm/xfrm_user.c | 12 ++++++------ 18 files changed, 80 insertions(+), 66 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index 7c5866920622..5b59bd7f4227 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -61,7 +61,7 @@ static void tty_audit_buf_put(struct tty_audit_buf *buf) } static void tty_audit_log(const char *description, struct task_struct *tsk, - uid_t loginuid, unsigned sessionid, int major, + kuid_t loginuid, unsigned sessionid, int major, int minor, unsigned char *data, size_t size) { struct audit_buffer *ab; @@ -73,7 +73,9 @@ static void tty_audit_log(const char *description, struct task_struct *tsk, audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u " "major=%d minor=%d comm=", description, - tsk->pid, uid, loginuid, sessionid, + tsk->pid, uid, + from_kuid(&init_user_ns, loginuid), + sessionid, major, minor); get_task_comm(name, tsk); audit_log_untrustedstring(ab, name); @@ -89,7 +91,7 @@ static void tty_audit_log(const char *description, struct task_struct *tsk, * Generate an audit message from the contents of @buf, which is owned by * @tsk with @loginuid. @buf->mutex must be locked. */ -static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid, +static void tty_audit_buf_push(struct task_struct *tsk, kuid_t loginuid, unsigned int sessionid, struct tty_audit_buf *buf) { @@ -112,7 +114,7 @@ static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid, */ static void tty_audit_buf_push_current(struct tty_audit_buf *buf) { - uid_t auid = audit_get_loginuid(current); + kuid_t auid = audit_get_loginuid(current); unsigned int sessionid = audit_get_sessionid(current); tty_audit_buf_push(current, auid, sessionid, buf); } @@ -179,7 +181,7 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) } if (should_audit && audit_enabled) { - uid_t auid; + kuid_t auid; unsigned int sessionid; auid = audit_get_loginuid(current); @@ -199,7 +201,7 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) * reference to the tty audit buffer if available. * Flush the buffer or return an appropriate error code. */ -int tty_audit_push_task(struct task_struct *tsk, uid_t loginuid, u32 sessionid) +int tty_audit_push_task(struct task_struct *tsk, kuid_t loginuid, u32 sessionid) { struct tty_audit_buf *buf = ERR_PTR(-EPERM); unsigned long flags; diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb73..138cff4b05dd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1089,7 +1089,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - audit_get_loginuid(task)); + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1101,6 +1102,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, char *page, *tmp; ssize_t length; uid_t loginuid; + kuid_t kloginuid; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1130,7 +1132,13 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(loginuid); + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } + + length = audit_set_loginuid(kloginuid); if (likely(length == 0)) length = count; diff --git a/include/linux/audit.h b/include/linux/audit.h index ca019bb74da3..12367cbadfe1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -527,7 +527,7 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(uid_t loginuid); +extern int audit_set_loginuid(kuid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); @@ -639,7 +639,7 @@ extern int audit_signals; #define audit_core_dumps(i) do { ; } while (0) #define audit_seccomp(i,s,c) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) -#define audit_get_loginuid(t) (-1) +#define audit_get_loginuid(t) (INVALID_UID) #define audit_get_sessionid(t) (-1) #define audit_log_task_context(b) do { ; } while (0) #define audit_ipc_obj(i) ((void)0) @@ -705,7 +705,7 @@ extern int audit_update_lsm_rules(void); extern int audit_filter_user(void); extern int audit_filter_type(int type); extern int audit_receive_filter(int type, int pid, int seq, - void *data, size_t datasz, uid_t loginuid, + void *data, size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid); extern int audit_enabled; #else diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 89f1cb1056f0..6d087c5f57f7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -92,7 +92,7 @@ extern struct group_info init_groups; #ifdef CONFIG_AUDITSYSCALL #define INIT_IDS \ - .loginuid = -1, \ + .loginuid = INVALID_UID, \ .sessionid = -1, #else #define INIT_IDS diff --git a/include/linux/sched.h b/include/linux/sched.h index c147e7024f11..f64d092f2bed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1426,7 +1426,7 @@ struct task_struct { struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL - uid_t loginuid; + kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; diff --git a/include/linux/tty.h b/include/linux/tty.h index 9f47ab540f65..7298385815e6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -553,7 +553,7 @@ extern void tty_audit_fork(struct signal_struct *sig); extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern void tty_audit_push(struct tty_struct *tty); extern int tty_audit_push_task(struct task_struct *tsk, - uid_t loginuid, u32 sessionid); + kuid_t loginuid, u32 sessionid); #else static inline void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, size_t size) @@ -572,7 +572,7 @@ static inline void tty_audit_push(struct tty_struct *tty) { } static inline int tty_audit_push_task(struct task_struct *tsk, - uid_t loginuid, u32 sessionid) + kuid_t loginuid, u32 sessionid) { return 0; } diff --git a/include/net/netlabel.h b/include/net/netlabel.h index f67440970d7e..2c95d55f7914 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -110,7 +110,7 @@ struct cipso_v4_doi; /* NetLabel audit information */ struct netlbl_audit { u32 secid; - uid_t loginuid; + kuid_t loginuid; u32 sessionid; }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d9509eb29b80..1f217e2c5d82 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -662,7 +662,7 @@ struct xfrm_spi_skb_cb { /* Audit Information */ struct xfrm_audit { u32 secid; - uid_t loginuid; + kuid_t loginuid; u32 sessionid; }; @@ -681,13 +681,14 @@ static inline struct audit_buffer *xfrm_audit_start(const char *op) return audit_buf; } -static inline void xfrm_audit_helper_usrinfo(uid_t auid, u32 ses, u32 secid, +static inline void xfrm_audit_helper_usrinfo(kuid_t auid, u32 ses, u32 secid, struct audit_buffer *audit_buf) { char *secctx; u32 secctx_len; - audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses); + audit_log_format(audit_buf, " auid=%u ses=%u", + from_kuid(&init_user_ns, auid), ses); if (secid != 0 && security_secid_to_secctx(secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " subj=%s", secctx); @@ -697,13 +698,13 @@ static inline void xfrm_audit_helper_usrinfo(uid_t auid, u32 ses, u32 secid, } extern void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, - u32 auid, u32 ses, u32 secid); + kuid_t auid, u32 ses, u32 secid); extern void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, - u32 auid, u32 ses, u32 secid); + kuid_t auid, u32 ses, u32 secid); extern void xfrm_audit_state_add(struct xfrm_state *x, int result, - u32 auid, u32 ses, u32 secid); + kuid_t auid, u32 ses, u32 secid); extern void xfrm_audit_state_delete(struct xfrm_state *x, int result, - u32 auid, u32 ses, u32 secid); + kuid_t auid, u32 ses, u32 secid); extern void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); extern void xfrm_audit_state_replay(struct xfrm_state *x, @@ -716,22 +717,22 @@ extern void xfrm_audit_state_icvfail(struct xfrm_state *x, #else static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, - u32 auid, u32 ses, u32 secid) + kuid_t auid, u32 ses, u32 secid) { } static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, - u32 auid, u32 ses, u32 secid) + kuid_t auid, u32 ses, u32 secid) { } static inline void xfrm_audit_state_add(struct xfrm_state *x, int result, - u32 auid, u32 ses, u32 secid) + kuid_t auid, u32 ses, u32 secid) { } static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result, - u32 auid, u32 ses, u32 secid) + kuid_t auid, u32 ses, u32 secid) { } diff --git a/kernel/audit.c b/kernel/audit.c index 2e0dd5edf69b..44a4b13c9f00 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -265,7 +265,7 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, + kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -273,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); + old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -293,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, + int new, kuid_t loginuid, u32 sessionid, u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -320,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -349,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -607,7 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - uid_t auid, u32 ses, u32 sid) + kuid_t auid, u32 ses, u32 sid) { int rc = 0; char *ctx = NULL; @@ -622,7 +622,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), from_kuid(&init_user_ns, current_uid()), - auid, ses); + from_kuid(&init_user_ns, auid), ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -644,7 +644,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ + kuid_t loginuid; /* loginuid of sender */ u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3823281401b5..1c22ec3d87bc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); audit_log_string(ab, op); audit_log_format(ab, " path="); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b30320cea26f..c4bcdbaf4d4d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1109,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, +static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, char *action, struct audit_krule *rule, int res) { @@ -1121,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", + from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -1152,7 +1153,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) + size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0b5b8a232b55..26fdfc092e35 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -149,7 +149,7 @@ struct audit_aux_data_execve { struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; + kuid_t target_auid[AUDIT_AUX_PIDS]; uid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; u32 target_sid[AUDIT_AUX_PIDS]; @@ -214,7 +214,7 @@ struct audit_context { int arch; pid_t target_pid; - uid_t target_auid; + kuid_t target_auid; uid_t target_uid; unsigned int target_sessionid; u32 target_sid; @@ -1176,7 +1176,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk } static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, + kuid_t auid, uid_t uid, unsigned int sessionid, u32 sid, char *comm) { struct audit_buffer *ab; @@ -1188,7 +1188,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, if (!ab) return rc; - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, + audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, + from_kuid(&init_user_ns, auid), uid, sessionid); if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); @@ -1630,7 +1631,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->name_count, context->ppid, context->pid, - tsk->loginuid, + from_kuid(&init_user_ns, tsk->loginuid), context->uid, context->gid, context->euid, context->suid, context->fsuid, @@ -2291,14 +2292,14 @@ static atomic_t session_id = ATOMIC_INIT(0); * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; struct audit_context *context = task->audit_context; unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) + if (uid_valid(task->loginuid)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2315,7 +2316,8 @@ int audit_set_loginuid(uid_t loginuid) "old auid=%u new auid=%u" " old ses=%u new ses=%u", task->pid, task_uid(task), - task->loginuid, loginuid, + from_kuid(&init_user_ns, task->loginuid), + from_kuid(&init_user_ns, loginuid), task->sessionid, sessionid); audit_log_end(ab); } @@ -2543,7 +2545,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) + if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else audit_sig_uid = uid; diff --git a/net/core/dev.c b/net/core/dev.c index 026bb4a37665..1c0d0823a5a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4524,7 +4524,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), - audit_get_loginuid(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index e7ff694f1049..729a345c75a4 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1541,7 +1541,7 @@ int __init netlbl_unlabel_defconf(void) * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ security_task_getsecid(current, &audit_info.secid); - audit_info.loginuid = 0; + audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 9fae63f10298..9650c4ad5f88 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -109,7 +109,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, return NULL; audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", - audit_info->loginuid, + from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); if (audit_info->secid != 0 && diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..2f475151cea1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2630,12 +2630,12 @@ static void xfrm_policy_fini(struct net *net) flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY - audit_info.loginuid = -1; + audit_info.loginuid = INVALID_UID; audit_info.sessionid = -1; audit_info.secid = 0; xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, &audit_info); #endif - audit_info.loginuid = -1; + audit_info.loginuid = INVALID_UID; audit_info.sessionid = -1; audit_info.secid = 0; xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); @@ -2742,7 +2742,7 @@ static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, - uid_t auid, u32 sessionid, u32 secid) + kuid_t auid, u32 sessionid, u32 secid) { struct audit_buffer *audit_buf; @@ -2757,7 +2757,7 @@ void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, - uid_t auid, u32 sessionid, u32 secid) + kuid_t auid, u32 sessionid, u32 secid) { struct audit_buffer *audit_buf; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5b228f97d4b3..fce6a49bc7c6 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2045,7 +2045,7 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - audit_info.loginuid = -1; + audit_info.loginuid = INVALID_UID; audit_info.sessionid = -1; audit_info.secid = 0; xfrm_state_flush(net, IPSEC_PROTO_ANY, &audit_info); @@ -2112,7 +2112,7 @@ static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, } void xfrm_audit_state_add(struct xfrm_state *x, int result, - uid_t auid, u32 sessionid, u32 secid) + kuid_t auid, u32 sessionid, u32 secid) { struct audit_buffer *audit_buf; @@ -2127,7 +2127,7 @@ void xfrm_audit_state_add(struct xfrm_state *x, int result, EXPORT_SYMBOL_GPL(xfrm_audit_state_add); void xfrm_audit_state_delete(struct xfrm_state *x, int result, - uid_t auid, u32 sessionid, u32 secid) + kuid_t auid, u32 sessionid, u32 secid) { struct audit_buffer *audit_buf; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e75d8e47f35c..9ea55db737b4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -575,7 +575,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_state *x; int err; struct km_event c; - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; @@ -654,7 +654,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -ESRCH; struct km_event c; struct xfrm_usersa_id *p = nlmsg_data(nlh); - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; @@ -1369,7 +1369,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct km_event c; int err; int excl; - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; @@ -1624,7 +1624,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).pid); } } else { - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; @@ -1918,7 +1918,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, err = 0; if (up->hard) { - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; @@ -1961,7 +1961,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, km_state_expired(x, ue->hard, current->pid); if (ue->hard) { - uid_t loginuid = audit_get_loginuid(current); + kuid_t loginuid = audit_get_loginuid(current); u32 sessionid = audit_get_sessionid(current); u32 sid; -- cgit v1.2.3 From b40863c667c16b7a73d4f034a8eab67029b5b15a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Sep 2012 20:44:49 +0000 Subject: net: more accurate network taps in transmit path dev_queue_xmit_nit() should be called right before ndo_start_xmit() calls or we might give wrong packet contents to taps users : Packet checksum can be changed, or packet can be linearized or segmented, and segments partially sent for the later case. Also a memory allocation can fail and packet never really hit the driver entry point. Reported-by: Jamie Gloudon Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index dcc673d0674c..52cd1d7f004a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2213,9 +2213,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && @@ -2250,6 +2247,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + skb_len = skb->len; rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); @@ -2272,6 +2272,9 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(nskb, dev); + skb_len = nskb->len; rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); -- cgit v1.2.3 From 828de4f6bf6e785f7b5497f8f7cfd4d4fbcfdb7e Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 13 Sep 2012 20:58:27 +0000 Subject: net: dev: fix incorrect getting net device's name When moving a nic from net namespace A to net namespace B, in dev_change_net_namesapce,we call __dev_get_by_name to decide if the netns B has the device has the same name. if the netns B already has the same named device,we call dev_get_valid_name to try to get a valid name for this nic in the netns B,but net_device->nd_net still point to netns A now. this patch fix it. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 52cd1d7f004a..bb42eb161969 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -959,18 +959,30 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net_device *dev, const char *name) +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) { - struct net *net; + char buf[IFNAMSIZ]; + int ret; - BUG_ON(!dev_net(dev)); - net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +static int dev_get_valid_name(struct net *net, + struct net_device *dev, + const char *name) +{ + BUG_ON(!net); if (!dev_valid_name(name)) return -EINVAL; if (strchr(name, '%')) - return dev_alloc_name(dev, name); + return dev_alloc_name_ns(net, dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; else if (dev->name != name) @@ -1006,7 +1018,7 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(dev, newname); + err = dev_get_valid_name(net, dev, newname); if (err < 0) return err; @@ -5589,7 +5601,7 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = dev_get_valid_name(dev, dev->name); + ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -6233,7 +6245,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(dev, pat) < 0) + if (dev_get_valid_name(net, dev, pat) < 0) goto out; } -- cgit v1.2.3 From 2c60db037034d27f8c636403355d52872da92f81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Sep 2012 09:17:26 +0000 Subject: net: provide a default dev->ethtool_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of forcing device drivers to provide empty ethtool_ops or tweak net/core/ethtool.c again, we could provide a generic ethtool_ops. This occurred to me when I wanted to add GSO support to GRE tunnels. ethtool -k support should be generic for all drivers. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Maciej Żenczykowski Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ net/core/ethtool.c | 12 ------------ 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bb42eb161969..bbda81997f4f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5974,6 +5974,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) return queue; } +static const struct ethtool_ops default_ethtool_ops; + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -6061,6 +6063,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, strcpy(dev->name, name); dev->group = INIT_NETDEV_GROUP; + if (!dev->ethtool_ops) + dev->ethtool_ops = &default_ethtool_ops; return dev; free_all: diff --git a/net/core/ethtool.c b/net/core/ethtool.c index cbf033dcaf1f..4d64cc2e3fa9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1426,18 +1426,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; - if (!dev->ethtool_ops) { - /* A few commands do not require any driver support, - * are unprivileged, and do not change anything, so we - * can take a shortcut to them. */ - if (ethcmd == ETHTOOL_GDRVINFO) - return ethtool_get_drvinfo(dev, useraddr); - else if (ethcmd == ETHTOOL_GET_TS_INFO) - return ethtool_get_ts_info(dev, useraddr); - else - return -EOPNOTSUPP; - } - /* Allow some commands to be done by anyone */ switch (ethcmd) { case ETHTOOL_GSET: -- cgit v1.2.3 From 8c4c49df5cfeb8d56e5b85a430c8cbcb86c2ac37 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 17 Sep 2012 20:16:31 +0000 Subject: netpoll: call ->ndo_select_queue() in tx path In netpoll tx path, we miss the chance of calling ->ndo_select_queue(), thus could cause problems when bonding is involved. This patch makes dev_pick_tx() extern (and rename it to netdev_pick_tx()) to let netpoll call it in netpoll_send_skb_on_dev(). Reported-by: Sylvain Munaut Cc: "David S. Miller" Cc: Eric Dumazet Signed-off-by: Cong Wang Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 6 +++--- net/core/netpoll.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 82264e717e53..6c131f055ab0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1403,6 +1403,9 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +extern struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb); + /* * Net namespace inlines */ diff --git a/net/core/dev.c b/net/core/dev.c index bbda81997f4f..707b12425a79 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2396,8 +2396,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -static struct netdev_queue *dev_pick_tx(struct net_device *dev, - struct sk_buff *skb) +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb) { int queue_index; const struct net_device_ops *ops = dev->netdev_ops; @@ -2571,7 +2571,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = dev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dd67818025d1..77a0388fc3be 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -328,7 +328,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = netdev_pick_tx(dev, skb); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; -- cgit v1.2.3 From c9e6bc644e557338221e75c242ab12c275a67d1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2012 19:29:05 +0000 Subject: net: add gro_cells infrastructure This adds a new include file (include/net/gro_cells.h), to bring GRO (Generic Receive Offload) capability to tunnels, in a modular way. Because tunnels receive path is lockless, and GRO adds a serialization using a napi_struct, I chose to add an array of up to DEFAULT_MAX_NUM_RSS_QUEUES cells, so that multi queue devices wont be slowed down because of GRO layer. skb_get_rx_queue() is used as selector. In the future, we might add optional fanout capabilities, using rxhash for example. With help from Ben Hutchings who reminded me netif_get_num_default_rss_queues() function. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- include/net/gro_cells.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 2 + 2 files changed, 105 insertions(+) create mode 100644 include/net/gro_cells.h (limited to 'net/core/dev.c') diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h new file mode 100644 index 000000000000..4fd8a4b4b7ee --- /dev/null +++ b/include/net/gro_cells.h @@ -0,0 +1,103 @@ +#ifndef _NET_GRO_CELLS_H +#define _NET_GRO_CELLS_H + +#include +#include +#include + +struct gro_cell { + struct sk_buff_head napi_skbs; + struct napi_struct napi; +} ____cacheline_aligned_in_smp; + +struct gro_cells { + unsigned int gro_cells_mask; + struct gro_cell *cells; +}; + +static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +{ + unsigned long flags; + struct gro_cell *cell = gcells->cells; + struct net_device *dev = skb->dev; + + if (!cell || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) { + netif_rx(skb); + return; + } + + if (skb_rx_queue_recorded(skb)) + cell += skb_get_rx_queue(skb) & gcells->gro_cells_mask; + + if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return; + } + + spin_lock_irqsave(&cell->napi_skbs.lock, flags); + + __skb_queue_tail(&cell->napi_skbs, skb); + if (skb_queue_len(&cell->napi_skbs) == 1) + napi_schedule(&cell->napi); + + spin_unlock_irqrestore(&cell->napi_skbs.lock, flags); +} + +static inline int gro_cell_poll(struct napi_struct *napi, int budget) +{ + struct gro_cell *cell = container_of(napi, struct gro_cell, napi); + struct sk_buff *skb; + int work_done = 0; + + while (work_done < budget) { + skb = skb_dequeue(&cell->napi_skbs); + if (!skb) + break; + + napi_gro_receive(napi, skb); + work_done++; + } + + if (work_done < budget) + napi_complete(napi); + return work_done; +} + +static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) +{ + int i; + + gcells->gro_cells_mask = roundup_pow_of_two(netif_get_num_default_rss_queues()) - 1; + gcells->cells = kcalloc(sizeof(struct gro_cell), + gcells->gro_cells_mask + 1, + GFP_KERNEL); + if (!gcells->cells) + return -ENOMEM; + + for (i = 0; i <= gcells->gro_cells_mask; i++) { + struct gro_cell *cell = gcells->cells + i; + + skb_queue_head_init(&cell->napi_skbs); + netif_napi_add(dev, &cell->napi, gro_cell_poll, 64); + napi_enable(&cell->napi); + } + return 0; +} + +static inline void gro_cells_destroy(struct gro_cells *gcells) +{ + struct gro_cell *cell = gcells->cells; + int i; + + if (!cell) + return; + for (i = 0; i <= gcells->gro_cells_mask; i++,cell++) { + netif_napi_del(&cell->napi); + skb_queue_purge(&cell->napi_skbs); + } + kfree(gcells->cells); + gcells->cells = NULL; +} + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 3e645f3751bf..ba4bb118a756 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,6 +2645,8 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); + int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ -- cgit v1.2.3 From ca07e43e288956a0ad5e6bd075f7aa1fca3bca00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 22:28:06 +0000 Subject: net: gro: fix a potential crash in skb_gro_reset_offset Before accessing skb first fragment, better make sure there is one. This is probably not needed for old kernels, since an ethernet frame cannot contain only an ethernet header, but the recent GRO addition to tunnels makes this patch needed. Also skb_gro_reset_offset() can be static, it actually allows compiler to inline it. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 01646aa53b0e..a659fd0ba965 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1663,7 +1663,6 @@ extern int netpoll_trap(void); #endif extern int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); -extern void skb_gro_reset_offset(struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index 1e0a1847c3bb..de2bad717d56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3631,20 +3631,22 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); -void skb_gro_reset_offset(struct sk_buff *skb) +static void skb_gro_reset_offset(struct sk_buff *skb) { + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { - NAPI_GRO_CB(skb)->frag0 = - skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); } } -EXPORT_SYMBOL(skb_gro_reset_offset); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.3 From 2e71a6f8084e7ac87166dd77d99c44190fb844fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 08:08:49 +0000 Subject: net: gro: selective flush of packets Current GRO can hold packets in gro_list for almost unlimited time, in case napi->poll() handler consumes its budget over and over. In this case, napi_complete()/napi_gro_flush() are not called. Another problem is that gro_list is flushed in non friendly way : We scan the list and complete packets in the reverse order. (youngest packets first, oldest packets last) This defeats priorities that sender could have cooked. Since GRO currently only store TCP packets, we dont really notice the bug because of retransmits, but this behavior can add unexpected latencies, particularly on mice flows clamped by elephant flows. This patch makes sure no packet can stay more than 1 ms in queue, and only in stress situations. It also complete packets in the right order to minimize latencies. Signed-off-by: Eric Dumazet Cc: Herbert Xu Cc: Jesse Gross Cc: Tom Herbert Cc: Yuchung Cheng Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/skge.c | 2 +- drivers/net/ethernet/realtek/8139cp.c | 2 +- include/linux/netdevice.h | 15 ++++++++------ net/core/dev.c | 38 ++++++++++++++++++++++++++++------- 4 files changed, 42 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 3f7dab46626b..9b9c2ac5c4c2 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3189,7 +3189,7 @@ static int skge_poll(struct napi_struct *napi, int to_do) if (work_done < to_do) { unsigned long flags; - napi_gro_flush(napi); + napi_gro_flush(napi, false); spin_lock_irqsave(&hw->hw_lock, flags); __napi_complete(napi); hw->intr_mask |= napimask[skge->port]; diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 995d0cfc4c06..1c818254b7be 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -563,7 +563,7 @@ rx_next: if (cpr16(IntrStatus) & cp_rx_intr_mask) goto rx_status_loop; - napi_gro_flush(napi); + napi_gro_flush(napi, false); spin_lock_irqsave(&cp->lock, flags); __napi_complete(napi); cpw16_f(IntrMask, cp_intr_mask); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a659fd0ba965..0a36fff75bd5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1497,19 +1497,22 @@ struct napi_gro_cb { /* This indicates where we are processing relative to skb->data. */ int data_offset; - /* This is non-zero if the packet may be of the same flow. */ - int same_flow; - /* This is non-zero if the packet cannot be merged with the new skb. */ int flush; /* Number of segments aggregated. */ - int count; + u16 count; + + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow; /* Free the skb? */ - int free; + u8 free; #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 + + /* jiffies when first packet was created/queued */ + unsigned long age; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) @@ -2156,7 +2159,7 @@ extern gro_result_t dev_gro_receive(struct napi_struct *napi, extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); extern gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); -extern void napi_gro_flush(struct napi_struct *napi); +extern void napi_gro_flush(struct napi_struct *napi, bool flush_old); extern struct sk_buff * napi_get_frags(struct napi_struct *napi); extern gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index de2bad717d56..d44668f63c88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3471,17 +3471,31 @@ out: return netif_receive_skb(skb); } -inline void napi_gro_flush(struct napi_struct *napi) +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *next; + struct sk_buff *skb, *prev = NULL; - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; + /* scan list and build reverse chain */ + for (skb = napi->gro_list; skb != NULL; skb = skb->next) { + skb->prev = prev; + prev = skb; + } + + for (skb = prev; skb; skb = prev) { skb->next = NULL; + + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + + prev = skb->prev; napi_gro_complete(skb); + napi->gro_count--; } - napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); @@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; @@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n); + napi_gro_flush(n, false); local_irq_save(flags); __napi_complete(n); local_irq_restore(flags); @@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h) local_irq_enable(); napi_complete(n); local_irq_disable(); - } else + } else { + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + local_irq_enable(); + napi_gro_flush(n, HZ >= 1000); + local_irq_disable(); + } list_move_tail(&n->poll_list, &sd->poll_list); + } } netpoll_poll_unlock(have); -- cgit v1.2.3 From 48cc32d38a52d0b68f91a171a8d00531edc6a46e Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Sun, 7 Oct 2012 15:51:58 +0000 Subject: vlan: don't deliver frames for unknown vlans to protocols 6a32e4f9dd9219261f8856f817e6655114cfec2f made the vlan code skip marking vlan-tagged frames for not locally configured vlans as PACKET_OTHERHOST if there was an rx_handler, as the rx_handler could cause the frame to be received on a different (virtual) vlan-capable interface where that vlan might be configured. As rx_handlers do not necessarily return RX_HANDLER_ANOTHER, this could cause frames for unknown vlans to be delivered to the protocol stack as if they had been received untagged. For example, if an ipv6 router advertisement that's tagged for a locally not configured vlan is received on an interface with macvlan interfaces attached, macvlan's rx_handler returns RX_HANDLER_PASS after delivering the frame to the macvlan interfaces, which caused it to be passed to the protocol stack, leading to ipv6 addresses for the announced prefix being configured even though those are completely unusable on the underlying interface. The fix moves marking as PACKET_OTHERHOST after the rx_handler so the rx_handler, if there is one, sees the frame unchanged, but afterwards, before the frame is delivered to the protocol stack, it gets marked whether there is an rx_handler or not. Signed-off-by: Florian Zumbiehl Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 8 ++++---- net/8021q/vlan_core.c | 10 ++-------- net/core/dev.c | 7 +++++-- 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e6ff12dd717b..c0ff748d0aa5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -80,6 +80,8 @@ static inline int is_vlan_dev(struct net_device *dev) } #define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +#define vlan_tx_nonzero_tag_present(__skb) \ + (vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK)) #define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) @@ -89,7 +91,7 @@ extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev, extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); -extern bool vlan_do_receive(struct sk_buff **skb, bool last_handler); +extern bool vlan_do_receive(struct sk_buff **skb); extern struct sk_buff *vlan_untag(struct sk_buff *skb); extern int vlan_vid_add(struct net_device *dev, unsigned short vid); @@ -120,10 +122,8 @@ static inline u16 vlan_dev_vlan_id(const struct net_device *dev) return 0; } -static inline bool vlan_do_receive(struct sk_buff **skb, bool last_handler) +static inline bool vlan_do_receive(struct sk_buff **skb) { - if (((*skb)->vlan_tci & VLAN_VID_MASK) && last_handler) - (*skb)->pkt_type = PACKET_OTHERHOST; return false; } diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index add69d0fd99d..fbbf1fa00940 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -5,7 +5,7 @@ #include #include "vlan.h" -bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) +bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; @@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_id); - if (!vlan_dev) { - /* Only the last call to vlan_do_receive() should change - * pkt_type to PACKET_OTHERHOST - */ - if (vlan_id && last_handler) - skb->pkt_type = PACKET_OTHERHOST; + if (!vlan_dev) return false; - } skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) diff --git a/net/core/dev.c b/net/core/dev.c index d44668f63c88..09cb3f6dc40c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3300,18 +3300,18 @@ ncls: && !skb_pfmemalloc_protocol(skb)) goto drop; - rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb, !rx_handler)) + if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto unlock; } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3331,6 +3331,9 @@ ncls: } } + if (vlan_tx_nonzero_tag_present(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; -- cgit v1.2.3