* [PATCH net-next 1/2] net: vrf: Fix dst reference counting
2016-05-25 16:35 [PATCH 4.5-stable 0/2] net: vrf: 4.5 backports David Ahern
@ 2016-05-25 16:35 ` David Ahern
2016-05-25 17:02 ` David Ahern
2016-05-25 16:35 ` [PATCH net-next 2/2] net: vrf: protect changes to private data with rcu David Ahern
1 sibling, 1 reply; 4+ messages in thread
From: David Ahern @ 2016-05-25 16:35 UTC (permalink / raw)
To: netdev; +Cc: David Ahern, David S. Miller
commit 9ab179d83b4e31ea277a123492e419067c2f129a upstream
Vivek reported a kernel exception deleting a VRF with an active
connection through it. The root cause is that the socket has a cached
reference to a dst that is destroyed. Converting the dst_destroy to
dst_release and letting proper reference counting kick in does not
work as the dst has a reference to the device which needs to be released
as well.
I talked to Hannes about this at netdev and he pointed out the ipv4 and
ipv6 dst handling has dst_ifdown for just this scenario. Rather than
continuing with the reinvented dst wheel in VRF just remove it and
leverage the ipv4 and ipv6 versions.
Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver")
Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/vrf.c | 177 +++++-------------------------------------------
include/net/ip6_route.h | 3 +
include/net/route.h | 3 +
net/ipv4/route.c | 7 +-
net/ipv6/route.c | 7 +-
5 files changed, 30 insertions(+), 167 deletions(-)
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index bdcf617a9d52..d8197f93e56c 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -61,41 +61,6 @@ struct pcpu_dstats {
struct u64_stats_sync syncp;
};
-static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie)
-{
- return dst;
-}
-
-static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- return ip_local_out(net, sk, skb);
-}
-
-static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
-{
- /* TO-DO: return max ethernet size? */
- return dst->dev->mtu;
-}
-
-static void vrf_dst_destroy(struct dst_entry *dst)
-{
- /* our dst lives forever - or until the device is closed */
-}
-
-static unsigned int vrf_default_advmss(const struct dst_entry *dst)
-{
- return 65535 - 40;
-}
-
-static struct dst_ops vrf_dst_ops = {
- .family = AF_INET,
- .local_out = vrf_ip_local_out,
- .check = vrf_ip_check,
- .mtu = vrf_v4_mtu,
- .destroy = vrf_dst_destroy,
- .default_advmss = vrf_default_advmss,
-};
-
/* neighbor handling is done with actual device; do not want
* to flip skb->dev for those ndisc packets. This really fails
* for multiple next protocols (e.g., NEXTHDR_HOP). But it is
@@ -350,46 +315,6 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
}
#if IS_ENABLED(CONFIG_IPV6)
-static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie)
-{
- return dst;
-}
-
-static struct dst_ops vrf_dst_ops6 = {
- .family = AF_INET6,
- .local_out = ip6_local_out,
- .check = vrf_ip6_check,
- .mtu = vrf_v4_mtu,
- .destroy = vrf_dst_destroy,
- .default_advmss = vrf_default_advmss,
-};
-
-static int init_dst_ops6_kmem_cachep(void)
-{
- vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache",
- sizeof(struct rt6_info),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
-
- if (!vrf_dst_ops6.kmem_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
- kmem_cache_destroy(vrf_dst_ops6.kmem_cachep);
-}
-
-static int vrf_input6(struct sk_buff *skb)
-{
- skb->dev->stats.rx_errors++;
- kfree_skb(skb);
- return 0;
-}
-
/* modelled after ip6_finish_output2 */
static int vrf_finish_output6(struct net *net, struct sock *sk,
struct sk_buff *skb)
@@ -430,67 +355,34 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
{
- dst_destroy(&vrf->rt6->dst);
- free_percpu(vrf->rt6->rt6i_pcpu);
+ dst_release(&vrf->rt6->dst);
vrf->rt6 = NULL;
}
static int vrf_rt6_create(struct net_device *dev)
{
struct net_vrf *vrf = netdev_priv(dev);
- struct dst_entry *dst;
+ struct net *net = dev_net(dev);
struct rt6_info *rt6;
- int cpu;
int rc = -ENOMEM;
- rt6 = dst_alloc(&vrf_dst_ops6, dev, 0,
- DST_OBSOLETE_NONE,
- (DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+ rt6 = ip6_dst_alloc(net, dev,
+ DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE);
if (!rt6)
goto out;
- dst = &rt6->dst;
-
- rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
- if (!rt6->rt6i_pcpu) {
- dst_destroy(dst);
- goto out;
- }
- for_each_possible_cpu(cpu) {
- struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu);
- *p = NULL;
- }
-
- memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst));
-
- INIT_LIST_HEAD(&rt6->rt6i_siblings);
- INIT_LIST_HEAD(&rt6->rt6i_uncached);
-
- rt6->dst.input = vrf_input6;
rt6->dst.output = vrf_output6;
-
- rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id);
-
- atomic_set(&rt6->dst.__refcnt, 2);
-
+ rt6->rt6i_table = fib6_get_table(net, vrf->tb_id);
+ dst_hold(&rt6->dst);
vrf->rt6 = rt6;
rc = 0;
out:
return rc;
}
#else
-static int init_dst_ops6_kmem_cachep(void)
-{
- return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
-}
-
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
{
}
@@ -558,11 +450,11 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-static void vrf_rtable_destroy(struct net_vrf *vrf)
+static void vrf_rtable_release(struct net_vrf *vrf)
{
struct dst_entry *dst = (struct dst_entry *)vrf->rth;
- dst_destroy(dst);
+ dst_release(dst);
vrf->rth = NULL;
}
@@ -571,22 +463,10 @@ static struct rtable *vrf_rtable_create(struct net_device *dev)
struct net_vrf *vrf = netdev_priv(dev);
struct rtable *rth;
- rth = dst_alloc(&vrf_dst_ops, dev, 2,
- DST_OBSOLETE_NONE,
- (DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
if (rth) {
rth->dst.output = vrf_output;
- rth->rt_genid = rt_genid_ipv4(dev_net(dev));
- rth->rt_flags = 0;
- rth->rt_type = RTN_UNICAST;
- rth->rt_is_input = 0;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
rth->rt_table_id = vrf->tb_id;
- INIT_LIST_HEAD(&rth->rt_uncached);
- rth->rt_uncached_list = NULL;
}
return rth;
@@ -674,8 +554,8 @@ static void vrf_dev_uninit(struct net_device *dev)
struct net_device *port_dev;
struct list_head *iter;
- vrf_rtable_destroy(vrf);
- vrf_rt6_destroy(vrf);
+ vrf_rtable_release(vrf);
+ vrf_rt6_release(vrf);
netdev_for_each_lower_dev(dev, port_dev, iter)
vrf_del_slave(dev, port_dev);
@@ -705,7 +585,7 @@ static int vrf_dev_init(struct net_device *dev)
return 0;
out_rth:
- vrf_rtable_destroy(vrf);
+ vrf_rtable_release(vrf);
out_stats:
free_percpu(dev->dstats);
dev->dstats = NULL;
@@ -738,7 +618,7 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev,
struct net_vrf *vrf = netdev_priv(dev);
rth = vrf->rth;
- atomic_inc(&rth->dst.__refcnt);
+ dst_hold(&rth->dst);
}
return rth;
@@ -789,7 +669,7 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
struct net_vrf *vrf = netdev_priv(dev);
rt = vrf->rt6;
- atomic_inc(&rt->dst.__refcnt);
+ dst_hold(&rt->dst);
}
return (struct dst_entry *)rt;
@@ -926,19 +806,6 @@ static int __init vrf_init_module(void)
{
int rc;
- vrf_dst_ops.kmem_cachep =
- kmem_cache_create("vrf_ip_dst_cache",
- sizeof(struct rtable), 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
-
- if (!vrf_dst_ops.kmem_cachep)
- return -ENOMEM;
-
- rc = init_dst_ops6_kmem_cachep();
- if (rc != 0)
- goto error2;
-
register_netdevice_notifier(&vrf_notifier_block);
rc = rtnl_link_register(&vrf_link_ops);
@@ -949,22 +816,10 @@ static int __init vrf_init_module(void)
error:
unregister_netdevice_notifier(&vrf_notifier_block);
- free_dst_ops6_kmem_cachep();
-error2:
- kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
return rc;
}
-static void __exit vrf_cleanup_module(void)
-{
- rtnl_link_unregister(&vrf_link_ops);
- unregister_netdevice_notifier(&vrf_notifier_block);
- kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
- free_dst_ops6_kmem_cachep();
-}
-
module_init(vrf_init_module);
-module_exit(vrf_cleanup_module);
MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
MODULE_LICENSE("GPL");
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 295d291269e2..54c779416eec 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -101,6 +101,9 @@ void fib6_force_start_gc(struct net *net);
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
const struct in6_addr *addr, bool anycast);
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+ int flags);
+
/*
* support functions for ND
*
diff --git a/include/net/route.h b/include/net/route.h
index a3b9ef74a389..1587ca2600c9 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -209,6 +209,9 @@ unsigned int inet_addr_type_dev_table(struct net *net,
void ip_rt_multicast_event(struct in_device *);
int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
+struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
+ bool nopolicy, bool noxfrm, bool will_cache);
struct in_ifaddr;
void fib_add_ifaddr(struct in_ifaddr *);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b050cf980a57..60398a9370e7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#endif
}
-static struct rtable *rt_dst_alloc(struct net_device *dev,
- unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
+ bool nopolicy, bool noxfrm, bool will_cache)
{
struct rtable *rt;
@@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
return rt;
}
+EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e29e2f8877..f4dd14318f9c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
return rt;
}
-static struct rt6_info *ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags)
+struct rt6_info *ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags)
{
struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
@@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net,
return rt;
}
+EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
--
2.1.4
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH net-next 2/2] net: vrf: protect changes to private data with rcu
2016-05-25 16:35 [PATCH 4.5-stable 0/2] net: vrf: 4.5 backports David Ahern
2016-05-25 16:35 ` [PATCH net-next 1/2] net: vrf: Fix dst reference counting David Ahern
@ 2016-05-25 16:35 ` David Ahern
1 sibling, 0 replies; 4+ messages in thread
From: David Ahern @ 2016-05-25 16:35 UTC (permalink / raw)
To: netdev; +Cc: David Ahern, David S. Miller
commit b0e95ccdd77591f108c938bbc702b57554a1665d upstream
One cpu can be processing packets which includes using the cached route
entries in the vrf device's private data and on another cpu the device
gets deleted which releases the routes and sets the pointers in net_vrf
to NULL. This results in datapath dereferencing a NULL pointer.
Fix by protecting access to dst's with rcu.
Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Fixes: 35402e313663 ("net: Add IPv6 support to VRF device")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/vrf.c | 68 +++++++++++++++++++++++++++++++++++++------------------
1 file changed, 46 insertions(+), 22 deletions(-)
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index d8197f93e56c..bfc9febfb022 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -47,8 +47,8 @@
((struct net_device *)rcu_dereference(dev->rx_handler_data))
struct net_vrf {
- struct rtable *rth;
- struct rt6_info *rt6;
+ struct rtable __rcu *rth;
+ struct rt6_info __rcu *rt6;
u32 tb_id;
};
@@ -355,10 +355,15 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
+/* holding rtnl */
static void vrf_rt6_release(struct net_vrf *vrf)
{
- dst_release(&vrf->rt6->dst);
- vrf->rt6 = NULL;
+ struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
+
+ rcu_assign_pointer(vrf->rt6, NULL);
+
+ if (rt6)
+ dst_release(&rt6->dst);
}
static int vrf_rt6_create(struct net_device *dev)
@@ -376,7 +381,8 @@ static int vrf_rt6_create(struct net_device *dev)
rt6->dst.output = vrf_output6;
rt6->rt6i_table = fib6_get_table(net, vrf->tb_id);
dst_hold(&rt6->dst);
- vrf->rt6 = rt6;
+ rcu_assign_pointer(vrf->rt6, rt6);
+
rc = 0;
out:
return rc;
@@ -450,26 +456,32 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
+/* holding rtnl */
static void vrf_rtable_release(struct net_vrf *vrf)
{
- struct dst_entry *dst = (struct dst_entry *)vrf->rth;
+ struct rtable *rth = rtnl_dereference(vrf->rth);
+
+ rcu_assign_pointer(vrf->rth, NULL);
- dst_release(dst);
- vrf->rth = NULL;
+ if (rth)
+ dst_release(&rth->dst);
}
-static struct rtable *vrf_rtable_create(struct net_device *dev)
+static int vrf_rtable_create(struct net_device *dev)
{
struct net_vrf *vrf = netdev_priv(dev);
struct rtable *rth;
rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
- if (rth) {
- rth->dst.output = vrf_output;
- rth->rt_table_id = vrf->tb_id;
- }
+ if (!rth)
+ return -ENOMEM;
- return rth;
+ rth->dst.output = vrf_output;
+ rth->rt_table_id = vrf->tb_id;
+
+ rcu_assign_pointer(vrf->rth, rth);
+
+ return 0;
}
/**************************** device handling ********************/
@@ -573,8 +585,7 @@ static int vrf_dev_init(struct net_device *dev)
goto out_nomem;
/* create the default dst which points back to us */
- vrf->rth = vrf_rtable_create(dev);
- if (!vrf->rth)
+ if (vrf_rtable_create(dev) != 0)
goto out_stats;
if (vrf_rt6_create(dev) != 0)
@@ -617,8 +628,13 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev,
if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) {
struct net_vrf *vrf = netdev_priv(dev);
- rth = vrf->rth;
- dst_hold(&rth->dst);
+ rcu_read_lock();
+
+ rth = rcu_dereference(vrf->rth);
+ if (likely(rth))
+ dst_hold(&rth->dst);
+
+ rcu_read_unlock();
}
return rth;
@@ -663,16 +679,24 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
const struct flowi6 *fl6)
{
- struct rt6_info *rt = NULL;
+ struct dst_entry *dst = NULL;
if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
struct net_vrf *vrf = netdev_priv(dev);
+ struct rt6_info *rt;
+
+ rcu_read_lock();
+
+ rt = rcu_dereference(vrf->rt6);
+ if (likely(rt)) {
+ dst = &rt->dst;
+ dst_hold(dst);
+ }
- rt = vrf->rt6;
- dst_hold(&rt->dst);
+ rcu_read_unlock();
}
- return (struct dst_entry *)rt;
+ return dst;
}
#endif
--
2.1.4
^ permalink raw reply related [flat|nested] 4+ messages in thread