netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Florian Westphal <fw@strlen.de>
To: Eyal Birger <eyal.birger@gmail.com>
Cc: steffen.klassert@secunet.com, netdev@vger.kernel.org,
	herbert@gondor.apana.org.au, davem@davemloft.net,
	shmulik@metanetworks.com
Subject: Re: xfrm, ip tunnel: non released device reference upon device unregistration
Date: Tue, 6 Feb 2018 13:56:24 +0100	[thread overview]
Message-ID: <20180206125624.GE15427@breakpoint.cc> (raw)
In-Reply-To: <20180204132118.6fef9bf0@jimi>

Eyal Birger <eyal.birger@gmail.com> wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

This looks like it might be related to a bug reported by Hangbin Liu.

Setup is:

       Host A                --        Host B
tun0 -- br0 (ipsec) -- eth0  --  eth0 (ipsec) -- tun0

... where tun0 are ipip tunnels.

module removal then hangs forever as device refcount is held by
dst_cache.
Hangbin gave following reproducer script:

Host B:
ip link add tun0 type ipip local 192.168.7.1 remote 192.168.7.10
ip link set tun0 up
ip addr add 10.0.0.2/24 dev tun0
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000 proto ah
auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.1 spi 1000 proto ah auth
sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir out tmpl src
192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir in tmpl src
192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport level use

On Host A (tun0 10.0.0.1, br0 192.168.7.10):
ip addr flush dev eth0
brctl addbr br0
brctl addif br0 eth0
ip link set br0 up
ip addr add 192.168.7.10/24 dev br0

ip link add tun0 type ipip local 192.168.7.10 remote 192.168.7.1
ip link set tun0 up
ip addr add 10.0.0.1/24 dev tun0

ip xfrm state add src 192.168.7.10 dst 192.168.7.1 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir out tmpl
src 192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir in tmpl src 192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport level use

# should work, else something is broken:
ping 10.0.0.2 -c 2

# This hangs:
modprobe -r bridge

This hangs even when I remove the ipsec xfrm pcpu cache.

What does work in above setup is this:

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index 72fd5067c353..d5bcd6ea77f3 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -95,4 +95,13 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
  */
 void dst_cache_destroy(struct dst_cache *dst_cache);
 
+/**
+ *	dst_cache_clear_dev - put device from cached tunnel
+ *	@dst_cache: the cache
+ *	@dev: device being removed
+ *
+ *	puts the device and resets the cache in case the device is
+ *	used by the cached dst.
+ */
+void dst_cache_clear_dev(struct dst_cache *dstcache, const struct net_device *dev);
 #endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 1f16773cfd76..4e337b99fa96 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -289,6 +289,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int net_id);
+
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..56cafe08756d 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -166,3 +166,32 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
 	free_percpu(dst_cache->cache);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_clear_dev(struct dst_cache *dst_cache, const struct net_device *dev)
+{
+	struct dst_entry *dst;
+	bool reset = false;
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	rcu_read_lock();
+	for_each_possible_cpu(i) {
+		dst = per_cpu_ptr(dst_cache->cache, i)->dst;
+		if (!dst)
+			continue;
+
+		if (!dst_hold_safe(dst))
+			continue;
+
+		if (dst->dev == dev) {
+			dst_dev_put(dst);
+			reset = true;
+		}
+		dst_release(dst);
+	}
+	rcu_read_unlock();
+	if (reset)
+		dst_cache_reset(dst_cache);
+}
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d786a8441bce..cea578190258 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1224,4 +1224,25 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int id)
+{
+	struct net *net = dev_net(dev);
+	struct ip_tunnel_net *itn = net_generic(net,  id);
+	unsigned int i;
+
+	ASSERT_RTNL();
+
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++) {
+		struct ip_tunnel *t;
+		struct hlist_node *n;
+		struct hlist_head *thead = &itn->tunnels[i];
+
+		hlist_for_each_entry_safe(t, n, thead, hash_node)
+			dst_cache_clear_dev(&t->dst_cache, dev);
+
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_device_down);
+
 MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..da9640e082eb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -357,6 +357,21 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return 0;
 }
 
+int ipip_tunnel_device_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_DOWN:
+		ip_tunnel_device_down(dev, ipip_net_id);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(ipip_tunnel_device_event);
+
 static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_init       = ipip_tunnel_init,
 	.ndo_uninit     = ip_tunnel_uninit,
@@ -671,6 +686,10 @@ static struct pernet_operations ipip_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+static struct notifier_block ipip_device_notifier = {
+	.notifier_call	= ipip_tunnel_device_event,
+};
+
 static int __init ipip_init(void)
 {
 	int err;
@@ -696,6 +715,11 @@ static int __init ipip_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_netdevice_notifier(&ipip_device_notifier);
+	if (err < 0) {
+		rtnl_link_unregister(&ipip_link_ops);
+		goto rtnl_link_failed;
+	}
 out:
 	return err;
 
@@ -713,6 +737,7 @@ static int __init ipip_init(void)
 
 static void __exit ipip_fini(void)
 {
+	unregister_netdevice_notifier(&ipip_device_notifier);
 	rtnl_link_unregister(&ipip_link_ops);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		pr_info("%s: can't deregister tunnel\n", __func__);

  parent reply	other threads:[~2018-02-06 12:59 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-04 11:21 xfrm, ip tunnel: non released device reference upon device unregistration Eyal Birger
2018-02-04 13:32 ` Eyal Birger
2018-02-06  8:53 ` Steffen Klassert
2018-02-06 10:32   ` Florian Westphal
2018-02-06 10:42   ` Eyal Birger
2018-02-06 12:56 ` Florian Westphal [this message]
2018-02-06 13:09   ` Steffen Klassert
2018-02-06 13:15     ` Florian Westphal
2018-02-06 13:21       ` Steffen Klassert
2018-02-06 19:19       ` Eyal Birger
2018-02-11 15:46         ` Florian Westphal
2018-02-12 11:54           ` Eyal Birger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180206125624.GE15427@breakpoint.cc \
    --to=fw@strlen.de \
    --cc=davem@davemloft.net \
    --cc=eyal.birger@gmail.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=netdev@vger.kernel.org \
    --cc=shmulik@metanetworks.com \
    --cc=steffen.klassert@secunet.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).