netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ipt_CLUSTERIP: Add network device notifier
@ 2014-04-07 11:58 Kirill Tkhai
  2014-04-28 14:23 ` Pablo Neira Ayuso
  0 siblings, 1 reply; 10+ messages in thread
From: Kirill Tkhai @ 2014-04-07 11:58 UTC (permalink / raw)
  To: netfilter-devel
  Cc: Pablo Neira Ayuso, Patrick McHardy, Jozsef Kadlecsik, tkhai

Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
So, unregister_netdevice catches the leak:

# modprobe dummy
# iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
# rmmod dummy

   Message from syslogd@localhost ...
     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1

To fix that we add netdevice notifier, which masks target is inactive
(i.e., it sets target.dev to NULL) when device is going away,
and executes dev_put().

If the device comes back, the target is being masked as active and does
dev_hold().

Initially I found this on OpenVZ kernel 2.6.32, when I was investigating
a leak on container stopping.

The call:

unregister_netdevice()  --->  waiting for device refcnt is 1  ---> fail

was before:

ipt_unregister_table()  --->  remove_target.destroy() ---> dev_put();

It looks like, containers on vanila kernel have the same problem
with stopping.

***

The another solution for this is to implement generic deletion of
any rule from ip table, but it requires a lot of userspace iptable
parsing functionality to be moved into kernel. ipt_CLUSTERIP is
the only module which needs this, so it looks like the local fix
of this patch is enough.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Pablo Neira Ayuso <pablo@netfilter.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c |  146 +++++++++++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2510c02..f878946 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -46,7 +46,10 @@ struct clusterip_config {
 
 	__be32 clusterip;			/* the IP address */
 	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
-	struct net_device *dev;			/* device */
+	struct net_device *dev;			/* device. Access to dev->...
+						 * fields requires
+						 * &cn->lock is held */
+	char dev_name[IFNAMSIZ];
 	u_int16_t num_total_nodes;		/* total number of nodes */
 	unsigned long local_nodes;		/* node number array */
 
@@ -97,9 +100,8 @@ clusterip_config_put(struct clusterip_config *c)
  * entry(rule) is removed, remove the config from lists, but don't free it
  * yet, since proc-files could still be holding references */
 static inline void
-clusterip_config_entry_put(struct clusterip_config *c)
+clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 {
-	struct net *net = dev_net(c->dev);
 	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
 	local_bh_disable();
@@ -108,8 +110,10 @@ clusterip_config_entry_put(struct clusterip_config *c)
 		spin_unlock(&cn->lock);
 		local_bh_enable();
 
-		dev_mc_del(c->dev, c->clustermac);
-		dev_put(c->dev);
+		if (c->dev) {
+			dev_mc_del(c->dev, c->clustermac);
+			dev_put(c->dev);
+		}
 
 		/* In case anyone still accesses the file, the open/close
 		 * functions are also incrementing the refcount on their own,
@@ -176,6 +180,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 		return NULL;
 
 	c->dev = dev;
+	memcpy(c->dev_name, dev->name, IFNAMSIZ);
 	c->clusterip = ip;
 	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
 	c->num_total_nodes = i->num_total_nodes;
@@ -295,6 +300,91 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
 }
 
 /***********************************************************************
+ * NETDEVICE NOTIFIER
+ ***********************************************************************/
+static int cip_unregister_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (c->dev == dev) {
+			c->dev = NULL;
+			dev_mc_del(dev, c->clustermac);
+			dev_put(dev);
+		}
+
+	spin_unlock_bh(&cn->lock);
+
+	synchronize_net();
+
+	return NOTIFY_DONE;
+}
+
+static int cip_register_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (!c->dev && strcmp(c->dev_name, dev->name) == 0) {
+			dev_hold(dev);
+			dev_mc_add(dev, c->clustermac);
+			c->dev = dev;
+		}
+
+	spin_unlock_bh(&cn->lock);
+
+	synchronize_net();
+
+	return NOTIFY_DONE;
+}
+
+static int cip_rename_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (c->dev == dev)
+			memcpy(c->dev_name, dev->name, IFNAMSIZ);
+
+	spin_unlock_bh(&cn->lock);
+
+	return NOTIFY_DONE;
+}
+
+static int cip_dev_notify(struct notifier_block *this, unsigned long event,
+			  void *data)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(data);
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		return cip_unregister_device(dev);
+	case NETDEV_REGISTER:
+		return cip_register_device(dev);
+	case NETDEV_CHANGENAME:
+		return cip_rename_device(dev);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cip_dev_notifier = {
+	.notifier_call = cip_dev_notify,
+	.priority = 0
+};
+
+/***********************************************************************
  * IPTABLES TARGET
  ***********************************************************************/
 
@@ -365,6 +455,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
 	const struct ipt_entry *e = par->entryinfo;
 	struct clusterip_config *config;
+	struct net *net = par->net;
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct net_device *dev;
 	int ret;
 
 	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
@@ -382,15 +475,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 
 	/* FIXME: further sanity checks */
 
-	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
+	config = clusterip_config_find_get(net, e->ip.dst.s_addr, 1);
 	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			pr_info("no config found for %pI4, need 'new'\n",
 				&e->ip.dst.s_addr);
 			return -EINVAL;
 		} else {
-			struct net_device *dev;
-
 			if (e->ip.iniface[0] == '\0') {
 				pr_info("Please specify an interface name\n");
 				return -EINVAL;
@@ -411,6 +502,30 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 			}
 			dev_mc_add(config->dev, config->clustermac);
 		}
+	} else if (!config->dev) {
+		int fail = 0;
+		/* Device were removed, so we need insert it back */
+		dev = dev_get_by_name(net, e->ip.iniface);
+		if (!dev) {
+			clusterip_config_entry_put(net, config);
+			printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+			return false;
+		}
+		spin_lock_bh(&cn->lock);
+		/* Check again under lock */
+		if (config->dev == NULL) {
+			config->dev = dev;
+			memcpy(config->dev_name, dev->name, IFNAMSIZ);
+			dev_mc_add(config->dev, config->clustermac);
+		} else
+			fail = 1;
+		spin_unlock_bh(&cn->lock);
+
+		if (fail) {
+			dev_put(dev);
+			clusterip_config_entry_put(net, config);
+			return false;
+		}
 	}
 	cipinfo->config = config;
 
@@ -428,7 +543,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 
 	/* if no more entries are referencing the config, remove it
 	 * from the list and destroy the proc entry */
-	clusterip_config_entry_put(cipinfo->config);
+	clusterip_config_entry_put(par->net, cipinfo->config);
 
 	clusterip_config_put(cipinfo->config);
 
@@ -522,7 +637,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 	/* if there is no clusterip configuration for the arp reply's
 	 * source ip, we don't want to mangle it */
 	c = clusterip_config_find_get(net, payload->src_ip, 0);
-	if (!c)
+	if (!c || !c->dev)
 		return NF_ACCEPT;
 
 	/* normally the linux kernel always replies to arp queries of
@@ -532,7 +647,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 	if (c->dev != out) {
 		pr_debug("not mangling arp reply on different "
 			 "interface: cip'%s'-skb'%s'\n",
-			 c->dev->name, out->name);
+			 c->dev_name, out->name);
 		clusterip_config_put(c);
 		return NF_ACCEPT;
 	}
@@ -753,10 +868,14 @@ static int __init clusterip_tg_init(void)
 	if (ret < 0)
 		return ret;
 
-	ret = xt_register_target(&clusterip_tg_reg);
+	ret = register_netdevice_notifier(&cip_dev_notifier);
 	if (ret < 0)
 		goto cleanup_subsys;
 
+	ret = xt_register_target(&clusterip_tg_reg);
+	if (ret < 0)
+		goto cleanup_notifier;
+
 	ret = nf_register_hook(&cip_arp_ops);
 	if (ret < 0)
 		goto cleanup_target;
@@ -768,6 +887,8 @@ static int __init clusterip_tg_init(void)
 
 cleanup_target:
 	xt_unregister_target(&clusterip_tg_reg);
+cleanup_notifier:
+	unregister_netdevice_notifier(&cip_dev_notifier);
 cleanup_subsys:
 	unregister_pernet_subsys(&clusterip_net_ops);
 	return ret;
@@ -779,6 +900,7 @@ static void __exit clusterip_tg_exit(void)
 
 	nf_unregister_hook(&cip_arp_ops);
 	xt_unregister_target(&clusterip_tg_reg);
+	unregister_netdevice_notifier(&cip_dev_notifier);
 	unregister_pernet_subsys(&clusterip_net_ops);
 
 	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */




^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-04-07 11:58 [PATCH] ipt_CLUSTERIP: Add network device notifier Kirill Tkhai
@ 2014-04-28 14:23 ` Pablo Neira Ayuso
  2014-06-11 11:44   ` Kirill Tkhai
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-04-28 14:23 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai

Hi,

On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> So, unregister_netdevice catches the leak:
> 
> # modprobe dummy
> # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> # rmmod dummy
>
>   Message from syslogd@localhost ...
>     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
>
[...]
>  1 file changed, 134 insertions(+), 12 deletions(-)

I have spinned several times on this patch, and I'm not very happy
with taking this fix:

1) It's quite large fix for a situation that seems unlikely to me.

2) We have this problem since the beginning, since the CLUSTERIP
   target was merged mainstream.

3) We have theses days the cluster match, which is more flexible as
   you can also use it not only for backend, but also in active-active
   gateway setups. It just requires a couple of arptables rules for
   mangling ARP replies to include the multicast MAC there.

Perhaps linking net_device structure with the module that have created
would simplify this, but I guess David won't take such patch just to
fix this rare iptables extension, unless this is manifesting in other
netdev code, eg. tunneling protocols.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-04-28 14:23 ` Pablo Neira Ayuso
@ 2014-06-11 11:44   ` Kirill Tkhai
  2014-06-11 11:49     ` Pablo Neira Ayuso
  0 siblings, 1 reply; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-11 11:44 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai,
	Pavel Emelyanov

Hi, Pablo,

В Пн, 28/04/2014 в 16:23 +0200, Pablo Neira Ayuso пишет:
> Hi,
> 
> On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> > Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> > So, unregister_netdevice catches the leak:
> > 
> > # modprobe dummy
> > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> > # rmmod dummy
> >
> >   Message from syslogd@localhost ...
> >     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
> >
> [...]
> >  1 file changed, 134 insertions(+), 12 deletions(-)
> 
> I have spinned several times on this patch, and I'm not very happy
> with taking this fix:
> 
> 1) It's quite large fix for a situation that seems unlikely to me.

We have several reports from containers users, who bumped into this.
The hang happens on netns stop, it's 100% reproducible. Every time
a container is stopping or a device is going away, the unregistration
fails and hungs if CLUSTERIP is used. So, we'd want to have some fix
of this.

> 2) We have this problem since the beginning, since the CLUSTERIP
>    target was merged mainstream.

Some time ago we discovered, several our BUGs are connected with this
problem. It's not once-only for us.

> 3) We have theses days the cluster match, which is more flexible as
>    you can also use it not only for backend, but also in active-active
>    gateway setups. It just requires a couple of arptables rules for
>    mangling ARP replies to include the multicast MAC there.

Yes, but CLUSTERIP is still in upstream, and some people still use it.
This bug potentially is a bomb, when the only user of a container can
kill all the system.

> Perhaps linking net_device structure with the module that have created
> would simplify this, but I guess David won't take such patch just to
> fix this rare iptables extension, unless this is manifesting in other
> netdev code, eg. tunneling protocols.

Is there a better decidion? We'd could fix it in other way if you suggest.

Thanks,
Kirill

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 11:44   ` Kirill Tkhai
@ 2014-06-11 11:49     ` Pablo Neira Ayuso
  2014-06-11 11:55       ` Kirill Tkhai
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-06-11 11:49 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai,
	Pavel Emelyanov

On Wed, Jun 11, 2014 at 03:44:39PM +0400, Kirill Tkhai wrote:
> Hi, Pablo,
> 
> В Пн, 28/04/2014 в 16:23 +0200, Pablo Neira Ayuso пишет:
> > Hi,
> > 
> > On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> > > Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> > > So, unregister_netdevice catches the leak:
> > > 
> > > # modprobe dummy
> > > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> > > # rmmod dummy
> > >
> > >   Message from syslogd@localhost ...
> > >     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
> > >
> > [...]
> > >  1 file changed, 134 insertions(+), 12 deletions(-)
> > 
> > I have spinned several times on this patch, and I'm not very happy
> > with taking this fix:
> > 
> > 1) It's quite large fix for a situation that seems unlikely to me.
> 
> We have several reports from containers users, who bumped into this.
> The hang happens on netns stop, it's 100% reproducible. Every time
> a container is stopping or a device is going away, the unregistration
> fails and hungs if CLUSTERIP is used. So, we'd want to have some fix
> of this.

How it this combination being triggered there? I mean:

# modprobe dummy
# iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
# rmmod dummy

Is it something included in some scripts that automate the setup?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 11:49     ` Pablo Neira Ayuso
@ 2014-06-11 11:55       ` Kirill Tkhai
  2014-06-11 11:58         ` Pavel Emelyanov
  2014-06-11 12:00         ` Pablo Neira Ayuso
  0 siblings, 2 replies; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-11 11:55 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai,
	Pavel Emelyanov

В Ср, 11/06/2014 в 13:49 +0200, Pablo Neira Ayuso пишет:
> On Wed, Jun 11, 2014 at 03:44:39PM +0400, Kirill Tkhai wrote:
> > Hi, Pablo,
> > 
> > В Пн, 28/04/2014 в 16:23 +0200, Pablo Neira Ayuso пишет:
> > > Hi,
> > > 
> > > On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> > > > Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> > > > So, unregister_netdevice catches the leak:
> > > > 
> > > > # modprobe dummy
> > > > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> > > > # rmmod dummy
> > > >
> > > >   Message from syslogd@localhost ...
> > > >     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
> > > >
> > > [...]
> > > >  1 file changed, 134 insertions(+), 12 deletions(-)
> > > 
> > > I have spinned several times on this patch, and I'm not very happy
> > > with taking this fix:
> > > 
> > > 1) It's quite large fix for a situation that seems unlikely to me.
> > 
> > We have several reports from containers users, who bumped into this.
> > The hang happens on netns stop, it's 100% reproducible. Every time
> > a container is stopping or a device is going away, the unregistration
> > fails and hungs if CLUSTERIP is used. So, we'd want to have some fix
> > of this.
> 
> How it this combination being triggered there? I mean:
> 
> # modprobe dummy
> # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
> # rmmod dummy
> 
> Is it something included in some scripts that automate the setup?

It's a sample of how to trigger this. The problem is not in rmmod.

Really it happens when container is stopping and device is going away.
It's not OpenVZ related, current LXC has the same problem.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 11:55       ` Kirill Tkhai
@ 2014-06-11 11:58         ` Pavel Emelyanov
  2014-06-11 12:03           ` Kirill Tkhai
  2014-06-11 12:00         ` Pablo Neira Ayuso
  1 sibling, 1 reply; 10+ messages in thread
From: Pavel Emelyanov @ 2014-06-11 11:58 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Pablo Neira Ayuso, netfilter-devel, Patrick McHardy,
	Jozsef Kadlecsik, tkhai

On 06/11/2014 03:55 PM, Kirill Tkhai wrote:

>> How it this combination being triggered there? I mean:
>>
>> # modprobe dummy
>> # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
>> # rmmod dummy
>>
>> Is it something included in some scripts that automate the setup?
> 
> It's a sample of how to trigger this. The problem is not in rmmod.
> 
> Really it happens when container is stopping and device is going away.
> It's not OpenVZ related, current LXC has the same problem.

Then it should be reproducible with the help of simple 'unshare --net'
plus 'ip link add ...' commands that would create netns and populate
it with devices. Right?

Thanks,
Pavel


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 11:55       ` Kirill Tkhai
  2014-06-11 11:58         ` Pavel Emelyanov
@ 2014-06-11 12:00         ` Pablo Neira Ayuso
  2014-06-11 12:10           ` Kirill Tkhai
  1 sibling, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-06-11 12:00 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai,
	Pavel Emelyanov

On Wed, Jun 11, 2014 at 03:55:18PM +0400, Kirill Tkhai wrote:
> В Ср, 11/06/2014 в 13:49 +0200, Pablo Neira Ayuso пишет:
> > On Wed, Jun 11, 2014 at 03:44:39PM +0400, Kirill Tkhai wrote:
> > > Hi, Pablo,
> > > 
> > > В Пн, 28/04/2014 в 16:23 +0200, Pablo Neira Ayuso пишет:
> > > > Hi,
> > > > 
> > > > On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> > > > > Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> > > > > So, unregister_netdevice catches the leak:
> > > > > 
> > > > > # modprobe dummy
> > > > > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> > > > > # rmmod dummy
> > > > >
> > > > >   Message from syslogd@localhost ...
> > > > >     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
> > > > >
> > > > [...]
> > > > >  1 file changed, 134 insertions(+), 12 deletions(-)
> > > > 
> > > > I have spinned several times on this patch, and I'm not very happy
> > > > with taking this fix:
> > > > 
> > > > 1) It's quite large fix for a situation that seems unlikely to me.
> > > 
> > > We have several reports from containers users, who bumped into this.
> > > The hang happens on netns stop, it's 100% reproducible. Every time
> > > a container is stopping or a device is going away, the unregistration
> > > fails and hungs if CLUSTERIP is used. So, we'd want to have some fix
> > > of this.
> > 
> > How it this combination being triggered there? I mean:
> > 
> > # modprobe dummy
> > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
> > # rmmod dummy
> > 
> > Is it something included in some scripts that automate the setup?
> 
> It's a sample of how to trigger this. The problem is not in rmmod.
> 
> Really it happens when container is stopping and device is going away.
> It's not OpenVZ related, current LXC has the same problem.

But that sample should be really easy to trigger if you're getting
lost of reports for this.

Are your users really hitting that problem by accident? It seems quite
rare condition to me. Please, clarify.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 11:58         ` Pavel Emelyanov
@ 2014-06-11 12:03           ` Kirill Tkhai
  2014-06-11 12:10             ` Pavel Emelyanov
  0 siblings, 1 reply; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-11 12:03 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Pablo Neira Ayuso, netfilter-devel, Patrick McHardy,
	Jozsef Kadlecsik, tkhai

В Ср, 11/06/2014 в 15:58 +0400, Pavel Emelyanov пишет:
> On 06/11/2014 03:55 PM, Kirill Tkhai wrote:
> 
> >> How it this combination being triggered there? I mean:
> >>
> >> # modprobe dummy
> >> # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
> >> # rmmod dummy
> >>
> >> Is it something included in some scripts that automate the setup?
> > 
> > It's a sample of how to trigger this. The problem is not in rmmod.
> > 
> > Really it happens when container is stopping and device is going away.
> > It's not OpenVZ related, current LXC has the same problem.
> 
> Then it should be reproducible with the help of simple 'unshare --net'
> plus 'ip link add ...' commands that would create netns and populate
> it with devices. Right?

Yes, exactly. It has to be triggered this way.

Thanks,
Kirill

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 12:03           ` Kirill Tkhai
@ 2014-06-11 12:10             ` Pavel Emelyanov
  0 siblings, 0 replies; 10+ messages in thread
From: Pavel Emelyanov @ 2014-06-11 12:10 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Pablo Neira Ayuso, netfilter-devel, Patrick McHardy,
	Jozsef Kadlecsik, tkhai

On 06/11/2014 04:03 PM, Kirill Tkhai wrote:
> В Ср, 11/06/2014 в 15:58 +0400, Pavel Emelyanov пишет:
>> On 06/11/2014 03:55 PM, Kirill Tkhai wrote:
>>
>>>> How it this combination being triggered there? I mean:
>>>>
>>>> # modprobe dummy
>>>> # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
>>>> # rmmod dummy
>>>>
>>>> Is it something included in some scripts that automate the setup?
>>>
>>> It's a sample of how to trigger this. The problem is not in rmmod.
>>>
>>> Really it happens when container is stopping and device is going away.
>>> It's not OpenVZ related, current LXC has the same problem.
>>
>> Then it should be reproducible with the help of simple 'unshare --net'
>> plus 'ip link add ...' commands that would create netns and populate
>> it with devices. Right?
> 
> Yes, exactly. It has to be triggered this way.

Can you find out how to do it exactly?

> Thanks,
> Kirill
> 
> .
> 


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipt_CLUSTERIP: Add network device notifier
  2014-06-11 12:00         ` Pablo Neira Ayuso
@ 2014-06-11 12:10           ` Kirill Tkhai
  0 siblings, 0 replies; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-11 12:10 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, Patrick McHardy, Jozsef Kadlecsik, tkhai,
	Pavel Emelyanov

В Ср, 11/06/2014 в 14:00 +0200, Pablo Neira Ayuso пишет:
> On Wed, Jun 11, 2014 at 03:55:18PM +0400, Kirill Tkhai wrote:
> > В Ср, 11/06/2014 в 13:49 +0200, Pablo Neira Ayuso пишет:
> > > On Wed, Jun 11, 2014 at 03:44:39PM +0400, Kirill Tkhai wrote:
> > > > Hi, Pablo,
> > > > 
> > > > В Пн, 28/04/2014 в 16:23 +0200, Pablo Neira Ayuso пишет:
> > > > > Hi,
> > > > > 
> > > > > On Mon, Apr 07, 2014 at 03:58:49PM +0400, Kirill Tkhai wrote:
> > > > > > Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
> > > > > > So, unregister_netdevice catches the leak:
> > > > > > 
> > > > > > # modprobe dummy
> > > > > > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
> > > > > > # rmmod dummy
> > > > > >
> > > > > >   Message from syslogd@localhost ...
> > > > > >     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1
> > > > > >
> > > > > [...]
> > > > > >  1 file changed, 134 insertions(+), 12 deletions(-)
> > > > > 
> > > > > I have spinned several times on this patch, and I'm not very happy
> > > > > with taking this fix:
> > > > > 
> > > > > 1) It's quite large fix for a situation that seems unlikely to me.
> > > > 
> > > > We have several reports from containers users, who bumped into this.
> > > > The hang happens on netns stop, it's 100% reproducible. Every time
> > > > a container is stopping or a device is going away, the unregistration
> > > > fails and hungs if CLUSTERIP is used. So, we'd want to have some fix
> > > > of this.
> > > 
> > > How it this combination being triggered there? I mean:
> > > 
> > > # modprobe dummy
> > > # iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP ...
> > > # rmmod dummy
> > > 
> > > Is it something included in some scripts that automate the setup?
> > 
> > It's a sample of how to trigger this. The problem is not in rmmod.
> > 
> > Really it happens when container is stopping and device is going away.
> > It's not OpenVZ related, current LXC has the same problem.
> 
> But that sample should be really easy to trigger if you're getting
> lost of reports for this.
> 
> Are your users really hitting that problem by accident? It seems quite
> rare condition to me. Please, clarify.

We had it so many times, that we had to add this ugly workaround
in our kernels in mid 2011:

commit 56ec6942c28cd6823f1481da8d0df829672f03d3
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date:   Mon Aug 8 12:11:16 2011 +0400

    kernel.spec v2.6.32-131.6.1.el6-042stab026.1-vz
---
 net/core/dev.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index eaa31c1..d29131d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5483,6 +5483,64 @@ out:
 EXPORT_SYMBOL(register_netdev);
 
 /*
+ * We do horrible things -- we left a netdevice
+ * in "leaked" state, which means we release as much
+ * resources as possible but the device will remain
+ * present in namespace because someone holds a reference.
+ *
+ * The idea is to be able to force stop VE.
+ */
+static void ve_netdev_leak(struct net_device *dev)
+{
+	struct napi_struct *p, *n;
+
+	dev->is_leaked = 1;
+	barrier();
+
+	/*
+	 * Make sure we're unable to tx/rx
+	 * network packets to outside.
+	 */
+	WARN_ON_ONCE(dev->flags & IFF_UP);
+	WARN_ON_ONCE(dev->qdisc != &noop_qdisc);
+
+	rtnl_lock();
+
+	/*
+	 * No address and napi after that.
+	 */
+	dev_addr_flush(dev);
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
+	/*
+	 * No release_net() here since the device remains
+	 * present in the namespace.
+	 */
+
+	__rtnl_unlock();
+
+	put_beancounter(netdev_bc(dev)->exec_ub);
+	put_beancounter(netdev_bc(dev)->owner_ub);
+
+	netdev_bc(dev)->exec_ub		= get_beancounter(get_ub0());
+	netdev_bc(dev)->owner_ub	= get_beancounter(get_ub0());
+
+	/*
+	 * Since we've already screwed the device and releasing
+	 * it in a normal way is not possible anymore, we're
+	 * to be sure the device will remain here forever.
+	 */
+	dev_hold(dev);
+
+	synchronize_net();
+
+	pr_emerg("Device (%s:%d:%u:%p) marked as leaked\n",
+		 dev->name, atomic_read(&dev->refcnt) - 1,
+		 VEID(dev->owner_env), dev);
+}
+
+/*
  * netdev_wait_allrefs - wait until all references are gone.
  *
  * This is called when unregistering network devices.
@@ -5493,9 +5551,10 @@ EXPORT_SYMBOL(register_netdev);
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static int netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int i = 0;
 
 	rebroadcast_time = warning_time = jiffies;
 	while (atomic_read(&dev->refcnt) != 0) {
@@ -5525,12 +5584,27 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
-			       "waiting for %s to become free. Usage "
-			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       "waiting for %s=%p to become free. Usage "
+			       "count = %d\n ve=%u",
+			       dev->name, dev, atomic_read(&dev->refcnt),
+			       VEID(get_exec_env()));
 			warning_time = jiffies;
 		}
+
+		/*
+		 * If device has lost the reference we might stuck
+		 * in this loop forever not having a chance the VE
+		 * to stop.
+		 */
+		if (++i > 200) { /* give 50 seconds to try */
+			if (!ve_is_super(dev->owner_env)) {
+				ve_netdev_leak(dev);
+				return -EBUSY;
+			}
+		}
 	}
+
+	return 0;
 }
 
 /* The sequence is:
@@ -5585,7 +5659,12 @@ void netdev_run_todo(void)
 
 		on_each_cpu(flush_backlog, dev, 1);
 
-		netdev_wait_allrefs(dev);
+		/*
+		 * Even if device get stuck here we are
+		 * to proceed the rest of the list.
+		 */
+		if (netdev_wait_allrefs(dev))
+			continue;
 
 		/* paranoia */
 		BUG_ON(atomic_read(&dev->refcnt));
@@ -5768,6 +5847,13 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+	if (dev->is_leaked) {
+		pr_emerg("%s: device %s=%p is leaked\n",
+			 __func__, dev->name, dev);
+		dump_stack();
+		return;
+	}
+
 	release_net(dev_net(dev));
 
 	kfree(dev->_tx);


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-06-11 12:10 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-04-07 11:58 [PATCH] ipt_CLUSTERIP: Add network device notifier Kirill Tkhai
2014-04-28 14:23 ` Pablo Neira Ayuso
2014-06-11 11:44   ` Kirill Tkhai
2014-06-11 11:49     ` Pablo Neira Ayuso
2014-06-11 11:55       ` Kirill Tkhai
2014-06-11 11:58         ` Pavel Emelyanov
2014-06-11 12:03           ` Kirill Tkhai
2014-06-11 12:10             ` Pavel Emelyanov
2014-06-11 12:00         ` Pablo Neira Ayuso
2014-06-11 12:10           ` Kirill Tkhai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).