All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kirill Tkhai <ktkhai@parallels.com>
To: netfilter-devel@vger.kernel.org
Cc: tkhai@yandex.ru, kaber@trash.net, pablo@netfilter.org,
	kadlec@blackhole.kfki.hu
Subject: [PATCH --smtp-server=mail.sw.ru] ipt_CLUSTERIP: Add network device notifier
Date: Mon, 07 Apr 2014 15:37:11 +0400	[thread overview]
Message-ID: <20140407113644.22540.42391.stgit@tkhai> (raw)

Clusterip target does dev_hold() in .checkentry, while dev_put() in .destroy.
So, unregister_netdevice catches the leak:

# modprobe dummy
# iptables -A INPUT -d 10.31.3.236 -j CLUSTERIP --new --hashmode sourceip -i dummy0 --clustermac 01:aa:7b:47:f7:d7 --total-nodes 2 --local-node 1
# rmmod dummy

   Message from syslogd@localhost ...
     kernel: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1

To fix that we add netdevice notifier, which masks target is inactive
(i.e., it sets target.dev to NULL) when device is going away,
and executes dev_put().

If the device comes back, the target is being masked as active and does
dev_hold().

Initially I found this on OpenVZ kernel 2.6.32, when I was investigating
a leak on container stopping.

The call:

unregister_netdevice()  --->  waiting for device refcnt is 1  ---> fail

was before:

ipt_unregister_table()  --->  remove_target.destroy() ---> dev_put();

It looks like, containers on vanila kernel have the same problem
with stopping.

***

The another solution for this is to implement generic deletion of
any rule from ip table, but it requires a lot of userspace iptable
parsing functionality to be moved into kernel. ipt_CLUSTERIP is
the only module which needs this, so it looks like the local fix
of this patch is enough.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Pablo Neira Ayuso <pablo@netfilter.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c |  146 +++++++++++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2510c02..f878946 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -46,7 +46,10 @@ struct clusterip_config {
 
 	__be32 clusterip;			/* the IP address */
 	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
-	struct net_device *dev;			/* device */
+	struct net_device *dev;			/* device. Access to dev->...
+						 * fields requires
+						 * &cn->lock is held */
+	char dev_name[IFNAMSIZ];
 	u_int16_t num_total_nodes;		/* total number of nodes */
 	unsigned long local_nodes;		/* node number array */
 
@@ -97,9 +100,8 @@ clusterip_config_put(struct clusterip_config *c)
  * entry(rule) is removed, remove the config from lists, but don't free it
  * yet, since proc-files could still be holding references */
 static inline void
-clusterip_config_entry_put(struct clusterip_config *c)
+clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 {
-	struct net *net = dev_net(c->dev);
 	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
 	local_bh_disable();
@@ -108,8 +110,10 @@ clusterip_config_entry_put(struct clusterip_config *c)
 		spin_unlock(&cn->lock);
 		local_bh_enable();
 
-		dev_mc_del(c->dev, c->clustermac);
-		dev_put(c->dev);
+		if (c->dev) {
+			dev_mc_del(c->dev, c->clustermac);
+			dev_put(c->dev);
+		}
 
 		/* In case anyone still accesses the file, the open/close
 		 * functions are also incrementing the refcount on their own,
@@ -176,6 +180,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 		return NULL;
 
 	c->dev = dev;
+	memcpy(c->dev_name, dev->name, IFNAMSIZ);
 	c->clusterip = ip;
 	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
 	c->num_total_nodes = i->num_total_nodes;
@@ -295,6 +300,91 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
 }
 
 /***********************************************************************
+ * NETDEVICE NOTIFIER
+ ***********************************************************************/
+static int cip_unregister_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (c->dev == dev) {
+			c->dev = NULL;
+			dev_mc_del(dev, c->clustermac);
+			dev_put(dev);
+		}
+
+	spin_unlock_bh(&cn->lock);
+
+	synchronize_net();
+
+	return NOTIFY_DONE;
+}
+
+static int cip_register_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (!c->dev && strcmp(c->dev_name, dev->name) == 0) {
+			dev_hold(dev);
+			dev_mc_add(dev, c->clustermac);
+			c->dev = dev;
+		}
+
+	spin_unlock_bh(&cn->lock);
+
+	synchronize_net();
+
+	return NOTIFY_DONE;
+}
+
+static int cip_rename_device(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct clusterip_config *c;
+
+	spin_lock_bh(&cn->lock);
+
+	list_for_each_entry_rcu(c, &cn->configs, list)
+		if (c->dev == dev)
+			memcpy(c->dev_name, dev->name, IFNAMSIZ);
+
+	spin_unlock_bh(&cn->lock);
+
+	return NOTIFY_DONE;
+}
+
+static int cip_dev_notify(struct notifier_block *this, unsigned long event,
+			  void *data)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(data);
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		return cip_unregister_device(dev);
+	case NETDEV_REGISTER:
+		return cip_register_device(dev);
+	case NETDEV_CHANGENAME:
+		return cip_rename_device(dev);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cip_dev_notifier = {
+	.notifier_call = cip_dev_notify,
+	.priority = 0
+};
+
+/***********************************************************************
  * IPTABLES TARGET
  ***********************************************************************/
 
@@ -365,6 +455,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
 	const struct ipt_entry *e = par->entryinfo;
 	struct clusterip_config *config;
+	struct net *net = par->net;
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	struct net_device *dev;
 	int ret;
 
 	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
@@ -382,15 +475,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 
 	/* FIXME: further sanity checks */
 
-	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
+	config = clusterip_config_find_get(net, e->ip.dst.s_addr, 1);
 	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			pr_info("no config found for %pI4, need 'new'\n",
 				&e->ip.dst.s_addr);
 			return -EINVAL;
 		} else {
-			struct net_device *dev;
-
 			if (e->ip.iniface[0] == '\0') {
 				pr_info("Please specify an interface name\n");
 				return -EINVAL;
@@ -411,6 +502,30 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 			}
 			dev_mc_add(config->dev, config->clustermac);
 		}
+	} else if (!config->dev) {
+		int fail = 0;
+		/* Device were removed, so we need insert it back */
+		dev = dev_get_by_name(net, e->ip.iniface);
+		if (!dev) {
+			clusterip_config_entry_put(net, config);
+			printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+			return false;
+		}
+		spin_lock_bh(&cn->lock);
+		/* Check again under lock */
+		if (config->dev == NULL) {
+			config->dev = dev;
+			memcpy(config->dev_name, dev->name, IFNAMSIZ);
+			dev_mc_add(config->dev, config->clustermac);
+		} else
+			fail = 1;
+		spin_unlock_bh(&cn->lock);
+
+		if (fail) {
+			dev_put(dev);
+			clusterip_config_entry_put(net, config);
+			return false;
+		}
 	}
 	cipinfo->config = config;
 
@@ -428,7 +543,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 
 	/* if no more entries are referencing the config, remove it
 	 * from the list and destroy the proc entry */
-	clusterip_config_entry_put(cipinfo->config);
+	clusterip_config_entry_put(par->net, cipinfo->config);
 
 	clusterip_config_put(cipinfo->config);
 
@@ -522,7 +637,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 	/* if there is no clusterip configuration for the arp reply's
 	 * source ip, we don't want to mangle it */
 	c = clusterip_config_find_get(net, payload->src_ip, 0);
-	if (!c)
+	if (!c || !c->dev)
 		return NF_ACCEPT;
 
 	/* normally the linux kernel always replies to arp queries of
@@ -532,7 +647,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 	if (c->dev != out) {
 		pr_debug("not mangling arp reply on different "
 			 "interface: cip'%s'-skb'%s'\n",
-			 c->dev->name, out->name);
+			 c->dev_name, out->name);
 		clusterip_config_put(c);
 		return NF_ACCEPT;
 	}
@@ -753,10 +868,14 @@ static int __init clusterip_tg_init(void)
 	if (ret < 0)
 		return ret;
 
-	ret = xt_register_target(&clusterip_tg_reg);
+	ret = register_netdevice_notifier(&cip_dev_notifier);
 	if (ret < 0)
 		goto cleanup_subsys;
 
+	ret = xt_register_target(&clusterip_tg_reg);
+	if (ret < 0)
+		goto cleanup_notifier;
+
 	ret = nf_register_hook(&cip_arp_ops);
 	if (ret < 0)
 		goto cleanup_target;
@@ -768,6 +887,8 @@ static int __init clusterip_tg_init(void)
 
 cleanup_target:
 	xt_unregister_target(&clusterip_tg_reg);
+cleanup_notifier:
+	unregister_netdevice_notifier(&cip_dev_notifier);
 cleanup_subsys:
 	unregister_pernet_subsys(&clusterip_net_ops);
 	return ret;
@@ -779,6 +900,7 @@ static void __exit clusterip_tg_exit(void)
 
 	nf_unregister_hook(&cip_arp_ops);
 	xt_unregister_target(&clusterip_tg_reg);
+	unregister_netdevice_notifier(&cip_dev_notifier);
 	unregister_pernet_subsys(&clusterip_net_ops);
 
 	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */


                 reply	other threads:[~2014-04-07 11:51 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140407113644.22540.42391.stgit@tkhai \
    --to=ktkhai@parallels.com \
    --cc=kaber@trash.net \
    --cc=kadlec@blackhole.kfki.hu \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pablo@netfilter.org \
    --cc=tkhai@yandex.ru \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.