From: Cyrill Gorcunov <gorcunov@gmail.com>
To: NETDEV <netdev@vger.kernel.org>
Cc: Solar Designer <solar@openwall.com>,
Vasily Averin <vvs@virtuozzo.com>,
Andrey Vagin <avagin@virtuozzo.com>,
Pavel Emelianov <xemul@virtuozzo.com>,
Vladimir Davydov <vdavydov@virtuozzo.com>,
Konstantin Khorenko <khorenko@virtuozzo.com>,
David Miller <davem@davemloft.net>,
Eric Dumazet <eric.dumazet@gmail.com>
Subject: [RFC] net: ipv4 -- Introduce ifa limit per net
Date: Sat, 5 Mar 2016 00:39:20 +0300 [thread overview]
Message-ID: <20160304213920.GJ4184@uranus.lan> (raw)
Currently all kernels (including vanilla) free the ifa
list with rtnl_lock() held, which takes a huge amount of time
to release all entries when we stop a container.
Moreover, it is allowed to create an unlimited number
of addresses from inside a net namespace if
CAP_NET_ADMIN is granted (which is common for containers).
Let's introduce a per-net limit (4096 by default)
on the number of addresses, which can be tuned via the sysctl
entry /proc/sys/net/ipv4/ifa_limit.
Reported-by: Solar Designer <solar@openwall.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
CC: Vasily Averin <vvs@virtuozzo.com>
CC: Andrey Vagin <avagin@virtuozzo.com>
CC: Pavel Emelianov <xemul@virtuozzo.com>
CC: Vladimir Davydov <vdavydov@virtuozzo.com>
CC: Konstantin Khorenko <khorenko@virtuozzo.com>
CC: David Miller <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
---
Please share your ideas if a more elegant way exists
to fix this problem; maybe I am missing something obvious. Thanks!
include/net/netns/ipv4.h | 3 +++
net/ipv4/devinet.c | 34 +++++++++++++++++++---------------
net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
3 files changed, 30 insertions(+), 15 deletions(-)
Index: linux-ml.git/include/net/netns/ipv4.h
===================================================================
--- linux-ml.git.orig/include/net/netns/ipv4.h
+++ linux-ml.git/include/net/netns/ipv4.h
@@ -77,6 +77,8 @@ struct netns_ipv4 {
struct local_ports ip_local_ports;
+ int sysctl_ifa_limit;
+
int sysctl_tcp_ecn;
int sysctl_tcp_ecn_fallback;
@@ -101,6 +103,7 @@ struct netns_ipv4 {
struct ping_group_range ping_group_range;
atomic_t dev_addr_genid;
+ atomic_t ifa_nr;
#ifdef CONFIG_SYSCTL
unsigned long *sysctl_local_reserved_ports;
Index: linux-ml.git/net/ipv4/devinet.c
===================================================================
--- linux-ml.git.orig/net/ipv4/devinet.c
+++ linux-ml.git/net/ipv4/devinet.c
@@ -194,8 +194,11 @@ static void devinet_sysctl_unregister(st
/* Locks all the inet devices. */
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct net *net)
{
+ if (atomic_add_return(1, &net->ipv4.ifa_nr) >
+ net->ipv4.sysctl_ifa_limit)
+ return NULL;
return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
}
@@ -207,8 +210,9 @@ static void inet_rcu_free_ifa(struct rcu
kfree(ifa);
}
-static void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct net *net, struct in_ifaddr *ifa)
{
+ atomic_dec(&net->ipv4.ifa_nr);
call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
}
@@ -296,7 +300,7 @@ static void inetdev_destroy(struct in_de
while ((ifa = in_dev->ifa_list) != NULL) {
inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(dev), ifa);
}
RCU_INIT_POINTER(dev->ip_ptr, NULL);
@@ -361,7 +365,7 @@ static void __inet_del_ifa(struct in_dev
rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_DOWN, ifa);
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(in_dev->dev), ifa);
} else {
promote = ifa;
break;
@@ -420,7 +424,7 @@ static void __inet_del_ifa(struct in_dev
}
if (destroy)
- inet_free_ifa(ifa1);
+ inet_free_ifa(dev_net(in_dev->dev), ifa1);
}
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
@@ -442,7 +446,7 @@ static int __inet_insert_ifa(struct in_i
ASSERT_RTNL();
if (!ifa->ifa_local) {
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(in_dev->dev), ifa);
return 0;
}
@@ -457,11 +461,11 @@ static int __inet_insert_ifa(struct in_i
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa)) {
if (ifa1->ifa_local == ifa->ifa_local) {
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(in_dev->dev), ifa);
return -EEXIST;
}
if (ifa1->ifa_scope != ifa->ifa_scope) {
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(in_dev->dev), ifa);
return -EINVAL;
}
ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -502,7 +506,7 @@ static int inet_set_ifa(struct net_devic
ASSERT_RTNL();
if (!in_dev) {
- inet_free_ifa(ifa);
+ inet_free_ifa(dev_net(dev), ifa);
return -ENOBUFS;
}
ipv4_devconf_setall(in_dev);
@@ -768,7 +772,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
if (!in_dev)
goto errout;
- ifa = inet_alloc_ifa();
+ ifa = inet_alloc_ifa(net);
if (!ifa)
/*
* A potential indev allocation can be left alive, it stays
@@ -817,7 +821,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
return ifa;
errout_free:
- inet_free_ifa(ifa);
+ inet_free_ifa(net, ifa);
errout:
return ERR_PTR(err);
}
@@ -865,13 +869,13 @@ static int inet_rtm_newaddr(struct sk_bu
true, ifa);
if (ret < 0) {
- inet_free_ifa(ifa);
+ inet_free_ifa(net, ifa);
return ret;
}
}
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
} else {
- inet_free_ifa(ifa);
+ inet_free_ifa(net, ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1055,7 +1059,7 @@ int devinet_ioctl(struct net *net, unsig
if (!ifa) {
ret = -ENOBUFS;
- ifa = inet_alloc_ifa();
+ ifa = inet_alloc_ifa(net);
if (!ifa)
break;
INIT_HLIST_NODE(&ifa->hash);
@@ -1408,7 +1412,7 @@ static int inetdev_event(struct notifier
if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa = inet_alloc_ifa();
+ struct in_ifaddr *ifa = inet_alloc_ifa(dev_net(dev));
if (ifa) {
INIT_HLIST_NODE(&ifa->hash);
Index: linux-ml.git/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-ml.git.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-ml.git/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,13 @@ static struct ctl_table ipv4_net_table[]
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "ifa_limit",
+ .data = &init_net.ipv4.sysctl_ifa_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -988,6 +995,7 @@ static __net_init int ipv4_sysctl_init_n
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
+ net->ipv4.sysctl_ifa_limit = 4096;
return 0;
err_ports:
next reply other threads:[~2016-03-04 21:39 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-03-04 21:39 Cyrill Gorcunov [this message]
2016-03-04 22:50 ` [RFC] net: ipv4 -- Introduce ifa limit per net David Miller
2016-03-05 0:08 ` Eric Dumazet
2016-03-05 4:11 ` David Miller
2016-03-05 7:18 ` Cyrill Gorcunov
2016-03-05 15:57 ` Cyrill Gorcunov
2016-03-05 16:33 ` David Miller
2016-03-05 17:00 ` Cyrill Gorcunov
2016-03-05 18:44 ` Cyrill Gorcunov
2016-03-06 10:09 ` Cyrill Gorcunov
2016-03-06 16:23 ` Eric Dumazet
2016-03-06 17:06 ` Cyrill Gorcunov
2016-03-09 16:39 ` Cyrill Gorcunov
2016-03-09 16:51 ` Cyrill Gorcunov
2016-03-09 16:58 ` Alexei Starovoitov
2016-03-09 17:09 ` Cyrill Gorcunov
2016-03-09 17:24 ` David Miller
2016-03-09 17:53 ` Cyrill Gorcunov
2016-03-09 19:55 ` Cyrill Gorcunov
2016-03-09 20:27 ` David Miller
2016-03-09 20:41 ` Cyrill Gorcunov
2016-03-09 20:47 ` David Miller
2016-03-09 20:57 ` Cyrill Gorcunov
2016-03-09 21:10 ` David Miller
2016-03-09 21:16 ` Cyrill Gorcunov
2016-03-10 10:20 ` Cyrill Gorcunov
2016-03-10 11:03 ` Cyrill Gorcunov
2016-03-10 15:09 ` Cyrill Gorcunov
2016-03-10 18:01 ` David Miller
2016-03-10 18:48 ` Cyrill Gorcunov
2016-03-10 19:02 ` Cong Wang
2016-03-10 19:55 ` David Miller
2016-03-10 20:01 ` Cyrill Gorcunov
2016-03-10 20:03 ` David Miller
2016-03-10 20:13 ` Cyrill Gorcunov
2016-03-10 20:19 ` Cyrill Gorcunov
2016-03-10 21:05 ` David Miller
2016-03-10 21:19 ` Cyrill Gorcunov
2016-03-10 21:59 ` Cyrill Gorcunov
2016-03-10 22:36 ` David Miller
2016-03-10 22:40 ` Cyrill Gorcunov
2016-03-11 20:40 ` David Miller
2016-03-11 20:58 ` Florian Westphal
2016-03-11 21:00 ` Cyrill Gorcunov
2016-03-11 21:22 ` Cyrill Gorcunov
2016-03-11 21:59 ` Cyrill Gorcunov
2016-03-14 3:29 ` David Miller
2016-03-10 21:09 ` Cong Wang
2016-03-09 17:19 ` David Miller
2016-03-05 6:58 ` Cyrill Gorcunov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160304213920.GJ4184@uranus.lan \
--to=gorcunov@gmail.com \
--cc=avagin@virtuozzo.com \
--cc=davem@davemloft.net \
--cc=eric.dumazet@gmail.com \
--cc=khorenko@virtuozzo.com \
--cc=netdev@vger.kernel.org \
--cc=solar@openwall.com \
--cc=vdavydov@virtuozzo.com \
--cc=vvs@virtuozzo.com \
--cc=xemul@virtuozzo.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).