* [PATCH 0/7] Batched netns improvements.
@ 2009-12-03 12:27 Eric W. Biederman
2009-12-03 12:29 ` [PATCH 1/7] net: Add support for batching network namespace cleanups Eric W. Biederman
` (8 more replies)
0 siblings, 9 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:27 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano
This series of patches takes my simple 4k network namespace exit test
from about 44s to 1.6s, with a minuscule increase in code size.
While batching has given me big improvements there are a few
changes in there that improve the performance of cleaning up
a single network namespace.
Eric
include/net/fib_rules.h | 3 +-
include/net/inet_timewait_sock.h | 6 +-
include/net/net_namespace.h | 2 +
include/net/netns/xfrm.h | 1 +
net/core/dev.c | 25 +++++++
net/core/fib_rules.c | 36 +++++++++-
net/core/net_namespace.c | 138 ++++++++++++++++----------------------
net/decnet/dn_rules.c | 22 +++---
net/ipv4/fib_rules.c | 12 +---
net/ipv4/inet_timewait_sock.c | 45 ++++++++-----
net/ipv4/tcp_ipv4.c | 11 ++-
net/ipv6/fib6_rules.c | 22 ++----
net/ipv6/tcp_ipv6.c | 11 ++-
net/xfrm/xfrm_user.c | 18 +++--
14 files changed, 200 insertions(+), 152 deletions(-)
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 1/7] net: Add support for batching network namespace cleanups
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 12:29 ` [PATCH 2/7] net: Move network device exit batching Eric W. Biederman
` (7 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
- Add exit_list to struct net to support building lists of network
namespaces to cleanup.
- Add exit_batch to pernet_operations to allow running operations only
once during a network namespace exit. Instead of once per network
namespace.
- Factor out ops_exit_list and ops_free_list so the logic for cleaning
up a network namespace does not need to be duplicated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
include/net/net_namespace.h | 2 +
net/core/net_namespace.c | 122 +++++++++++++++++++++---------------------
2 files changed, 63 insertions(+), 61 deletions(-)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 24a8c55..f307e13 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -43,6 +43,7 @@ struct net {
#endif
struct list_head list; /* list of network namespaces */
struct list_head cleanup_list; /* namespaces on death row */
+ struct list_head exit_list; /* Use only net_mutex */
struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
@@ -236,6 +237,7 @@ struct pernet_operations {
struct list_head list;
int (*init)(struct net *net);
void (*exit)(struct net *net);
+ void (*exit_batch)(struct list_head *net_exit_list);
int *id;
size_t size;
};
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9679ad2..6c7f6e0 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -70,6 +70,36 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->exit(net);
+ }
+ if (&ops->list == first_device) {
+ LIST_HEAD(dev_kill_list);
+ rtnl_lock();
+ list_for_each_entry(net, net_exit_list, exit_list)
+ unregister_netdevices(net, &dev_kill_list);
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+ }
+ if (ops->exit_batch)
+ ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->size && ops->id) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops_free(ops, net);
+ }
+}
+
/*
* setup_net runs the initializers for the network namespace object.
*/
@@ -78,6 +108,7 @@ static __net_init int setup_net(struct net *net)
/* Must be called with net_mutex held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
+ LIST_HEAD(net_exit_list);
atomic_set(&net->count, 1);
@@ -97,21 +128,14 @@ out_undo:
/* Walk through the list backwards calling the exit functions
* for the pernet modules whose init functions did not fail.
*/
+ list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
- if (ops->exit)
- ops->exit(net);
- if (&ops->list == first_device) {
- LIST_HEAD(dev_kill_list);
- rtnl_lock();
- unregister_netdevices(net, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
- }
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
ops = saved_ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_free(ops, net);
+ ops_free_list(ops, &net_exit_list);
rcu_barrier();
goto out;
@@ -207,6 +231,7 @@ static void cleanup_net(struct work_struct *work)
const struct pernet_operations *ops;
struct net *net, *tmp;
LIST_HEAD(net_kill_list);
+ LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
spin_lock_irq(&cleanup_list_lock);
@@ -217,8 +242,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
rtnl_lock();
- list_for_each_entry(net, &net_kill_list, cleanup_list)
+ list_for_each_entry(net, &net_kill_list, cleanup_list) {
list_del_rcu(&net->list);
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
rtnl_unlock();
/*
@@ -229,27 +256,12 @@ static void cleanup_net(struct work_struct *work)
synchronize_rcu();
/* Run all of the network namespace exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list) {
- if (ops->exit) {
- list_for_each_entry(net, &net_kill_list, cleanup_list)
- ops->exit(net);
- }
- if (&ops->list == first_device) {
- LIST_HEAD(dev_kill_list);
- rtnl_lock();
- list_for_each_entry(net, &net_kill_list, cleanup_list)
- unregister_netdevices(net, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
- }
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
/* Free the net generic variables */
- list_for_each_entry_reverse(ops, &pernet_list, list) {
- if (ops->size && ops->id) {
- list_for_each_entry(net, &net_kill_list, cleanup_list)
- ops_free(ops, net);
- }
- }
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
mutex_unlock(&net_mutex);
@@ -259,8 +271,8 @@ static void cleanup_net(struct work_struct *work)
rcu_barrier();
/* Finally it is safe to free my network namespace structure */
- list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) {
- list_del_init(&net->cleanup_list);
+ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+ list_del_init(&net->exit_list);
net_free(net);
}
}
@@ -348,8 +360,9 @@ pure_initcall(net_ns_init);
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
- struct net *net, *undo_net;
+ struct net *net;
int error;
+ LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
if (ops->init || (ops->id && ops->size)) {
@@ -357,6 +370,7 @@ static int __register_pernet_operations(struct list_head *list,
error = ops_init(ops, net);
if (error)
goto out_undo;
+ list_add_tail(&net->exit_list, &net_exit_list);
}
}
return 0;
@@ -364,36 +378,21 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- if (ops->exit) {
- for_each_net(undo_net) {
- if (net_eq(undo_net, net))
- goto undone;
- ops->exit(undo_net);
- }
- }
-undone:
- if (ops->size && ops->id) {
- for_each_net(undo_net) {
- if (net_eq(undo_net, net))
- goto freed;
- ops_free(ops, undo_net);
- }
- }
-freed:
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
struct net *net;
+ LIST_HEAD(net_exit_list);
list_del(&ops->list);
- if (ops->exit)
- for_each_net(net)
- ops->exit(net);
- if (ops->id && ops->size)
- for_each_net(net)
- ops_free(ops, net);
+ for_each_net(net)
+ list_add_tail(&net->exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
}
#else
@@ -411,9 +410,10 @@ static int __register_pernet_operations(struct list_head *list,
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- if (ops->exit)
- ops->exit(&init_net);
- ops_free(ops, &init_net);
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
}
#endif /* CONFIG_NET_NS */
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 2/7] net: Move network device exit batching
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
2009-12-03 12:29 ` [PATCH 1/7] net: Add support for batching network namespace cleanups Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 12:29 ` [PATCH 3/7] net: Allow xfrm_user_net_exit to batch efficiently Eric W. Biederman
` (6 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
Move network device exit batching from a special case in
net_namespace.c to using common mechanisms in dev.c
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
net/core/dev.c | 25 +++++++++++++++++++++++++
net/core/net_namespace.c | 24 ------------------------
2 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index e3e18de..0913a08 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5766,8 +5766,33 @@ static void __net_exit default_device_exit(struct net *net)
rtnl_unlock();
}
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+ /* At exit all network devices most be removed from a network
+ * namespace. Do this in the reverse order of registeration.
+ * Do this across as many network namespaces as possible to
+ * improve batching efficiency.
+ */
+ struct net_device *dev;
+ struct net *net;
+ LIST_HEAD(dev_kill_list);
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ for_each_netdev_reverse(net, dev) {
+ if (dev->rtnl_link_ops)
+ dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+ else
+ unregister_netdevice_queue(dev, &dev_kill_list);
+ }
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+}
+
static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
+ .exit_batch = default_device_exit_batch,
};
/*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6c7f6e0..4026a4c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,10 +8,8 @@
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
-#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <net/rtnetlink.h>
/*
* Our network namespace constructor/destructor lists
@@ -29,20 +27,6 @@ EXPORT_SYMBOL(init_net);
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
-static void unregister_netdevices(struct net *net, struct list_head *list)
-{
- struct net_device *dev;
- /* At exit all network devices most be removed from a network
- * namespace. Do this in the reverse order of registeration.
- */
- for_each_netdev_reverse(net, dev) {
- if (dev->rtnl_link_ops)
- dev->rtnl_link_ops->dellink(dev, list);
- else
- unregister_netdevice_queue(dev, list);
- }
-}
-
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
int err;
@@ -78,14 +62,6 @@ static void ops_exit_list(const struct pernet_operations *ops,
list_for_each_entry(net, net_exit_list, exit_list)
ops->exit(net);
}
- if (&ops->list == first_device) {
- LIST_HEAD(dev_kill_list);
- rtnl_lock();
- list_for_each_entry(net, net_exit_list, exit_list)
- unregister_netdevices(net, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 3/7] net: Allow xfrm_user_net_exit to batch efficiently.
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
2009-12-03 12:29 ` [PATCH 1/7] net: Add support for batching network namespace cleanups Eric W. Biederman
2009-12-03 12:29 ` [PATCH 2/7] net: Move network device exit batching Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 12:29 ` [PATCH 4/7] netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys} Eric W. Biederman
` (5 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
xfrm.nlsk is provided by the xfrm_user module and is access via rcu from
other parts of the xfrm code. Add xfrm.nlsk_stash a copy of xfrm.nlsk that
will never be set to NULL. This allows the synchronize_net and
netlink_kernel_release to be deferred until a whole batch of xfrm.nlsk sockets
have been set to NULL.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
include/net/netns/xfrm.h | 1 +
net/xfrm/xfrm_user.c | 18 ++++++++++--------
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 1ba9127..56f8e55 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -43,6 +43,7 @@ struct netns_xfrm {
struct work_struct policy_hash_work;
struct sock *nlsk;
+ struct sock *nlsk_stash;
u32 sysctl_aevent_etime;
u32 sysctl_aevent_rseqth;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index fb42d77..1ada618 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2721,22 +2721,24 @@ static int __net_init xfrm_user_net_init(struct net *net)
xfrm_netlink_rcv, NULL, THIS_MODULE);
if (nlsk == NULL)
return -ENOMEM;
+ net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
rcu_assign_pointer(net->xfrm.nlsk, nlsk);
return 0;
}
-static void __net_exit xfrm_user_net_exit(struct net *net)
+static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
{
- struct sock *nlsk = net->xfrm.nlsk;
-
- rcu_assign_pointer(net->xfrm.nlsk, NULL);
- synchronize_rcu();
- netlink_kernel_release(nlsk);
+ struct net *net;
+ list_for_each_entry(net, net_exit_list, exit_list)
+ rcu_assign_pointer(net->xfrm.nlsk, NULL);
+ synchronize_net();
+ list_for_each_entry(net, net_exit_list, exit_list)
+ netlink_kernel_release(net->xfrm.nlsk_stash);
}
static struct pernet_operations xfrm_user_net_ops = {
- .init = xfrm_user_net_init,
- .exit = xfrm_user_net_exit,
+ .init = xfrm_user_net_init,
+ .exit_batch = xfrm_user_net_exit,
};
static int __init xfrm_user_init(void)
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 4/7] netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys}
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (2 preceding siblings ...)
2009-12-03 12:29 ` [PATCH 3/7] net: Allow xfrm_user_net_exit to batch efficiently Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 12:29 ` [PATCH 5/7] net: Allow fib_rule_unregister to batch Eric W. Biederman
` (4 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
This allows namespace exit methods to batch work that requires an
rcu barrier using call_rcu without having to treat the
unregister_pernet_operations cases specially.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
net/core/net_namespace.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 4026a4c..bd8c471 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -413,8 +413,11 @@ again:
}
}
error = __register_pernet_operations(list, ops);
- if (error && ops->id)
- ida_remove(&net_generic_ids, *ops->id);
+ if (error) {
+ rcu_barrier();
+ if (ops->id)
+ ida_remove(&net_generic_ids, *ops->id);
+ }
return error;
}
@@ -423,6 +426,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
{
__unregister_pernet_operations(ops);
+ rcu_barrier();
if (ops->id)
ida_remove(&net_generic_ids, *ops->id);
}
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 5/7] net: Allow fib_rule_unregister to batch
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (3 preceding siblings ...)
2009-12-03 12:29 ` [PATCH 4/7] netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys} Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 12:29 ` [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge Eric W. Biederman
` (3 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
Refactor the code so fib_rules_register always takes a template instead
of the actual fib_rules_ops structure that will be used. This is
required for network namespace support so 2 out of the 3 callers already
do this, it allows the error handling to be made common, and it allows
fib_rules_unregister to free the template for the caller.
Modify fib_rules_unregister to use call_rcu instead of synchronize_rcu
to allow multiple namespaces to be cleaned up in the same rcu grace
period.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
include/net/fib_rules.h | 3 ++-
net/core/fib_rules.c | 36 +++++++++++++++++++++++++++++++++---
net/decnet/dn_rules.c | 22 ++++++++++++----------
net/ipv4/fib_rules.c | 12 +++---------
net/ipv6/fib6_rules.c | 22 +++++++---------------
5 files changed, 57 insertions(+), 38 deletions(-)
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 2cd707b..c27dfe7 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -64,6 +64,7 @@ struct fib_rules_ops {
struct list_head rules_list;
struct module *owner;
struct net *fro_net;
+ struct rcu_head rcu;
};
#define FRA_GENERIC_POLICY \
@@ -99,7 +100,7 @@ static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
return frh->table;
}
-extern int fib_rules_register(struct fib_rules_ops *);
+extern struct fib_rules_ops *fib_rules_register(struct fib_rules_ops *, struct net *);
extern void fib_rules_unregister(struct fib_rules_ops *);
extern void fib_rules_cleanup_ops(struct fib_rules_ops *);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bd30938..7e8e18f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -72,7 +72,7 @@ static void flush_route_cache(struct fib_rules_ops *ops)
ops->flush_cache(ops);
}
-int fib_rules_register(struct fib_rules_ops *ops)
+static int __fib_rules_register(struct fib_rules_ops *ops)
{
int err = -EEXIST;
struct fib_rules_ops *o;
@@ -102,6 +102,28 @@ errout:
return err;
}
+struct fib_rules_ops *
+fib_rules_register(struct fib_rules_ops *tmpl, struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ err = __fib_rules_register(ops);
+ if (err) {
+ kfree(ops);
+ ops = ERR_PTR(err);
+ }
+
+ return ops;
+}
+
EXPORT_SYMBOL_GPL(fib_rules_register);
void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
@@ -115,6 +137,15 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
+static void fib_rules_put_rcu(struct rcu_head *head)
+{
+ struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
+ struct net *net = ops->fro_net;
+
+ release_net(net);
+ kfree(ops);
+}
+
void fib_rules_unregister(struct fib_rules_ops *ops)
{
struct net *net = ops->fro_net;
@@ -124,8 +155,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
fib_rules_cleanup_ops(ops);
spin_unlock(&net->rules_mod_lock);
- synchronize_rcu();
- release_net(net);
+ call_rcu(&ops->rcu, fib_rules_put_rcu);
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 72495f2..7466c54 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -33,7 +33,7 @@
#include <net/dn_dev.h>
#include <net/dn_route.h>
-static struct fib_rules_ops dn_fib_rules_ops;
+static struct fib_rules_ops *dn_fib_rules_ops;
struct dn_fib_rule
{
@@ -56,7 +56,7 @@ int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
};
int err;
- err = fib_rules_lookup(&dn_fib_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(dn_fib_rules_ops, flp, 0, &arg);
res->r = arg.rule;
return err;
@@ -217,9 +217,9 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
struct list_head *pos;
struct fib_rule *rule;
- if (!list_empty(&dn_fib_rules_ops.rules_list)) {
- pos = dn_fib_rules_ops.rules_list.next;
- if (pos->next != &dn_fib_rules_ops.rules_list) {
+ if (!list_empty(&dn_fib_rules_ops->rules_list)) {
+ pos = dn_fib_rules_ops->rules_list.next;
+ if (pos->next != &dn_fib_rules_ops->rules_list) {
rule = list_entry(pos->next, struct fib_rule, list);
if (rule->pref)
return rule->pref - 1;
@@ -234,7 +234,7 @@ static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
dn_rt_cache_flush(-1);
}
-static struct fib_rules_ops dn_fib_rules_ops = {
+static struct fib_rules_ops dn_fib_rules_ops_template = {
.family = AF_DECnet,
.rule_size = sizeof(struct dn_fib_rule),
.addr_size = sizeof(u16),
@@ -247,21 +247,23 @@ static struct fib_rules_ops dn_fib_rules_ops = {
.flush_cache = dn_fib_rule_flush_cache,
.nlgroup = RTNLGRP_DECnet_RULE,
.policy = dn_fib_rule_policy,
- .rules_list = LIST_HEAD_INIT(dn_fib_rules_ops.rules_list),
.owner = THIS_MODULE,
.fro_net = &init_net,
};
void __init dn_fib_rules_init(void)
{
- BUG_ON(fib_default_rule_add(&dn_fib_rules_ops, 0x7fff,
+ dn_fib_rules_ops =
+ fib_rules_register(&dn_fib_rules_ops_template, &init_net);
+ BUG_ON(IS_ERR(dn_fib_rules_ops));
+ BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff,
RT_TABLE_MAIN, 0));
- fib_rules_register(&dn_fib_rules_ops);
}
void __exit dn_fib_rules_cleanup(void)
{
- fib_rules_unregister(&dn_fib_rules_ops);
+ fib_rules_unregister(dn_fib_rules_ops);
+ rcu_barrier();
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 835262c..9a6d30b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net)
int err;
struct fib_rules_ops *ops;
- ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
- if (ops == NULL)
- return -ENOMEM;
- INIT_LIST_HEAD(&ops->rules_list);
- ops->fro_net = net;
-
- fib_rules_register(ops);
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
err = fib_default_rules_init(ops);
if (err < 0)
@@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net)
fail:
/* also cleans all rules already added */
fib_rules_unregister(ops);
- kfree(ops);
return err;
}
void __net_exit fib4_rules_exit(struct net *net)
{
fib_rules_unregister(net->ipv4.rules_ops);
- kfree(net->ipv4.rules_ops);
}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 00a7a5e..1eb1c07 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -264,16 +264,14 @@ static struct fib_rules_ops fib6_rules_ops_template = {
static int fib6_rules_net_init(struct net *net)
{
+ struct fib_rules_ops *ops;
int err = -ENOMEM;
- net->ipv6.fib6_rules_ops = kmemdup(&fib6_rules_ops_template,
- sizeof(*net->ipv6.fib6_rules_ops),
- GFP_KERNEL);
- if (!net->ipv6.fib6_rules_ops)
- goto out;
+ ops = fib_rules_register(&fib6_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv6.fib6_rules_ops = ops;
- net->ipv6.fib6_rules_ops->fro_net = net;
- INIT_LIST_HEAD(&net->ipv6.fib6_rules_ops->rules_list);
err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0,
RT6_TABLE_LOCAL, FIB_RULE_PERMANENT);
@@ -283,25 +281,19 @@ static int fib6_rules_net_init(struct net *net)
err = fib_default_rule_add(net->ipv6.fib6_rules_ops,
0x7FFE, RT6_TABLE_MAIN, 0);
if (err)
- goto out_fib6_default_rule_add;
+ goto out_fib6_rules_ops;
- err = fib_rules_register(net->ipv6.fib6_rules_ops);
- if (err)
- goto out_fib6_default_rule_add;
out:
return err;
-out_fib6_default_rule_add:
- fib_rules_cleanup_ops(net->ipv6.fib6_rules_ops);
out_fib6_rules_ops:
- kfree(net->ipv6.fib6_rules_ops);
+ fib_rules_unregister(ops);
goto out;
}
static void fib6_rules_net_exit(struct net *net)
{
fib_rules_unregister(net->ipv6.fib6_rules_ops);
- kfree(net->ipv6.fib6_rules_ops);
}
static struct pernet_operations fib6_rules_net_ops = {
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge.
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (4 preceding siblings ...)
2009-12-03 12:29 ` [PATCH 5/7] net: Allow fib_rule_unregister to batch Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 13:17 ` Eric Dumazet
2009-12-03 12:29 ` [PATCH 7/7] net: Batch inet_twsk_purge Eric W. Biederman
` (2 subsequent siblings)
8 siblings, 1 reply; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
While we are looking up entries to free there is no reason to take
the lock in inet_twsk_purge. We have to drop locks and restart
occasionally anyway, so adding a few more in case we get on the
wrong list because of a timewait move is no big deal. At the
same time not taking the lock for long periods of time is much
more polite to the rest of the users of the hash table.
In my test configuration of killing 4k network namespaces
this change causes 4k back to back runs of inet_twsk_purge on an
empty hash table to go from roughly 20.7s to 3.3s, and the total
time to destroy 4k network namespaces goes from roughly 44s to
3.3s.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
net/ipv4/inet_timewait_sock.c | 39 ++++++++++++++++++++++++---------------
1 files changed, 24 insertions(+), 15 deletions(-)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1f5d508..683ecec 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -427,31 +427,40 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
struct inet_timewait_sock *tw;
struct sock *sk;
struct hlist_nulls_node *node;
- int h;
+ unsigned int slot;
- local_bh_disable();
- for (h = 0; h <= hashinfo->ehash_mask; h++) {
- struct inet_ehash_bucket *head =
- inet_ehash_bucket(hashinfo, h);
- spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ rcu_read_lock();
restart:
- spin_lock(lock);
- sk_nulls_for_each(sk, node, &head->twchain) {
-
+ sk_nulls_for_each_rcu(sk, node, &head->twchain) {
tw = inet_twsk(sk);
if (!net_eq(twsk_net(tw), net) ||
tw->tw_family != family)
continue;
- atomic_inc(&tw->tw_refcnt);
- spin_unlock(lock);
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+ continue;
+
+ if (unlikely(!net_eq(twsk_net(tw), net) ||
+ tw->tw_family != family)) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
inet_twsk_deschedule(tw, twdr);
inet_twsk_put(tw);
-
- goto restart;
+ goto restart_rcu;
}
- spin_unlock(lock);
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
}
- local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 7/7] net: Batch inet_twsk_purge
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (5 preceding siblings ...)
2009-12-03 12:29 ` [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge Eric W. Biederman
@ 2009-12-03 12:29 ` Eric W. Biederman
2009-12-03 13:23 ` Eric Dumazet
2009-12-03 13:06 ` [PATCH 0/7] Batched netns improvements jamal
2009-12-03 20:24 ` David Miller
8 siblings, 1 reply; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 12:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jamal, Daniel Lezcano, Eric W. Biederman
From: Eric W. Biederman <ebiederm@xmission.com>
This function walks the whole hashtable so there is no point in
passing it a network namespace. Instead I purge all timewait
sockets from dead network namespaces that I find. If the namespace
is one of the ones I am trying to purge, I am guaranteed no new timewait
sockets can be formed, so this will get them all. If the namespace
is one I am not acting for, it might form a few more, but I will
call inet_twsk_purge again shortly to get rid of them. In
any event, if the network namespace is dead, timewait sockets are
useless.
Move the calls of inet_twsk_purge into batch_exit routines so
that if I am killing a bunch of namespaces at once I will just
call inet_twsk_purge once and save a lot of redundant unnecessary
work.
In my simple 4k network namespace exit test, the cleanup time dropped from
roughly 8.2s to 1.6s, while the time spent running inet_twsk_purge fell
to about 2ms: 1ms for ipv4 and 1ms for ipv6.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
include/net/inet_timewait_sock.h | 6 +++---
net/ipv4/inet_timewait_sock.c | 10 +++++-----
net/ipv4/tcp_ipv4.c | 11 ++++++++---
net/ipv6/tcp_ipv6.c | 11 ++++++++---
4 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 773b10f..4fd007f 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -212,14 +212,14 @@ extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr);
-extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+extern void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family);
static inline
struct net *twsk_net(const struct inet_timewait_sock *twsk)
{
#ifdef CONFIG_NET_NS
- return twsk->tw_net;
+ return rcu_dereference(twsk->tw_net);
#else
return &init_net;
#endif
@@ -229,7 +229,7 @@ static inline
void twsk_net_set(struct inet_timewait_sock *twsk, struct net *net)
{
#ifdef CONFIG_NET_NS
- twsk->tw_net = net;
+ rcu_assign_pointer(twsk->tw_net, net);
#endif
}
#endif /* _INET_TIMEWAIT_SOCK_ */
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 683ecec..a3699ac 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -421,7 +421,7 @@ out:
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
-void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
{
struct inet_timewait_sock *tw;
@@ -436,15 +436,15 @@ restart_rcu:
restart:
sk_nulls_for_each_rcu(sk, node, &head->twchain) {
tw = inet_twsk(sk);
- if (!net_eq(twsk_net(tw), net) ||
- tw->tw_family != family)
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
continue;
if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
continue;
- if (unlikely(!net_eq(twsk_net(tw), net) ||
- tw->tw_family != family)) {
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
inet_twsk_put(tw);
goto restart;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index df18ce0..e30f026 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2468,12 +2468,17 @@ static int __net_init tcp_sk_init(struct net *net)
static void __net_exit tcp_sk_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv4.tcp_sock);
- inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
- .init = tcp_sk_init,
- .exit = tcp_sk_exit,
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index de70909..5f46d36 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2126,12 +2126,17 @@ static int tcpv6_net_init(struct net *net)
static void tcpv6_net_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv6.tcp_sk);
- inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6);
+}
+
+static void tcpv6_net_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
}
static struct pernet_operations tcpv6_net_ops = {
- .init = tcpv6_net_init,
- .exit = tcpv6_net_exit,
+ .init = tcpv6_net_init,
+ .exit = tcpv6_net_exit,
+ .exit_batch = tcpv6_net_exit_batch,
};
int __init tcpv6_init(void)
--
1.6.5.2.143.g8cc62
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 0/7] Batched netns improvements.
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (6 preceding siblings ...)
2009-12-03 12:29 ` [PATCH 7/7] net: Batch inet_twsk_purge Eric W. Biederman
@ 2009-12-03 13:06 ` jamal
2009-12-03 13:23 ` Eric W. Biederman
2009-12-03 20:24 ` David Miller
8 siblings, 1 reply; 16+ messages in thread
From: jamal @ 2009-12-03 13:06 UTC (permalink / raw)
To: Eric W. Biederman; +Cc: David Miller, netdev, Daniel Lezcano
On Thu, 2009-12-03 at 04:27 -0800, Eric W. Biederman wrote:
> This series of patches takes my simple 4k network namespace exit test
> from about 44s to 1.6s, with a minuscule increase in code size.
Nice.
I will test these as soon as they make it into net-next (just easier
for me given the setup).
I should confirm that:
The previous patches which are already in net-next from yesterday do
improve performance to under 1 minute. Can we call that a several-factor
improvement?;->
Are you planning to give some tender loving to the virtual devices
as well (gre etc) or is that taken care of now you have a pernet batch
exit op?
cheers,
jamal
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge.
2009-12-03 12:29 ` [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge Eric W. Biederman
@ 2009-12-03 13:17 ` Eric Dumazet
0 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2009-12-03 13:17 UTC (permalink / raw)
To: Eric W. Biederman; +Cc: David Miller, netdev, jamal, Daniel Lezcano
Eric W. Biederman a écrit :
> From: Eric W. Biederman <ebiederm@xmission.com>
>
> While we are looking up entries to free there is no reason to take
> the lock in inet_twsk_purge. We have to drop locks and restart
> occasionally anyway so adding a few more in case we get on the
> wrong list because of a timewait move is no big deal. At the
> same time not taking the lock for long periods of time is much
> more polite to the rest of the users of the hash table.
>
> In my test configuration of killing 4k network namespaces
> this change causes 4k back to back runs of inet_twsk_purge on an
> empty hash table to go from roughly 20.7s to 3.3s, and the total
> time to destroy 4k network namespaces goes from roughly 44s to
> 3.3s.
>
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Very nice patch Eric
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> net/ipv4/inet_timewait_sock.c | 39 ++++++++++++++++++++++++---------------
> 1 files changed, 24 insertions(+), 15 deletions(-)
>
> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
> index 1f5d508..683ecec 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -427,31 +427,40 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
> struct inet_timewait_sock *tw;
> struct sock *sk;
> struct hlist_nulls_node *node;
> - int h;
> + unsigned int slot;
>
> - local_bh_disable();
> - for (h = 0; h <= hashinfo->ehash_mask; h++) {
> - struct inet_ehash_bucket *head =
> - inet_ehash_bucket(hashinfo, h);
> - spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
> + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
> + struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
> +restart_rcu:
> + rcu_read_lock();
> restart:
> - spin_lock(lock);
> - sk_nulls_for_each(sk, node, &head->twchain) {
> -
> + sk_nulls_for_each_rcu(sk, node, &head->twchain) {
> tw = inet_twsk(sk);
> if (!net_eq(twsk_net(tw), net) ||
> tw->tw_family != family)
> continue;
>
> - atomic_inc(&tw->tw_refcnt);
> - spin_unlock(lock);
> + if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
> + continue;
> +
> + if (unlikely(!net_eq(twsk_net(tw), net) ||
> + tw->tw_family != family)) {
> + inet_twsk_put(tw);
> + goto restart;
> + }
> +
> + rcu_read_unlock();
> inet_twsk_deschedule(tw, twdr);
> inet_twsk_put(tw);
> -
> - goto restart;
> + goto restart_rcu;
> }
> - spin_unlock(lock);
> + /* If the nulls value we got at the end of this lookup is
> + * not the expected one, we must restart lookup.
> + * We probably met an item that was moved to another chain.
> + */
> + if (get_nulls_value(node) != slot)
> + goto restart;
> + rcu_read_unlock();
> }
> - local_bh_enable();
> }
> EXPORT_SYMBOL_GPL(inet_twsk_purge);
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/7] Batched netns improvements.
2009-12-03 13:06 ` [PATCH 0/7] Batched netns improvements jamal
@ 2009-12-03 13:23 ` Eric W. Biederman
0 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 13:23 UTC (permalink / raw)
To: hadi; +Cc: David Miller, netdev, Daniel Lezcano
jamal <hadi@cyberus.ca> writes:
> On Thu, 2009-12-03 at 04:27 -0800, Eric W. Biederman wrote:
>> This series of patches takes my simple 4k network namespace exit test
>> from about 44s to 1.6s, with a minuscule increase in code side.
>
> Nice.
> I will test these as soon as they make it into net-next (just easier
> for me given the setup).
> I should confirm that:
> The previous patches which are already in net-next from yesterday do
> improve performance to under 1 minute. Can we call that a several-factor
> improvement?;->
>
> Are you planning to give some tender loving to the virtual devices
> as well (gre etc) or is that taken care of now you have a pernet batch
> exit op?
The tunnel devices gre, ipip, sit, ip6_tunnel, and the ethernet
bridging still need some tlc to give them rtnl_link_ops, and in
particular dellink operations so the deletions can be done in one big
batch.
The batch operations are mostly a generalization of a bunch of what
was already there.
I keep planning on doing something else and then I wind up working on
this.
Mostly what I have done turned out to be some very low hanging fruit,
and some general cleanups.
Eric
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 7/7] net: Batch inet_twsk_purge
2009-12-03 12:29 ` [PATCH 7/7] net: Batch inet_twsk_purge Eric W. Biederman
@ 2009-12-03 13:23 ` Eric Dumazet
2009-12-03 13:36 ` Eric W. Biederman
0 siblings, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2009-12-03 13:23 UTC (permalink / raw)
To: Eric W. Biederman; +Cc: David Miller, netdev, jamal, Daniel Lezcano
Eric W. Biederman a écrit :
> From: Eric W. Biederman <ebiederm@xmission.com>
>
> This function walks the whole hashtable so there is no point in
> passing it a network namespace. Instead I purge all timewait
> sockets from dead network namespaces that I find. If the namespace
> is one of the ones I am trying to purge I am guaranteed no new timewait
> sockets can be formed so this will get them all. If the namespace
> is one I am not acting for it might form a few more but I will
> call inet_twsk_purge again shortly to get rid of them. In
> any event, if the network namespace is dead timewait sockets are
> useless.
>
> Move the calls of inet_twsk_purge into batch_exit routines so
> that if I am killing a bunch of namespaces at once I will just
> call inet_twsk_purge once and save a lot of redundant unnecessary
> work.
>
> My simple 4k network namespace exit test the cleanup time dropped from
> roughly 8.2s to 1.6s. While the time spent running inet_twsk_purge fell
> to about 2ms. 1ms for ipv4 and 1ms for ipv6.
>
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
> ---
> include/net/inet_timewait_sock.h | 6 +++---
> net/ipv4/inet_timewait_sock.c | 10 +++++-----
> net/ipv4/tcp_ipv4.c | 11 ++++++++---
> net/ipv6/tcp_ipv6.c | 11 ++++++++---
> 4 files changed, 24 insertions(+), 14 deletions(-)
>
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index 773b10f..4fd007f 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -212,14 +212,14 @@ extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
> extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
> struct inet_timewait_death_row *twdr);
>
> -extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
> +extern void inet_twsk_purge(struct inet_hashinfo *hashinfo,
> struct inet_timewait_death_row *twdr, int family);
>
> static inline
> struct net *twsk_net(const struct inet_timewait_sock *twsk)
> {
> #ifdef CONFIG_NET_NS
> - return twsk->tw_net;
> + return rcu_dereference(twsk->tw_net);
> #else
> return &init_net;
> #endif
> @@ -229,7 +229,7 @@ static inline
> void twsk_net_set(struct inet_timewait_sock *twsk, struct net *net)
> {
> #ifdef CONFIG_NET_NS
> - twsk->tw_net = net;
> + rcu_assign_pointer(twsk->tw_net, net);
> #endif
> }
> #endif /* _INET_TIMEWAIT_SOCK_ */
> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
> index 683ecec..a3699ac 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -421,7 +421,7 @@ out:
>
> EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
>
> -void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
> +void inet_twsk_purge(struct inet_hashinfo *hashinfo,
> struct inet_timewait_death_row *twdr, int family)
> {
> struct inet_timewait_sock *tw;
> @@ -436,15 +436,15 @@ restart_rcu:
> restart:
> sk_nulls_for_each_rcu(sk, node, &head->twchain) {
> tw = inet_twsk(sk);
> - if (!net_eq(twsk_net(tw), net) ||
> - tw->tw_family != family)
> + if ((tw->tw_family != family) ||
> + atomic_read(&twsk_net(tw)->count))
> continue;
>
> if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
> continue;
>
> - if (unlikely(!net_eq(twsk_net(tw), net) ||
> - tw->tw_family != family)) {
> + if (unlikely((tw->tw_family != family) ||
> + atomic_read(&twsk_net(tw)->count))) {
> inet_twsk_put(tw);
> goto restart;
> }
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index df18ce0..e30f026 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2468,12 +2468,17 @@ static int __net_init tcp_sk_init(struct net *net)
> static void __net_exit tcp_sk_exit(struct net *net)
> {
> inet_ctl_sock_destroy(net->ipv4.tcp_sock);
> - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
> +}
> +
> +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
> +{
> + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
> }
>
> static struct pernet_operations __net_initdata tcp_sk_ops = {
> - .init = tcp_sk_init,
> - .exit = tcp_sk_exit,
> + .init = tcp_sk_init,
> + .exit = tcp_sk_exit,
> + .exit_batch = tcp_sk_exit_batch,
> };
>
> void __init tcp_v4_init(void)
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index de70909..5f46d36 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -2126,12 +2126,17 @@ static int tcpv6_net_init(struct net *net)
> static void tcpv6_net_exit(struct net *net)
> {
> inet_ctl_sock_destroy(net->ipv6.tcp_sk);
> - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6);
> +}
> +
> +static void tcpv6_net_exit_batch(struct list_head *net_exit_list)
> +{
> + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
> }
>
> static struct pernet_operations tcpv6_net_ops = {
> - .init = tcpv6_net_init,
> - .exit = tcpv6_net_exit,
> + .init = tcpv6_net_init,
> + .exit = tcpv6_net_exit,
> + .exit_batch = tcpv6_net_exit_batch,
> };
>
> int __init tcpv6_init(void)
OK, but why calling inet_twsk_purge() twice, one for AF_INET, once for AF_INET6
I believe you could zap family check as well in inet_twsk_purge(), and not
need tcpv6_net_ops.exit_batch = tcpv6_net_exit_batch
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 7/7] net: Batch inet_twsk_purge
2009-12-03 13:23 ` Eric Dumazet
@ 2009-12-03 13:36 ` Eric W. Biederman
2009-12-03 20:24 ` David Miller
0 siblings, 1 reply; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 13:36 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, jamal, Daniel Lezcano
Eric Dumazet <eric.dumazet@gmail.com> writes:
> Eric W. Biederman a écrit :
>> From: Eric W. Biederman <ebiederm@xmission.com>
>>
>> This function walks the whole hashtable so there is no point in
>> passing it a network namespace. Instead I purge all timewait
>> sockets from dead network namespaces that I find. If the namespace
>> is one of the once I am trying to purge I am guaranteed no new timewait
>> sockets can be formed so this will get them all. If the namespace
>> is one I am not acting for it might form a few more but I will
>> call inet_twsk_purge again and shortly to get rid of them. In
>> any even if the network namespace is dead timewait sockets are
>> useless.
>>
>> Move the calls of inet_twsk_purge into batch_exit routines so
>> that if I am killing a bunch of namespaces at once I will just
>> call inet_twsk_purge once and save a lot of redundant unnecessary
>> work.
>>
>> My simple 4k network namespace exit test the cleanup time dropped from
>> roughly 8.2s to 1.6s. While the time spent running inet_twsk_purge fell
>> to about 2ms. 1ms for ipv4 and 1ms for ipv6.
>>
>> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
>> ---
>> include/net/inet_timewait_sock.h | 6 +++---
>> net/ipv4/inet_timewait_sock.c | 10 +++++-----
>> net/ipv4/tcp_ipv4.c | 11 ++++++++---
>> net/ipv6/tcp_ipv6.c | 11 ++++++++---
>> 4 files changed, 24 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
>> index 773b10f..4fd007f 100644
>> --- a/include/net/inet_timewait_sock.h
>> +++ b/include/net/inet_timewait_sock.h
>> @@ -212,14 +212,14 @@ extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
>> extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
>> struct inet_timewait_death_row *twdr);
>>
>> -extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
>> +extern void inet_twsk_purge(struct inet_hashinfo *hashinfo,
>> struct inet_timewait_death_row *twdr, int family);
>>
>> static inline
>> struct net *twsk_net(const struct inet_timewait_sock *twsk)
>> {
>> #ifdef CONFIG_NET_NS
>> - return twsk->tw_net;
>> + return rcu_dereference(twsk->tw_net);
>> #else
>> return &init_net;
>> #endif
>> @@ -229,7 +229,7 @@ static inline
>> void twsk_net_set(struct inet_timewait_sock *twsk, struct net *net)
>> {
>> #ifdef CONFIG_NET_NS
>> - twsk->tw_net = net;
>> + rcu_assign_pointer(twsk->tw_net, net);
>> #endif
>> }
>> #endif /* _INET_TIMEWAIT_SOCK_ */
>> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
>> index 683ecec..a3699ac 100644
>> --- a/net/ipv4/inet_timewait_sock.c
>> +++ b/net/ipv4/inet_timewait_sock.c
>> @@ -421,7 +421,7 @@ out:
>>
>> EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
>>
>> -void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
>> +void inet_twsk_purge(struct inet_hashinfo *hashinfo,
>> struct inet_timewait_death_row *twdr, int family)
>> {
>> struct inet_timewait_sock *tw;
>> @@ -436,15 +436,15 @@ restart_rcu:
>> restart:
>> sk_nulls_for_each_rcu(sk, node, &head->twchain) {
>> tw = inet_twsk(sk);
>> - if (!net_eq(twsk_net(tw), net) ||
>> - tw->tw_family != family)
>> + if ((tw->tw_family != family) ||
>> + atomic_read(&twsk_net(tw)->count))
>> continue;
>>
>> if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
>> continue;
>>
>> - if (unlikely(!net_eq(twsk_net(tw), net) ||
>> - tw->tw_family != family)) {
>> + if (unlikely((tw->tw_family != family) ||
>> + atomic_read(&twsk_net(tw)->count))) {
>> inet_twsk_put(tw);
>> goto restart;
>> }
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index df18ce0..e30f026 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -2468,12 +2468,17 @@ static int __net_init tcp_sk_init(struct net *net)
>> static void __net_exit tcp_sk_exit(struct net *net)
>> {
>> inet_ctl_sock_destroy(net->ipv4.tcp_sock);
>> - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
>> +}
>> +
>> +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
>> +{
>> + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
>> }
>>
>> static struct pernet_operations __net_initdata tcp_sk_ops = {
>> - .init = tcp_sk_init,
>> - .exit = tcp_sk_exit,
>> + .init = tcp_sk_init,
>> + .exit = tcp_sk_exit,
>> + .exit_batch = tcp_sk_exit_batch,
>> };
>>
>> void __init tcp_v4_init(void)
>> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
>> index de70909..5f46d36 100644
>> --- a/net/ipv6/tcp_ipv6.c
>> +++ b/net/ipv6/tcp_ipv6.c
>> @@ -2126,12 +2126,17 @@ static int tcpv6_net_init(struct net *net)
>> static void tcpv6_net_exit(struct net *net)
>> {
>> inet_ctl_sock_destroy(net->ipv6.tcp_sk);
>> - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6);
>> +}
>> +
>> +static void tcpv6_net_exit_batch(struct list_head *net_exit_list)
>> +{
>> + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
>> }
>>
>> static struct pernet_operations tcpv6_net_ops = {
>> - .init = tcpv6_net_init,
>> - .exit = tcpv6_net_exit,
>> + .init = tcpv6_net_init,
>> + .exit = tcpv6_net_exit,
>> + .exit_batch = tcpv6_net_exit_batch,
>> };
>>
>> int __init tcpv6_init(void)
>
>
> OK, but why calling inet_twsk_purge() twice, one for AF_INET, once for AF_INET6
>
> I believe you could zap family check as well in inet_twsk_purge(), and not
> need tcpv6_net_ops.exit_batch = tcpv6_net_exit_batch
Technically it is needed if someone does rmmod ipv6. rmmod ipv6 didn't seem
to work for me, but if it ever does... That and the cost is now in the noise
in human terms.
I think dccp may also need an inet_twsk_purge as well, but I couldn't figure out what
it was doing with timewait sockets.
Eric
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/7] Batched netns improvements.
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
` (7 preceding siblings ...)
2009-12-03 13:06 ` [PATCH 0/7] Batched netns improvements jamal
@ 2009-12-03 20:24 ` David Miller
8 siblings, 0 replies; 16+ messages in thread
From: David Miller @ 2009-12-03 20:24 UTC (permalink / raw)
To: ebiederm; +Cc: netdev, hadi, dlezcano
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 03 Dec 2009 04:27:08 -0800
>
> This series of patches takes my simple 4k network namespace exit test
> from about 44s to 1.6s, with a minuscule increase in code side.
>
> While batching has given me big improvements there are a few
> changes in there that improve the performance of cleaning up
> a single network namespace.
All applied to net-next-2.6, thanks Eric.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 7/7] net: Batch inet_twsk_purge
2009-12-03 13:36 ` Eric W. Biederman
@ 2009-12-03 20:24 ` David Miller
2009-12-03 20:45 ` Eric W. Biederman
0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2009-12-03 20:24 UTC (permalink / raw)
To: ebiederm; +Cc: eric.dumazet, netdev, hadi, dlezcano
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 03 Dec 2009 05:36:52 -0800
> I think dccp may also need a inet_twsk_purge as well, but I couldn't
> figure out what it was doing with timewait sockets.
DCCP doesn't have time-wait sockets, it's more like UDP than TCP.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 7/7] net: Batch inet_twsk_purge
2009-12-03 20:24 ` David Miller
@ 2009-12-03 20:45 ` Eric W. Biederman
0 siblings, 0 replies; 16+ messages in thread
From: Eric W. Biederman @ 2009-12-03 20:45 UTC (permalink / raw)
To: David Miller; +Cc: eric.dumazet, netdev, hadi, dlezcano
David Miller <davem@davemloft.net> writes:
> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Thu, 03 Dec 2009 05:36:52 -0800
>
>> I think dccp may also need a inet_twsk_purge as well, but I couldn't
>> figure out what it was doing with timewait sockets.
>
> DCCP doesn't have time-wait sockets, it's more like UDP than TCP.
There is net/dccp/minisocks.c:dccp_time_wait that calls
inet_twsk_alloc().
So dccp is doing something with timewait sockets.
Eric
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2009-12-03 20:45 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-03 12:27 [PATCH 0/7] Batched netns improvements Eric W. Biederman
2009-12-03 12:29 ` [PATCH 1/7] net: Add support for batching network namespace cleanups Eric W. Biederman
2009-12-03 12:29 ` [PATCH 2/7] net: Move network device exit batching Eric W. Biederman
2009-12-03 12:29 ` [PATCH 3/7] net: Allow xfrm_user_net_exit to batch efficiently Eric W. Biederman
2009-12-03 12:29 ` [PATCH 4/7] netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys} Eric W. Biederman
2009-12-03 12:29 ` [PATCH 5/7] net: Allow fib_rule_unregister to batch Eric W. Biederman
2009-12-03 12:29 ` [PATCH 6/7] net: Use rcu lookups in inet_twsk_purge Eric W. Biederman
2009-12-03 13:17 ` Eric Dumazet
2009-12-03 12:29 ` [PATCH 7/7] net: Batch inet_twsk_purge Eric W. Biederman
2009-12-03 13:23 ` Eric Dumazet
2009-12-03 13:36 ` Eric W. Biederman
2009-12-03 20:24 ` David Miller
2009-12-03 20:45 ` Eric W. Biederman
2009-12-03 13:06 ` [PATCH 0/7] Batched netns improvements jamal
2009-12-03 13:23 ` Eric W. Biederman
2009-12-03 20:24 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).