* [PATCH] netfilter: per netns nf_conntrack_cachep
@ 2010-02-01 14:52 ` Eric Dumazet
0 siblings, 0 replies; 71+ messages in thread
From: Eric Dumazet @ 2010-02-01 14:52 UTC (permalink / raw)
To: Alexey Dobriyan, Patrick McHardy
Cc: Jon Masters, linux-kernel, netdev, netfilter-devel,
Paul E. McKenney
Le lundi 01 février 2010 à 12:23 +0100, Eric Dumazet a écrit :
> Le lundi 01 février 2010 à 12:25 +0200, Alexey Dobriyan a écrit :
>
> > > 2) nf_conntrack_cachep is shared, it should be not shared.
> >
> > There is no need for it to be shared, unless you measured something.
> >
>
Patrick, here is the relevant patch for this specific problem.
Thanks
[PATCH] netfilter: per netns nf_conntrack_cachep
nf_conntrack_cachep is currently shared by all netns instances, but
because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
If we use a shared slab cache, one object can instantly flight between
one hash table (netns ONE) to another one (netns TWO), and concurrent
reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
can be fooled without notice, because no RCU grace period has to be
observed between object freeing and its reuse.
We dont have this problem with UDP/TCP slab caches because TCP/UDP
hashtables are global to the machine (and each object has a pointer to
its netns).
If we use per netns conntrack hash tables, we also *must* use per netns
conntrack slab caches, to guarantee an object can not escape from one
namespace to another one.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index ba1ba0c..d969a50 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -11,6 +11,7 @@ struct nf_conntrack_ecache;
struct netns_ct {
atomic_t count;
unsigned int expect_count;
+ struct kmem_cache *nf_conntrack_cachep;
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0e98c32..1667285 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -63,8 +63,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
-static struct kmem_cache *nf_conntrack_cachep __read_mostly;
-
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;
@@ -572,7 +570,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL) {
pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
@@ -611,7 +609,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
atomic_dec(&net->ct.count);
nf_ct_ext_free(ct);
- kmem_cache_free(nf_conntrack_cachep, ct);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1115,7 +1113,6 @@ static void nf_conntrack_cleanup_init_net(void)
{
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
- kmem_cache_destroy(nf_conntrack_cachep);
}
static void nf_conntrack_cleanup_net(struct net *net)
@@ -1136,6 +1133,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
free_percpu(net->ct.stat);
}
@@ -1271,15 +1269,6 @@ static int nf_conntrack_init_init_net(void)
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
- nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
- sizeof(struct nf_conn),
- 0, SLAB_DESTROY_BY_RCU, NULL);
- if (!nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- ret = -ENOMEM;
- goto err_cache;
- }
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -1293,8 +1282,6 @@ static int nf_conntrack_init_init_net(void)
err_helper:
nf_conntrack_proto_fini();
err_proto:
- kmem_cache_destroy(nf_conntrack_cachep);
-err_cache:
return ret;
}
@@ -1316,6 +1303,14 @@ static int nf_conntrack_init_net(struct net *net)
ret = -ENOMEM;
goto err_stat;
}
+ net->ct.nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ ret = -ENOMEM;
+ goto err_cache;
+ }
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
@@ -1352,6 +1347,8 @@ err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
free_percpu(net->ct.stat);
err_stat:
return ret;
^ permalink raw reply related [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 14:52 ` Eric Dumazet
(?)
@ 2010-02-01 14:58 ` Alexey Dobriyan
2010-02-01 15:02 ` Eric Dumazet
-1 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-01 14:58 UTC (permalink / raw)
To: Eric Dumazet
Cc: Patrick McHardy, Jon Masters, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Mon, Feb 1, 2010 at 4:52 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> + net->ct.nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
> + sizeof(struct nf_conn), 0,
> + SLAB_DESTROY_BY_RCU, NULL);
Duplicate slab name detected.
Can we clarify this?
Is checking for ct->ct_net enough to avoid the bug
while maintaining per-netns/global status quo?
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 14:58 ` Alexey Dobriyan
@ 2010-02-01 15:02 ` Eric Dumazet
2010-02-02 11:04 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Eric Dumazet @ 2010-02-01 15:02 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Jon Masters, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Le lundi 01 février 2010 à 16:58 +0200, Alexey Dobriyan a écrit :
> On Mon, Feb 1, 2010 at 4:52 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > + net->ct.nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
> > + sizeof(struct nf_conn), 0,
> > + SLAB_DESTROY_BY_RCU, NULL);
>
> Duplicate slab name detected.
>
OK, need to build an unique name I guess... "nf_conntrack-%d", net->id
> Can we clarify this?
>
> Is checking for ct->ct_net enough to avoid the bug
> while maintaining per-netns/global status quo?
No, I am afraid its not possible without adding big overhead in
fastpath.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 15:02 ` Eric Dumazet
@ 2010-02-02 11:04 ` Jon Masters
2010-02-02 11:35 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 11:04 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Mon, 2010-02-01 at 16:02 +0100, Eric Dumazet wrote:
> Le lundi 01 février 2010 à 16:58 +0200, Alexey Dobriyan a écrit :
> > On Mon, Feb 1, 2010 at 4:52 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > + net->ct.nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
> > > + sizeof(struct nf_conn), 0,
> > > + SLAB_DESTROY_BY_RCU, NULL);
> >
> > Duplicate slab name detected.
> >
>
> OK, need to build an unique name I guess... "nf_conntrack-%d", net->id
I shoved in an kasprintf but of course there isn't a per-namespace "id".
We probably should have one (or, a nice "name"), but meanwhile I am
using the address of the net struct like "nf_ct-%p".
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 11:04 ` Jon Masters
@ 2010-02-02 11:35 ` Jon Masters
2010-02-02 16:46 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 11:35 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 06:04 -0500, Jon Masters wrote:
> On Mon, 2010-02-01 at 16:02 +0100, Eric Dumazet wrote:
> > Le lundi 01 février 2010 à 16:58 +0200, Alexey Dobriyan a écrit :
> > > On Mon, Feb 1, 2010 at 4:52 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > > + net->ct.nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
> > > > + sizeof(struct nf_conn), 0,
> > > > + SLAB_DESTROY_BY_RCU, NULL);
> > >
> > > Duplicate slab name detected.
> > >
> >
> > OK, need to build an unique name I guess... "nf_conntrack-%d", net->id
>
> I shoved in an kasprintf but of course there isn't a per-namespace "id".
> We probably should have one (or, a nice "name"), but meanwhile I am
> using the address of the net struct like "nf_ct-%p".
-ENOBANANA
Applying just this patch (without the per-ns hashtable metadata, but
with a trivial fix to name using nf_ct-%p for now), we still fall over
in the conntrack lookup code every single time:
[ 210.697337] device vnet2 entered promiscuous mode
[ 210.703868] br0: port 4(vnet2) entering forwarding state
[ 220.766146] vnet2: no IPv6 routers present
[ 236.216957] BUG: unable to handle kernel paging request at
ffff88037e613588
[ 236.217638] IP: [<ffffffff813d47cc>] __nf_conntrack_find+0x53/0xb1
[ 236.217638] PGD 1a3c063 PUD 0
[ 236.217638] Oops: 0000 [#1] SMP
[ 236.217638] last sysfs
file: /sys/devices/virtual/block/md0/md/sync_speed
Entering kdb (current=0xffff8801f32e8000, pid 3214) on processor 1 Oops:
(null)
due to oops @ 0xffffffff813d47cc
CPU 1 <c>
<d>Pid: 3214, comm: qemu-kvm Not tainted 2.6.33-rc5 #25 0F9382/Precision
WorkStation 490
<d>RIP: 0010:[<ffffffff813d47cc>] [<ffffffff813d47cc>]
__nf_conntrack_find+0x53/0xb1
<d>RSP: 0018:ffff8801d41a3758 EFLAGS: 00010286
<d>RAX: ffff88037e613588 RBX: ffff8801d41a3868 RCX: 000000004d1bab3a
<d>RDX: ffff8801f32e8000 RSI: 0000000081b04540 RDI: 0000000000000246
<d>RBP: ffff8801d41a3798 R08: 0000000045f1b45f R09: 000000005e5ffada
<d>R10: 00000000501b6d3f R11: ffff8801d41a388c R12: ffffffff8288ef70
<d>R13: ffff8801d41a3868 R14: ffffffffffffffb8 R15: 000000002bfc66b1
<d>FS: 00007f0059b01780(0000) GS:ffff88002fa00000(0000)
knlGS:0000000000000000
<d>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
<d>CR2: ffff88037e613588 CR3: 00000001f05ed000 CR4: 00000000000026e0
<d>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
<d>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process qemu-kvm (pid: 3214, threadinfo ffff8801d41a2000, task
ffff8801f32e8000)
<0>Stack:
ffff8802206302e0 000000015fe33588 ffffffffffffffb8 ffffffff8288ef70
<0> ffff8802206302e0 ffff8801d41a3868 ffffffffffffffb8 ffffffff8288ef70
<0> ffff8801d41a37e8 ffffffff813d485d ffff8802206302e0 ffff8802206302e0
<0>Call Trace:
[1]more> <0> [<ffffffff813d485d>] nf_conntrack_find_get+0x33/0xb7
[1]more> <0> [<ffffffff813d58d5>] nf_conntrack_in+0x209/0x7b4
[1]more> <0> [<ffffffff8141a413>] ipv4_conntrack_local+0x40/0x49
[1]more> <0> [<ffffffff813d278a>] nf_iterate+0x46/0x89
[1]more> <0> [<ffffffff813e5458>] ? dst_output+0x0/0x12
[1]more> <0> [<ffffffff813d2845>] nf_hook_slow+0x78/0xe0
[1]more> <0> [<ffffffff813e5458>] ? dst_output+0x0/0x12
[1]more> <0> [<ffffffff813e67f2>] nf_hook_thresh.clone.0+0x41/0x4a
[1]more> <0> [<ffffffff81126846>] ? poll_freewait+0x32/0x70
[1]more> <0> [<ffffffff813e6ad2>] __ip_local_out+0x7e/0x80
[1]more> <0> [<ffffffff813e6aea>] ip_local_out+0x16/0x27
[1]more> <0> [<ffffffff813e7118>] ip_queue_xmit+0x30e/0x36e
[1]more> <0> [<ffffffff813f8aec>] tcp_transmit_skb+0x707/0x745
[1]more> <0> [<ffffffff813fb15e>] tcp_write_xmit+0x7cb/0x8ba
[1]more> <0> [<ffffffff813fb2b2>] __tcp_push_pending_frames+0x2f/0x5d
[1]more> <0> [<ffffffff813edecf>] tcp_push+0x88/0x8a
[1]more> <0> [<ffffffff813f01f0>] tcp_sendmsg+0x760/0x85b
[1]more> <0> [<ffffffff813a3ccc>] __sock_sendmsg+0x5e/0x69
[1]more> <0> [<ffffffff813a3fe2>] sock_sendmsg+0xa8/0xc1
[1]more> <0> [<ffffffff81119641>] ? fget_light+0x57/0xf2
[1]more> <0> [<ffffffff811195e8>] ? rcu_read_unlock+0x21/0x23
[1]more> <0> [<ffffffff81119641>] ? fget_light+0x57/0xf2
[1]more> <0> [<ffffffff8114893e>] ? eventfd_write+0x94/0x186
[1]more> <0> [<ffffffff813a4072>] ? sockfd_lookup_light+0x20/0x58
[1]more> <0> [<ffffffff813a5d37>] sys_sendto+0x110/0x152
[1]more> <0> [<ffffffff81118318>] ? fsnotify_modify+0x6c/0x74
[1]more> <0> [<ffffffff81118ad6>] ? vfs_write+0xd3/0x10b
[1]more> <0> [<ffffffff81457f00>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[1]more> <0> [<ffffffff81009bf2>] system_call_fastpath+0x16/0x1b
[1]more> <0>Code: 48 89 df e8 21 f5 ff ff 41 89 c7 45 89 ff e8 e7 fb c7
ff 4a 8d 04 fd 00 00 00 00 48 89 45 c8 48 8b 45 c8 49 03 84 24 98 06 00
00 <4c> 8b 28 eb 14 65 83 40 04 01 e8 bc fc c7 ff eb 3b 65 83 00 01
[1]more> Call Trace:
[1]more> [<ffffffff813d47b4>] ? __nf_conntrack_find+0x3b/0xb1
[1]more> [<ffffffff813d485d>] nf_conntrack_find_get+0x33/0xb7
[1]more> [<ffffffff813d58d5>] nf_conntrack_in+0x209/0x7b4
[1]more> [<ffffffff8141a413>] ipv4_conntrack_local+0x40/0x49
[1]more> [<ffffffff813d278a>] nf_iterate+0x46/0x89
[1]more> [<ffffffff813e5458>] ? dst_output+0x0/0x12
[1]more> [<ffffffff813d2845>] nf_hook_slow+0x78/0xe0
[1]more> [<ffffffff813e5458>] ? dst_output+0x0/0x12
[1]more> [<ffffffff813e67f2>] nf_hook_thresh.clone.0+0x41/0x4a
[1]more> [<ffffffff81126846>] ? poll_freewait+0x32/0x70
[1]more> [<ffffffff813e6ad2>] __ip_local_out+0x7e/0x80
[1]more> [<ffffffff813e6aea>] ip_local_out+0x16/0x27
[1]more> [<ffffffff813e7118>] ip_queue_xmit+0x30e/0x36e
[1]more> [<ffffffff813f8aec>] tcp_transmit_skb+0x707/0x745
[1]more> [<ffffffff813fb15e>] tcp_write_xmit+0x7cb/0x8ba
[1]more> [<ffffffff813fb2b2>] __tcp_push_pending_frames+0x2f/0x5d
[1]more> [<ffffffff813edecf>] tcp_push+0x88/0x8a
[1]more> [<ffffffff813f01f0>] tcp_sendmsg+0x760/0x85b
[1]more> [<ffffffff813a3ccc>] __sock_sendmsg+0x5e/0x69
[1]more> [<ffffffff813a3fe2>] sock_sendmsg+0xa8/0xc1
[1]more> [<ffffffff81119641>] ? fget_light+0x57/0xf2
[1]more> [<ffffffff811195e8>] ? rcu_read_unlock+0x21/0x23
[1]more> [<ffffffff81119641>] ? fget_light+0x57/0xf2
[1]more> [<ffffffff8114893e>] ? eventfd_write+0x94/0x186
[1]more> [<ffffffff813a4072>] ? sockfd_lookup_light+0x20/0x58
[1]more> [<ffffffff813a5d37>] sys_sendto+0x110/0x152
[1]more> [<ffffffff81118318>] ? fsnotify_modify+0x6c/0x74
[1]more> [<ffffffff81118ad6>] ? vfs_write+0xd3/0x10b
[1]more> [<ffffffff81457f00>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[1]more> [<ffffffff81009bf2>] system_call_fastpath+0x16/0x1b
I think there's something more fundamental going on here.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 11:35 ` Jon Masters
@ 2010-02-02 16:46 ` Jon Masters
2010-02-02 16:48 ` Patrick McHardy
2010-02-02 16:58 ` PROBLEM with summary: " Jon Masters
0 siblings, 2 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 16:46 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 06:35 -0500, Jon Masters wrote:
> I think there's something more fundamental going on here.
What happens is the conntrack code attempts to free
nf_conntrack_untracked back into the SL[U]B cache from which it
allocates other ct's. There's just one problem...that's a static struct
not from the cache. So, this is why we end up with the SLAB being
corrupted and the address immediately following the
nf_conntrack_untracked being corrupted.
I shoved some debug comments into the destroy code to see if we were
trying to free nf_conntrack_untracked, and bingo. I have shoved a panic
in there now, will send you a backtrace.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 16:46 ` Jon Masters
@ 2010-02-02 16:48 ` Patrick McHardy
2010-02-02 17:07 ` Jon Masters
2010-02-02 16:58 ` PROBLEM with summary: " Jon Masters
1 sibling, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-02 16:48 UTC (permalink / raw)
To: Jon Masters
Cc: Eric Dumazet, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Jon Masters wrote:
> On Tue, 2010-02-02 at 06:35 -0500, Jon Masters wrote:
>
>> I think there's something more fundamental going on here.
>
> What happens is the conntrack code attempts to free
> nf_conntrack_untracked back into the SL[U]B cache from which it
> allocates other ct's.
That shouldn't happen, the untracked conntrack is initialized to a
refcount of 1, which is never released.
> There's just one problem...that's a static struct
> not from the cache. So, this is why we end up with the SLAB being
> corrupted and the address immediately following the
> nf_conntrack_untracked being corrupted.
>
> I shoved some debug comments into the destroy code to see if we were
> trying to free nf_conntrack_untracked, and bingo. I have shoved a panic
> in there now, will send you a backtrace.
Thanks.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 16:48 ` Patrick McHardy
@ 2010-02-02 17:07 ` Jon Masters
2010-02-02 17:58 ` Alexey Dobriyan
0 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 17:07 UTC (permalink / raw)
To: Patrick McHardy
Cc: Eric Dumazet, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 17:48 +0100, Patrick McHardy wrote:
> Jon Masters wrote:
> > On Tue, 2010-02-02 at 06:35 -0500, Jon Masters wrote:
> >
> >> I think there's something more fundamental going on here.
> >
> > What happens is the conntrack code attempts to free
> > nf_conntrack_untracked back into the SL[U]B cache from which it
> > allocates other ct's.
>
> That shouldn't happen, the untracked conntrack is initialized to a
> refcount of 1, which is never released.
Ah, but I think it is :) It's also re-initialized (with an atomic_set)
every time a new namespace is created, whereas this should probably only
be done in the init_init_net code, not in init_net :)
> > There's just one problem...that's a static struct
> > not from the cache. So, this is why we end up with the SLAB being
> > corrupted and the address immediately following the
> > nf_conntrack_untracked being corrupted.
> >
> > I shoved some debug comments into the destroy code to see if we were
> > trying to free nf_conntrack_untracked, and bingo. I have shoved a panic
> > in there now, will send you a backtrace.
>
> Thanks.
No problem. And thanks for your help. I'm sorry if I sound frustrated at
this, it's just causing all of my test machines running KVM guests to
fall over :)
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 17:07 ` Jon Masters
@ 2010-02-02 17:58 ` Alexey Dobriyan
2010-02-02 18:16 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-02 17:58 UTC (permalink / raw)
To: Jon Masters
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, Feb 2, 2010 at 7:07 PM, Jon Masters <jonathan@jonmasters.org> wrote:
> No problem. And thanks for your help. I'm sorry if I sound frustrated at
> this, it's just causing all of my test machines running KVM guests to
> fall over :)
So it's my bug either way. :-(
Yes, moving to init_net-only function is fine.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 17:58 ` Alexey Dobriyan
@ 2010-02-02 18:16 ` Jon Masters
2010-02-02 18:34 ` Jon Masters
2010-02-02 18:36 ` Patrick McHardy
0 siblings, 2 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 18:16 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
> Yes, moving to init_net-only function is fine.
So moving the "setup up fake conntrack" bits to init_init_net from
init_net still results in the panic, which means that the use count
really is dropping to zero and we really are trying to free it when
using multiple namespaces. Per ns is probably an easier way to go.
Just for kicks, I'll have it error out on attempting to free to see if I
can get this box to stay up for a while.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 18:16 ` Jon Masters
@ 2010-02-02 18:34 ` Jon Masters
2010-02-02 18:36 ` Patrick McHardy
1 sibling, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 18:34 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 13:16 -0500, Jon Masters wrote:
> On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
>
> > Yes, moving to init_net-only function is fine.
>
> So moving the "setup up fake conntrack" bits to init_init_net from
> init_net still results in the panic, which means that the use count
> really is dropping to zero and we really are trying to free it when
> using multiple namespaces. Per ns is probably an easier way to go.
>
> Just for kicks, I'll have it error out on attempting to free to see if I
> can get this box to stay up for a while.
Confirmed. It boots and the hashsize is not inadvertedly corrupted if I
do the following:
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
void (*destroy)(struct nf_conntrack *);
if ((struct nf_conn *)nfct == &nf_conntrack_untracked) {
printk("JCM: nf_conntrack_destroy: trying to destroy
nf_conntrack_untracked! CONTINUING...\n");
//panic("JCM: nf_conntrack_destroy: trying to destroy
nf_conntrack_untracked!\n");
return; /* refuse to free nf_conntrack_untracked */
}
rcu_read_lock();
destroy = rcu_dereference(nf_ct_destroy);
BUG_ON(destroy == NULL);
destroy(nfct);
rcu_read_unlock();
}
EXPORT_SYMBOL(nf_conntrack_destroy);
Clearly that's just a hack (though catching the specific attempt to free
the untracked conntrack sounds like a very good idea in general). I will
leave this running for a while, but so far no problems:
[jcm@perihelion jcm_26]$ dmesg|grep JCM
[ 29.952717] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 30.207091] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 30.403248] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 31.403319] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 32.977106] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 33.347100] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 33.966092] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 34.967111] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 35.404323] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 35.911430] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 35.912442] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 38.967061] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 39.403342] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 41.288392] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 42.966063] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 44.036451] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 95.621174] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 95.673061] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 95.741180] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 96.273429] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 96.741296] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 97.246108] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 98.246486] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 100.747170] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 102.252067] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 104.747066] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 105.762433] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 106.252084] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 147.260998] JCM: nf_conntrack_destroy: trying to destroy
nf_conntrack_untracked! CONTINUING...
[ 155.485160] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 156.403661] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 157.402928] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 161.402477] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 163.270622] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 163.277368] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 167.718677] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 176.901877] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 182.554024] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 182.616693] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 183.616932] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 187.617026] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 188.362002] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 191.617001] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 193.574899] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 197.455338] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 214.163491] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 214.311528] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 215.311577] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 219.317496] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 220.693493] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 223.317095] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 240.326198] JCM: nf_conntrack_destroy: trying to destroy
nf_conntrack_untracked! CONTINUING...
[ 252.481116] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 252.881087] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 253.881251] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 257.922816] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 258.882989] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 261.413093] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 261.921943] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
[ 266.350983] JCM: icmpv6_error: attaching to nf_conntrack_untracked.
So, over to you :)
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 18:16 ` Jon Masters
2010-02-02 18:34 ` Jon Masters
@ 2010-02-02 18:36 ` Patrick McHardy
2010-02-02 18:39 ` Jon Masters
2010-02-03 12:10 ` Patrick McHardy
1 sibling, 2 replies; 71+ messages in thread
From: Patrick McHardy @ 2010-02-02 18:36 UTC (permalink / raw)
To: Jon Masters
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Jon Masters wrote:
> On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
>
>> Yes, moving to init_net-only function is fine.
>
> So moving the "setup up fake conntrack" bits to init_init_net from
> init_net still results in the panic, which means that the use count
> really is dropping to zero and we really are trying to free it when
> using multiple namespaces. Per ns is probably an easier way to go.
Agreed, that will also avoid problems in the future with the
ct_net pointer pointing to &init_net. I'll take care of this
tommorrow.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 18:36 ` Patrick McHardy
@ 2010-02-02 18:39 ` Jon Masters
2010-02-02 18:42 ` Jon Masters
2010-02-03 12:10 ` Patrick McHardy
1 sibling, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 18:39 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 19:36 +0100, Patrick McHardy wrote:
> Jon Masters wrote:
> > On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
> >
> >> Yes, moving to init_net-only function is fine.
> >
> > So moving the "setup up fake conntrack" bits to init_init_net from
> > init_net still results in the panic, which means that the use count
> > really is dropping to zero and we really are trying to free it when
> > using multiple namespaces. Per ns is probably an easier way to go.
>
> Agreed, that will also avoid problems in the future with the
> ct_net pointer pointing to &init_net. I'll take care of this
> tommorrow.
Ok. I'll leave this box running with the hack. I think at the very least
that this specific issue needs to get fixed and in the stable tree, then
the other bits (per namespace cachep...) are probably a good idea at the
same time but that's up to you.
Thanks for your help!
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 18:39 ` Jon Masters
@ 2010-02-02 18:42 ` Jon Masters
0 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 18:42 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 13:39 -0500, Jon Masters wrote:
> On Tue, 2010-02-02 at 19:36 +0100, Patrick McHardy wrote:
> > Jon Masters wrote:
> > > On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
> > >
> > >> Yes, moving to init_net-only function is fine.
> > >
> > > So moving the "setup up fake conntrack" bits to init_init_net from
> > > init_net still results in the panic, which means that the use count
> > > really is dropping to zero and we really are trying to free it when
> > > using multiple namespaces. Per ns is probably an easier way to go.
> >
> > Agreed, that will also avoid problems in the future with the
> > ct_net pointer pointing to &init_net. I'll take care of this
> > tommorrow.
>
> Ok. I'll leave this box running with the hack. I think at the very least
> that this specific issue needs to get fixed and in the stable tree, then
> the other bits (per namespace cachep...) are probably a good idea at the
> same time but that's up to you.
FYI, my box has the quick don't free untracked hack *and* per-ns cachep.
I don't think the latter has anything specific to do with this (though
it needs fixing also), but worth knowing my test is using both.
Back to the podcasts tonight instead of this ;)
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 18:36 ` Patrick McHardy
2010-02-02 18:39 ` Jon Masters
@ 2010-02-03 12:10 ` Patrick McHardy
2010-02-03 18:38 ` Jon Masters
2010-02-03 20:21 ` Jon Masters
1 sibling, 2 replies; 71+ messages in thread
From: Patrick McHardy @ 2010-02-03 12:10 UTC (permalink / raw)
To: Jon Masters
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
[-- Attachment #1: Type: text/plain, Size: 914 bytes --]
Patrick McHardy wrote:
> Jon Masters wrote:
>> On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
>>
>>> Yes, moving to init_net-only function is fine.
>> So moving the "setup up fake conntrack" bits to init_init_net from
>> init_net still results in the panic, which means that the use count
>> really is dropping to zero and we really are trying to free it when
>> using multiple namespaces. Per ns is probably an easier way to go.
>
> Agreed, that will also avoid problems in the future with the
> ct_net pointer pointing to &init_net. I'll take care of this
> tommorrow.
Unfortunately a per-namespace conntrack is not easily possible without
larger changes (most of which are already queued in nf-next-2.6.git
though). So for now I just moved the untrack handling to the init_net
setup and cleanup functions and we can try to fix the remainder in
2.6.34.
Jon, could you give this patch a try please?
[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3006 bytes --]
commit 056ff3e3bd1563969a311697323ff929df94415c
Author: Patrick McHardy <kaber@trash.net>
Date: Wed Feb 3 12:58:06 2010 +0100
netfilter: nf_conntrack: fix memory corruption with multiple namespaces
As discovered by Jon Masters <jonathan@jonmasters.org>, the "untracked"
conntrack, which is located in the data section, might be accidentally
freed when a new namespace is instantiated while the untracked conntrack
is attached to a skb because the reference count it re-initialized.
The best fix would be to use a seperate untracked conntrack per
namespace since it includes a namespace pointer. Unfortunately this is
not possible without larger changes since the namespace is not easily
available everywhere we need it. For now move the untracked conntrack
initialization to the init_net setup function to make sure the reference
count is not re-initialized and handle cleanup in the init_net cleanup
function to make sure namespaces can exit properly while the untracked
conntrack is in use in other namespaces.
Signed-off-by: Patrick McHardy <kaber@trash.net>
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0e98c32..37e2b88 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1113,6 +1113,10 @@ static void nf_ct_release_dying_list(struct net *net)
static void nf_conntrack_cleanup_init_net(void)
{
+ /* wait until all references to nf_conntrack_untracked are dropped */
+ while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ schedule();
+
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -1127,9 +1131,6 @@ static void nf_conntrack_cleanup_net(struct net *net)
schedule();
goto i_see_dead_people;
}
- /* wait until all references to nf_conntrack_untracked are dropped */
- while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
- schedule();
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
@@ -1288,6 +1289,14 @@ static int nf_conntrack_init_init_net(void)
if (ret < 0)
goto err_helper;
+ /* Set up fake conntrack: to never be deleted, not in any hashes */
+#ifdef CONFIG_NET_NS
+ nf_conntrack_untracked.ct_net = &init_net;
+#endif
+ atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+ /* - and look it like as a confirmed connection */
+ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
+
return 0;
err_helper:
@@ -1333,15 +1342,6 @@ static int nf_conntrack_init_net(struct net *net)
if (ret < 0)
goto err_ecache;
- /* Set up fake conntrack:
- - to never be deleted, not in any hashes */
-#ifdef CONFIG_NET_NS
- nf_conntrack_untracked.ct_net = &init_net;
-#endif
- atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
- /* - and look it like as a confirmed connection */
- set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
-
return 0;
err_ecache:
^ permalink raw reply related [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 12:10 ` Patrick McHardy
@ 2010-02-03 18:38 ` Jon Masters
2010-02-03 19:09 ` Alexey Dobriyan
2010-02-03 20:21 ` Jon Masters
1 sibling, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-03 18:38 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 13:10 +0100, Patrick McHardy wrote:
> Patrick McHardy wrote:
> > Jon Masters wrote:
> >> On Tue, 2010-02-02 at 19:58 +0200, Alexey Dobriyan wrote:
> >>
> >>> Yes, moving to init_net-only function is fine.
> >> So moving the "setup up fake conntrack" bits to init_init_net from
> >> init_net still results in the panic, which means that the use count
> >> really is dropping to zero and we really are trying to free it when
> >> using multiple namespaces. Per ns is probably an easier way to go.
> >
> > Agreed, that will also avoid problems in the future with the
> > ct_net pointer pointing to &init_net. I'll take care of this
> > tommorrow.
>
> Unfortunately a per-namespace conntrack is not easily possible without
> larger changes (most of which are already queued in nf-next-2.6.git
> though). So for now I just moved the untrack handling to the init_net
> setup and cleanup functions and we can try to fix the remainder in
> 2.6.34.
Ok. I'd love to help out actually, given that I've been poking at this,
and it's quite fun. So please at least send me patches. The only other
thing I consider a priority issue at the moment for this is that writing
into /sys/module/nf_conntrack/parameters/hashsize on a running system
with multiple namespaces will cause the system to corrupt random memory
silently and fall over. That probably needs fixing until there is
per-namespace hashsize tracking, and this isn't a global tunable.
Also, some other things I think are required before 2.6.34:
*). Per namespace cacheing allocation (the cachep bits). We know it's
still possible for weirdness to happen in the SLAB cache here.
*). Per namespace hashsize tracking. Existing code corrupts hashtables
if the global size is changed when there is more than one netns
*). Per namespace expectations. This is for similar reasons to the need
for multiple hashtables, though I haven't poked at that.
I also think it is necessary to expose net namespace layout and
configuration via sysfs or some other interface, add a net->id parameter
(and may even an optional name), etc. Where does netns discussion
happen, on netdev I would presume?
> Jon, could you give this patch a try please?
Yup. Box is stable and boots multiple virtual machines as it did with
the quick hack from yesterday, so this has now fixed the problem.
Can you let me know if this is the final patch you want to post? If so,
we should get this into stable asap (and I have a couple of vendor
kernels that will need a version of this fix also).
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 18:38 ` Jon Masters
@ 2010-02-03 19:09 ` Alexey Dobriyan
2010-02-03 19:43 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-03 19:09 UTC (permalink / raw)
To: Jon Masters
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> *). Per namespace cacheing allocation (the cachep bits). We know it's
> still possible for weirdness to happen in the SLAB cache here.
Tiny race, needs reproducer.
> *). Per namespace hashsize tracking. Existing code corrupts hashtables
> if the global size is changed when there is more than one netns
I think, no.
Changing hash size will change hashsize for all netns, current and future.
> *). Per namespace expectations. This is for similar reasons to the need
> for multiple hashtables, though I haven't poked at that.
Expectation cache is not SLAB_DESTROY_BY_RCU, so the logic doesn't
apply, I hope.
> I also think it is necessary to expose net namespace layout
Not necessary. Why?
> and configuration via sysfs
Which configuration?
> or some other interface, add a net->id parameter (and may even an optional name),
No name, please :-)
->id is more or less required for per-netns conntrack cache.
> etc. Where does netns discussion happen, on netdev I would presume?
Yep. And containters@, I think.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:09 ` Alexey Dobriyan
@ 2010-02-03 19:43 ` Jon Masters
2010-02-03 19:46 ` Jon Masters
` (2 more replies)
0 siblings, 3 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-03 19:43 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> > *). Per namespace cacheing allocation (the cachep bits). We know it's
> > still possible for weirdness to happen in the SLAB cache here.
>
> Tiny race, needs reproducer.
Maybe. I think it's worth fixing anyway.
> > *). Per namespace hashsize tracking. Existing code corrupts hashtables
> > if the global size is changed when there is more than one netns
>
> I think, no.
> Changing hash size will change hashsize for all netns, current and future.
Nope. Look at the logic in nf_conntrack_set_hashsize where you iterate
over init_net.ct.hash but don't touch other namespaces. So then you go
setting nf_conntrack_htable_size and will deference that in accessing
other per-namespace hashtables using the wrong size information.
> > *). Per namespace expectations. This is for similar reasons to the need
> > for multiple hashtables, though I haven't poked at that.
>
> Expectation cache is not SLAB_DESTROY_BY_RCU, so the logic doesn't
> apply, I hope.
I'm not sure, I didn't poke at that much yet. But hashtables certainly
need fixing unless you can point out where I'm wrong.
> > I also think it is necessary to expose net namespace layout
>
> Not necessary. Why?
How am I as a sysadmin supposed to figure out which net namespaces exist
on my system, and as a developer, supposed to debug these situations?
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:43 ` Jon Masters
@ 2010-02-03 19:46 ` Jon Masters
2010-02-03 19:53 ` Alexey Dobriyan
2010-02-03 19:51 ` Alexey Dobriyan
2010-02-04 12:25 ` Patrick McHardy
2 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-03 19:46 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 14:44 -0500, Jon Masters wrote:
> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> > On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> > > I also think it is necessary to expose net namespace layout
> >
> > Not necessary. Why?
>
> How am I as a sysadmin supposed to figure out which net namespaces exist
> on my system, and as a developer, supposed to debug these situations?
(without Jason's excellent kgdb patches, which really help)
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:46 ` Jon Masters
@ 2010-02-03 19:53 ` Alexey Dobriyan
2010-02-03 20:04 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-03 19:53 UTC (permalink / raw)
To: Jon Masters
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, Feb 03, 2010 at 02:46:11PM -0500, Jon Masters wrote:
> On Wed, 2010-02-03 at 14:44 -0500, Jon Masters wrote:
> > On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> > > On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>
> > > > I also think it is necessary to expose net namespace layout
> > >
> > > Not necessary. Why?
> >
> > How am I as a sysadmin supposed to figure out which net namespaces exist
> > on my system, and as a developer, supposed to debug these situations?
>
> (without Jason's excellent kgdb patches, which really help)
Oh! Just like as usual, thinking and looking at oops and code.
Because when box is dead, netns info is not goint to be printed
anyway.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:53 ` Alexey Dobriyan
@ 2010-02-03 20:04 ` Jon Masters
0 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-03 20:04 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 21:53 +0200, Alexey Dobriyan wrote:
> On Wed, Feb 03, 2010 at 02:46:11PM -0500, Jon Masters wrote:
> > On Wed, 2010-02-03 at 14:44 -0500, Jon Masters wrote:
> > > On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> > > > On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> >
> > > > > I also think it is necessary to expose net namespace layout
> > > >
> > > > Not necessary. Why?
> > >
> > > How am I as a sysadmin supposed to figure out which net namespaces exist
> > > on my system, and as a developer, supposed to debug these situations?
> >
> > (without Jason's excellent kgdb patches, which really help)
>
> Oh! Just like as usual, thinking and looking at oops and code.
>
> Because when box is dead, netns info is not goint to be printed
> anyway.
It would really have been helpful over the weekend to have just been
able to look around in sysfs to see what was going on with namespaces.
I'm not saying it can't be done in a debugger, or by poking at
backtraces, but it's easier to say to the many people who had this
problem in Fedora kernels "tell me what this file says" just like we do
for slab, memory, whatever other information. Just a suggestion.
> And what do you want to see at it?
Just an ability to walk through which namespaces exist (debugfs even)
and which resources are currently assigned to them. Somehow. I didn't
give it a lot of thought, but something would be useful.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:43 ` Jon Masters
2010-02-03 19:46 ` Jon Masters
@ 2010-02-03 19:51 ` Alexey Dobriyan
2010-02-03 19:53 ` Jon Masters
2010-02-04 12:25 ` Patrick McHardy
2 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-03 19:51 UTC (permalink / raw)
To: Jon Masters
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, Feb 03, 2010 at 02:43:47PM -0500, Jon Masters wrote:
> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> > On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> > > *). Per namespace hashsize tracking. Existing code corrupts hashtables
> > > if the global size is changed when there is more than one netns
> >
> > I think, no.
> > Changing hash size will change hashsize for all netns, current and future.
>
> Nope. Look at the logic in nf_conntrack_set_hashsize where you iterate
> over init_net.ct.hash but don't touch other namespaces. So then you go
> setting nf_conntrack_htable_size and will deference that in accessing
> other per-namespace hashtables using the wrong size information.
> > > I also think it is necessary to expose net namespace layout
> >
> > Not necessary. Why?
>
> How am I as a sysadmin supposed to figure out which net namespaces exist
> on my system, and as a developer, supposed to debug these situations?
We don't expose many relations to userspace, and it's generally fine.
As a developer you fire a debugger and look at net_namespace_list.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:51 ` Alexey Dobriyan
@ 2010-02-03 19:53 ` Jon Masters
2010-02-03 20:01 ` Alexey Dobriyan
0 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-03 19:53 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 21:51 +0200, Alexey Dobriyan wrote:
> On Wed, Feb 03, 2010 at 02:43:47PM -0500, Jon Masters wrote:
> > On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> > > On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> > > > I also think it is necessary to expose net namespace layout
> > >
> > > Not necessary. Why?
> >
> > How am I as a sysadmin supposed to figure out which net namespaces exist
> > on my system, and as a developer, supposed to debug these situations?
>
> We don't expose many relations to userspace, and it's generally fine.
I can see slabs via /proc, memory layout, heck I can even expose the
kernel page tables if I really want to. I guess that's not too many :)
> As a developer you fire a debugger and look at net_namespace_list.
Yeah, but being able to cat a nice file is always handy.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:53 ` Jon Masters
@ 2010-02-03 20:01 ` Alexey Dobriyan
0 siblings, 0 replies; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-03 20:01 UTC (permalink / raw)
To: Jon Masters
Cc: Patrick McHardy, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, Feb 03, 2010 at 02:53:58PM -0500, Jon Masters wrote:
> > As a developer you fire a debugger and look at net_namespace_list.
>
> Yeah, but being able to cat a nice file is always handy.
And what do you want to see at it?
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 19:43 ` Jon Masters
2010-02-03 19:46 ` Jon Masters
2010-02-03 19:51 ` Alexey Dobriyan
@ 2010-02-04 12:25 ` Patrick McHardy
2010-02-04 12:27 ` Alexey Dobriyan
2 siblings, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 12:25 UTC (permalink / raw)
To: Jon Masters
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Jon Masters wrote:
> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
>>> still possible for weirdness to happen in the SLAB cache here.
>> Tiny race, needs reproducer.
>
> Maybe. I think it's worth fixing anyway.
Absolutely, I'll also apply Eric's patch with the %p fix for the
slab name.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 12:25 ` Patrick McHardy
@ 2010-02-04 12:27 ` Alexey Dobriyan
2010-02-04 12:30 ` Patrick McHardy
0 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-04 12:27 UTC (permalink / raw)
To: Patrick McHardy
Cc: Jon Masters, Eric Dumazet, linux-kernel, netdev, netfilter-devel,
Paul E. McKenney
On Thu, Feb 4, 2010 at 2:25 PM, Patrick McHardy <kaber@trash.net> wrote:
> Jon Masters wrote:
>> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
>>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
>>>> still possible for weirdness to happen in the SLAB cache here.
>>> Tiny race, needs reproducer.
>>
>> Maybe. I think it's worth fixing anyway.
>
> Absolutely, I'll also apply Eric's patch with the %p fix for the
> slab name.
This would show kernel pointers in userspace ;-)
So, net->id is required.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 12:27 ` Alexey Dobriyan
@ 2010-02-04 12:30 ` Patrick McHardy
2010-02-04 12:35 ` Alexey Dobriyan
0 siblings, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 12:30 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Jon Masters, Eric Dumazet, linux-kernel, netdev, netfilter-devel,
Paul E. McKenney
Alexey Dobriyan wrote:
> On Thu, Feb 4, 2010 at 2:25 PM, Patrick McHardy <kaber@trash.net> wrote:
>> Jon Masters wrote:
>>> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
>>>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>>>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
>>>>> still possible for weirdness to happen in the SLAB cache here.
>>>> Tiny race, needs reproducer.
>>> Maybe. I think it's worth fixing anyway.
>> Absolutely, I'll also apply Eric's patch with the %p fix for the
>> slab name.
>
> This would show kernel pointers in userspace ;-)
> So, net->id is required.
I don't see the problem. But yes, it would be nicer to have an ID.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 12:30 ` Patrick McHardy
@ 2010-02-04 12:35 ` Alexey Dobriyan
2010-02-04 13:04 ` Patrick McHardy
0 siblings, 1 reply; 71+ messages in thread
From: Alexey Dobriyan @ 2010-02-04 12:35 UTC (permalink / raw)
To: Patrick McHardy
Cc: Jon Masters, Eric Dumazet, linux-kernel, netdev, netfilter-devel,
Paul E. McKenney
On Thu, Feb 4, 2010 at 2:30 PM, Patrick McHardy <kaber@trash.net> wrote:
> Alexey Dobriyan wrote:
>> On Thu, Feb 4, 2010 at 2:25 PM, Patrick McHardy <kaber@trash.net> wrote:
>>> Jon Masters wrote:
>>>> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
>>>>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>>>>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
>>>>>> still possible for weirdness to happen in the SLAB cache here.
>>>>> Tiny race, needs reproducer.
>>>> Maybe. I think it's worth fixing anyway.
>>> Absolutely, I'll also apply Eric's patch with the %p fix for the
>>> slab name.
>>
>> This would show kernel pointers in userspace ;-)
>> So, net->id is required.
>
> I don't see the problem. But yes, it would be nicer to have an ID.
This is done (or rather, not done) to not show attackers
where data structures are.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 12:35 ` Alexey Dobriyan
@ 2010-02-04 13:04 ` Patrick McHardy
2010-02-04 13:18 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 13:04 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: Jon Masters, Eric Dumazet, linux-kernel, netdev, netfilter-devel,
Paul E. McKenney
Alexey Dobriyan wrote:
> On Thu, Feb 4, 2010 at 2:30 PM, Patrick McHardy <kaber@trash.net> wrote:
>> Alexey Dobriyan wrote:
>>> On Thu, Feb 4, 2010 at 2:25 PM, Patrick McHardy <kaber@trash.net> wrote:
>>>> Jon Masters wrote:
>>>>> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
>>>>>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
>>>>>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
>>>>>>> still possible for weirdness to happen in the SLAB cache here.
>>>>>> Tiny race, needs reproducer.
>>>>> Maybe. I think it's worth fixing anyway.
>>>> Absolutely, I'll also apply Eric's patch with the %p fix for the
>>>> slab name.
>>> This would show kernel pointers in userspace ;-)
>>> So, net->id is required.
>> I don't see the problem. But yes, it would be nicer to have an ID.
>
> This is done (or rather, not done) to not show attackers
> where data structures are.
That's news to me, my /proc is full of kernel space pointers,
including data.
In any case, we need a fix for this suitable for 2.6.33. If
you don't like using the pointer, please send a patch to add
an id to the network namespaces.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 13:04 ` Patrick McHardy
@ 2010-02-04 13:18 ` Jon Masters
2010-02-04 13:37 ` Patrick McHardy
0 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-04 13:18 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Thu, 2010-02-04 at 14:04 +0100, Patrick McHardy wrote:
> Alexey Dobriyan wrote:
> > On Thu, Feb 4, 2010 at 2:30 PM, Patrick McHardy <kaber@trash.net> wrote:
> >> Alexey Dobriyan wrote:
> >>> On Thu, Feb 4, 2010 at 2:25 PM, Patrick McHardy <kaber@trash.net> wrote:
> >>>> Jon Masters wrote:
> >>>>> On Wed, 2010-02-03 at 21:09 +0200, Alexey Dobriyan wrote:
> >>>>>> On Wed, Feb 03, 2010 at 01:38:09PM -0500, Jon Masters wrote:
> >>>>>>> *). Per namespace cacheing allocation (the cachep bits). We know it's
> >>>>>>> still possible for weirdness to happen in the SLAB cache here.
> >>>>>> Tiny race, needs reproducer.
> >>>>> Maybe. I think it's worth fixing anyway.
> >>>> Absolutely, I'll also apply Eric's patch with the %p fix for the
> >>>> slab name.
> >>> This would show kernel pointers in userspace ;-)
> >>> So, net->id is required.
> >> I don't see the problem. But yes, it would be nicer to have an ID.
> >
> > This is done (or rather, not done) to not show attackers
> > where data structures are.
>
> That's news to me, my /proc is full of kernel space pointers,
> including data.
And anyway, you can guess it in many cases. I don't think there's a huge
problem, but you could of course just stash a global atomic somewhere to
keep track of how many of these you've made if that's easier for now.
> In any case, we need a fix for this suitable for 2.6.33. If
> you don't like using the pointer, please send a patch to add
> an id to the network namespaces.
Right. I think the quick solution is fine for 2.6.33. So that makes the
hashtable non-resize patch, the crash fix, and the cachep bits. I will
try to get involved and help you out with the per-ns hashtable clean
rather than just being a whiner :)
Thanks a bunch! Fedora kernels have already been built with this fix,
since it will allow us to close a fair number of "KVM goes boom" bugs.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 13:18 ` Jon Masters
@ 2010-02-04 13:37 ` Patrick McHardy
2010-02-04 13:42 ` Jon Masters
0 siblings, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 13:37 UTC (permalink / raw)
To: Jon Masters
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
[-- Attachment #1: Type: text/plain, Size: 838 bytes --]
Jon Masters wrote:
> On Thu, 2010-02-04 at 14:04 +0100, Patrick McHardy wrote:
>> In any case, we need a fix for this suitable for 2.6.33. If
>> you don't like using the pointer, please send a patch to add
>> an id to the network namespaces.
>
> Right. I think the quick solution is fine for 2.6.33. So that makes the
> hashtable non-resize patch, the crash fix, and the cachep bits. I will
> try to get involved and help you out with the per-ns hashtable clean
> rather than just being a whiner :)
This is the patch I'm going to commit unless unless there are further
objections. Its Eric's patch with a change on top to allocate a unique
name for the slab.
> Thanks a bunch! Fedora kernels have already been built with this fix,
> since it will allow us to close a fair number of "KVM goes boom" bugs.
Thanks as well for your help.
[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3859 bytes --]
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index ba1ba0c..aed23b6 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -11,6 +11,7 @@ struct nf_conntrack_ecache;
struct netns_ct {
atomic_t count;
unsigned int expect_count;
+ struct kmem_cache *nf_conntrack_cachep;
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
@@ -28,5 +29,6 @@ struct netns_ct {
#endif
int hash_vmalloc;
int expect_vmalloc;
+ char *slabname;
};
#endif
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 37e2b88..7ac027a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -63,8 +63,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
-static struct kmem_cache *nf_conntrack_cachep __read_mostly;
-
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;
@@ -572,7 +570,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL) {
pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
@@ -611,7 +609,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
atomic_dec(&net->ct.count);
nf_ct_ext_free(ct);
- kmem_cache_free(nf_conntrack_cachep, ct);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1119,7 +1117,6 @@ static void nf_conntrack_cleanup_init_net(void)
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
- kmem_cache_destroy(nf_conntrack_cachep);
}
static void nf_conntrack_cleanup_net(struct net *net)
@@ -1137,6 +1134,8 @@ static void nf_conntrack_cleanup_net(struct net *net)
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
free_percpu(net->ct.stat);
}
@@ -1272,15 +1271,6 @@ static int nf_conntrack_init_init_net(void)
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
- nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
- sizeof(struct nf_conn),
- 0, SLAB_DESTROY_BY_RCU, NULL);
- if (!nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- ret = -ENOMEM;
- goto err_cache;
- }
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -1302,8 +1292,6 @@ static int nf_conntrack_init_init_net(void)
err_helper:
nf_conntrack_proto_fini();
err_proto:
- kmem_cache_destroy(nf_conntrack_cachep);
-err_cache:
return ret;
}
@@ -1325,6 +1313,19 @@ static int nf_conntrack_init_net(struct net *net)
ret = -ENOMEM;
goto err_stat;
}
+
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ if (!net->ct.slabname)
+ goto err_slabname;
+
+ net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ ret = -ENOMEM;
+ goto err_cache;
+ }
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
@@ -1352,6 +1353,10 @@ err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+ kfree(net->ct.slabname);
+err_slabname:
free_percpu(net->ct.stat);
err_stat:
return ret;
^ permalink raw reply related [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-04 13:37 ` Patrick McHardy
@ 2010-02-04 13:42 ` Jon Masters
0 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-04 13:42 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Thu, 2010-02-04 at 14:37 +0100, Patrick McHardy wrote:
> Jon Masters wrote:
> > On Thu, 2010-02-04 at 14:04 +0100, Patrick McHardy wrote:
> >> In any case, we need a fix for this suitable for 2.6.33. If
> >> you don't like using the pointer, please send a patch to add
> >> an id to the network namespaces.
> >
> > Right. I think the quick solution is fine for 2.6.33. So that makes the
> > hashtable non-resize patch, the crash fix, and the cachep bits. I will
> > try to get involved and help you out with the per-ns hashtable clean
> > rather than just being a whiner :)
>
> This is the patch I'm going to commit unless unless there are further
> objections. Its Eric's patch with a change on top to allocate a unique
> name for the slab.
>
> > Thanks a bunch! Fedora kernels have already been built with this fix,
> > since it will allow us to close a fair number of "KVM goes boom" bugs.
>
> Thanks as well for your help.
Patch looks fine to me for the moment, unless it *really* matters about
the pointer exposure.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 12:10 ` Patrick McHardy
2010-02-03 18:38 ` Jon Masters
@ 2010-02-03 20:21 ` Jon Masters
2010-02-04 12:24 ` Patrick McHardy
1 sibling, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-03 20:21 UTC (permalink / raw)
To: Patrick McHardy
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Wed, 2010-02-03 at 13:10 +0100, Patrick McHardy wrote:
> Jon, could you give this patch a try please?
> plain text document attachment (x)
> commit 056ff3e3bd1563969a311697323ff929df94415c
> Author: Patrick McHardy <kaber@trash.net>
> Date: Wed Feb 3 12:58:06 2010 +0100
Patrick, can I regard this as the official fix for 2.6.33?
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-03 20:21 ` Jon Masters
@ 2010-02-04 12:24 ` Patrick McHardy
0 siblings, 0 replies; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 12:24 UTC (permalink / raw)
To: Jon Masters
Cc: Alexey Dobriyan, Eric Dumazet, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Jon Masters wrote:
> On Wed, 2010-02-03 at 13:10 +0100, Patrick McHardy wrote:
>
>> Jon, could you give this patch a try please?
>> plain text document attachment (x)
>> commit 056ff3e3bd1563969a311697323ff929df94415c
>> Author: Patrick McHardy <kaber@trash.net>
>> Date: Wed Feb 3 12:58:06 2010 +0100
>
> Patrick, can I regard this as the official fix for 2.6.33?
Yes, I'll send it upstream today. Thanks for your help.
^ permalink raw reply [flat|nested] 71+ messages in thread
* PROBLEM with summary: Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 16:46 ` Jon Masters
2010-02-02 16:48 ` Patrick McHardy
@ 2010-02-02 16:58 ` Jon Masters
2010-02-02 17:04 ` Patrick McHardy
1 sibling, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 16:58 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 11:46 -0500, Jon Masters wrote:
> On Tue, 2010-02-02 at 06:35 -0500, Jon Masters wrote:
>
> > I think there's something more fundamental going on here.
>
> What happens is the conntrack code attempts to free
> nf_conntrack_untracked back into the SL[U]B cache from which it
> allocates other ct's. There's just one problem...that's a static struct
> not from the cache. So, this is why we end up with the SLAB being
> corrupted and the address immediately following the
> nf_conntrack_untracked being corrupted.
>
> I shoved some debug comments into the destroy code to see if we were
> trying to free nf_conntrack_untracked, and bingo. I have shoved a panic
> in there now, will send you a backtrace.
So we attach after starting a VM due to icmpv6_error setting the ct in
some incoming skb to the "untracked" catchall one. Then we panic when I
force a panic if attempting to free the nf_conntrack_untracked static
struct with this backtrace:
#5 0xffffffff81455884 in panic (
fmt=0xffffffff81ec51e0 "JCM: nf_conntrack_destroy: trying to destroy
nf_conntrack_untracked!\n")
at kernel/panic.c:73
#6 0xffffffff813d266c in nf_conntrack_destroy (nfct=<value optimized
out>) at net/netfilter/core.c:244
#7 0xffffffff813abd97 in nf_conntrack_put (skb=0xffff880223b8b700) at
include/linux/skbuff.h:1924
#8 skb_release_head_state (skb=0xffff880223b8b700) at
net/core/skbuff.c:402
#9 0xffffffff813abaf9 in skb_release_all (skb=0xffff880223b8b700) at
net/core/skbuff.c:420
#10 __kfree_skb (skb=0xffff880223b8b700) at net/core/skbuff.c:435
#11 0xffffffff813abbfe in kfree_skb (skb=0xffff880223b8b700) at
net/core/skbuff.c:456
Could you please add (or recommend for me to test) some logic to catch
attempts to free the nf_conntrack_untracked and prevent it? Also, maybe
in the init_net code you could remove the re-initialization of the
untracked conntrack each time a namespace is created?
Thanks!
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: PROBLEM with summary: Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 16:58 ` PROBLEM with summary: " Jon Masters
@ 2010-02-02 17:04 ` Patrick McHardy
2010-02-02 17:16 ` Eric Dumazet
0 siblings, 1 reply; 71+ messages in thread
From: Patrick McHardy @ 2010-02-02 17:04 UTC (permalink / raw)
To: Jon Masters
Cc: Eric Dumazet, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Jon Masters wrote:
> On Tue, 2010-02-02 at 11:46 -0500, Jon Masters wrote:
>> On Tue, 2010-02-02 at 06:35 -0500, Jon Masters wrote:
>>
>>> I think there's something more fundamental going on here.
>> What happens is the conntrack code attempts to free
>> nf_conntrack_untracked back into the SL[U]B cache from which it
>> allocates other ct's. There's just one problem...that's a static struct
>> not from the cache. So, this is why we end up with the SLAB being
>> corrupted and the address immediately following the
>> nf_conntrack_untracked being corrupted.
>>
>> I shoved some debug comments into the destroy code to see if we were
>> trying to free nf_conntrack_untracked, and bingo. I have shoved a panic
>> in there now, will send you a backtrace.
>
> So we attach after starting a VM due to icmpv6_error setting the ct in
> some incoming skb to the "untracked" catchall one. Then we panic when I
> force a panic if attempting to free the nf_conntrack_untracked static
> struct with this backtrace:
>
> #5 0xffffffff81455884 in panic (
> fmt=0xffffffff81ec51e0 "JCM: nf_conntrack_destroy: trying to destroy
> nf_conntrack_untracked!\n")
> at kernel/panic.c:73
> #6 0xffffffff813d266c in nf_conntrack_destroy (nfct=<value optimized
> out>) at net/netfilter/core.c:244
> #7 0xffffffff813abd97 in nf_conntrack_put (skb=0xffff880223b8b700) at
> include/linux/skbuff.h:1924
> #8 skb_release_head_state (skb=0xffff880223b8b700) at
> net/core/skbuff.c:402
> #9 0xffffffff813abaf9 in skb_release_all (skb=0xffff880223b8b700) at
> net/core/skbuff.c:420
> #10 __kfree_skb (skb=0xffff880223b8b700) at net/core/skbuff.c:435
> #11 0xffffffff813abbfe in kfree_skb (skb=0xffff880223b8b700) at
> net/core/skbuff.c:456
>
> Could you please add (or recommend for me to test) some logic to catch
> attempts to free the nf_conntrack_untracked and prevent it? Also, maybe
> in the init_net code you could remove the re-initialization of the
> untracked conntrack each time a namespace is created?
Ah nice catch, that seems to be the problem. When the untracked
conntrack is already attached to an skb and thus has refcnt > 1
and we re-initalize the refcnt, it will get freed.
The question is whether the ct_net pointer of the untracked conntrack
is actually required. If so, we need one instance per namespace,
otherwise we can just move initialization and cleanup to the init_net
init/cleanup functions. Alexey, do you happen to know this?
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: PROBLEM with summary: Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 17:04 ` Patrick McHardy
@ 2010-02-02 17:16 ` Eric Dumazet
0 siblings, 0 replies; 71+ messages in thread
From: Eric Dumazet @ 2010-02-02 17:16 UTC (permalink / raw)
To: Patrick McHardy
Cc: Jon Masters, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Le mardi 02 février 2010 à 18:04 +0100, Patrick McHardy a écrit :
> Ah nice catch, that seems to be the problem. When the untracked
> conntrack is already attached to an skb and thus has refcnt > 1
> and we re-initalize the refcnt, it will get freed.
>
> The question is whether the ct_net pointer of the untracked conntrack
> is actually required. If so, we need one instance per namespace,
> otherwise we can just move initialization and cleanup to the init_net
> init/cleanup functions. Alexey, do you happen to know this?
>
One untracked per netns seems the way to go, and move it outside of
read_mostly area too, we obviously can modify its refcount frequently...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: PROBLEM with summary: Re: [PATCH] netfilter: per netns nf_conntrack_cachep
@ 2010-02-02 17:16 ` Eric Dumazet
0 siblings, 0 replies; 71+ messages in thread
From: Eric Dumazet @ 2010-02-02 17:16 UTC (permalink / raw)
To: Patrick McHardy
Cc: Jon Masters, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
Le mardi 02 février 2010 à 18:04 +0100, Patrick McHardy a écrit :
> Ah nice catch, that seems to be the problem. When the untracked
> conntrack is already attached to an skb and thus has refcnt > 1
> and we re-initalize the refcnt, it will get freed.
>
> The question is whether the ct_net pointer of the untracked conntrack
> is actually required. If so, we need one instance per namespace,
> otherwise we can just move initialization and cleanup to the init_net
> init/cleanup functions. Alexey, do you happen to know this?
>
One untracked per netns seems the way to go, and move it outside of
read_mostly area too, we obviously can modify its refcount frequently...
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: PROBLEM with summary: Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 17:16 ` Eric Dumazet
(?)
@ 2010-02-02 17:23 ` Jon Masters
-1 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 17:23 UTC (permalink / raw)
To: Eric Dumazet
Cc: Patrick McHardy, Alexey Dobriyan, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Tue, 2010-02-02 at 18:16 +0100, Eric Dumazet wrote:
> Le mardi 02 février 2010 à 18:04 +0100, Patrick McHardy a écrit :
>
> > Ah nice catch, that seems to be the problem. When the untracked
> > conntrack is already attached to an skb and thus has refcnt > 1
> > and we re-initalize the refcnt, it will get freed.
> >
> > The question is whether the ct_net pointer of the untracked conntrack
> > is actually required. If so, we need one instance per namespace,
> > otherwise we can just move initialization and cleanup to the init_net
> > init/cleanup functions. Alexey, do you happen to know this?
> >
>
> One untracked per netns seems the way to go, and move it outside of
> read_mostly area too, we obviously can modify its refcount frequently...
Sure, that will work. Also, rather than just the NF_CT_ASSERT on the use
count, maybe worth catching the specific case of trying to free the
untracked ct, but that's only if it's not a horrible fast path.
Anyway, thanks. If you want to send me a patch, I'll try it.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 14:52 ` Eric Dumazet
(?)
(?)
@ 2010-02-02 4:36 ` Jon Masters
2010-02-02 7:02 ` Jon Masters
-1 siblings, 1 reply; 71+ messages in thread
From: Jon Masters @ 2010-02-02 4:36 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Mon, 2010-02-01 at 15:52 +0100, Eric Dumazet wrote:
> [PATCH] netfilter: per netns nf_conntrack_cachep
>
> nf_conntrack_cachep is currently shared by all netns instances, but
> because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
>
> If we use a shared slab cache, one object can instantly flight between
> one hash table (netns ONE) to another one (netns TWO), and concurrent
> reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
> can be fooled without notice, because no RCU grace period has to be
> observed between object freeing and its reuse.
>
> We dont have this problem with UDP/TCP slab caches because TCP/UDP
> hashtables are global to the machine (and each object has a pointer to
> its netns).
>
> If we use per netns conntrack hash tables, we also *must* use per netns
> conntrack slab caches, to guarantee an object can not escape from one
> namespace to another one.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
You're totally right, I'd missed this (RCU behavior wrt SLAB caches was
one of these black magic voodoo things until Peter Z. set me straight
with his explanation that it only applies to the freeing of the cache
itself, not the objects - that makes sense in the grand scheme of what
RCU is trying to achieve, and so in theory, yeah we could just verify
the ct object we get back out of the cache is from the same ns, should
work just as well as doing per-ns caches, but not as clean IMO). I'm
still not sure it explains the specific corruption I'm seeing, but I
just made some coffee and put on some T. Rex to help me think.
Jon.
P.S. What's up with all the "Welcome, Mr. Bond" and "i_see_dead_people"
and other comments in that code anyway? If you're going to use movie
references, perhaps standardize on one particular genre :)
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-02 4:36 ` Jon Masters
@ 2010-02-02 7:02 ` Jon Masters
0 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 7:02 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Mon, 2010-02-01 at 23:36 -0500, Jon Masters wrote:
> On Mon, 2010-02-01 at 15:52 +0100, Eric Dumazet wrote:
>
> > [PATCH] netfilter: per netns nf_conntrack_cachep
> >
> > nf_conntrack_cachep is currently shared by all netns instances, but
> > because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
> >
> > If we use a shared slab cache, one object can instantly flight between
> > one hash table (netns ONE) to another one (netns TWO), and concurrent
> > reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
> > can be fooled without notice, because no RCU grace period has to be
> > observed between object freeing and its reuse.
> >
> > We dont have this problem with UDP/TCP slab caches because TCP/UDP
> > hashtables are global to the machine (and each object has a pointer to
> > its netns).
> >
> > If we use per netns conntrack hash tables, we also *must* use per netns
> > conntrack slab caches, to guarantee an object can not escape from one
> > namespace to another one.
> >
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>
> You're totally right, I'd missed this (RCU behavior wrt SLAB caches was
> one of these black magic voodoo things until Peter Z. set me straight
> with his explanation that it only applies to the freeing of the cache
> itself, not the objects - that makes sense in the grand scheme of what
> RCU is trying to achieve, and so in theory, yeah we could just verify
> the ct object we get back out of the cache is from the same ns, should
> work just as well as doing per-ns caches, but not as clean IMO). I'm
> still not sure it explains the specific corruption I'm seeing, but I
> just made some coffee and put on some T. Rex to help me think.
It happens in the kmem_cache_free in nf_conntrack_free (as I can
trivially confirm with printk), so you're almost certainly right, but I
am just shoving in a bunch of debug code equivalent to the logic in SLUB
kmem cache freeing to see precisely how that pointer corruption occurs,
mostly for curiosity, and to confirm for sure what happens.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 14:52 ` Eric Dumazet
` (2 preceding siblings ...)
(?)
@ 2010-02-02 10:47 ` Jon Masters
-1 siblings, 0 replies; 71+ messages in thread
From: Jon Masters @ 2010-02-02 10:47 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Patrick McHardy, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
On Mon, 2010-02-01 at 15:52 +0100, Eric Dumazet wrote:
> [PATCH] netfilter: per netns nf_conntrack_cachep
>
> nf_conntrack_cachep is currently shared by all netns instances, but
> because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
>
> If we use a shared slab cache, one object can instantly flight between
> one hash table (netns ONE) to another one (netns TWO), and concurrent
> reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
> can be fooled without notice, because no RCU grace period has to be
> observed between object freeing and its reuse.
I'll test this patch.
After some lengthy debugging, what actually happens here is that the
nf_conntrack_cachep SL*U*B gets corrupted such that the contained
per-cpu cpu_slabs are all pointing to the address of htable_size, which
is then helpfully set to be the value of the individual freelists (the
address of the base of the kmem_cache), or offset '51' into the table.
The worrying thing is it looks like this is actually corrupting other
random memory too, it just happens to bite once we get this far.
Jon.
^ permalink raw reply [flat|nested] 71+ messages in thread* Re: [PATCH] netfilter: per netns nf_conntrack_cachep
2010-02-01 14:52 ` Eric Dumazet
` (3 preceding siblings ...)
(?)
@ 2010-02-04 14:00 ` Patrick McHardy
-1 siblings, 0 replies; 71+ messages in thread
From: Patrick McHardy @ 2010-02-04 14:00 UTC (permalink / raw)
To: Eric Dumazet
Cc: Alexey Dobriyan, Jon Masters, linux-kernel, netdev,
netfilter-devel, Paul E. McKenney
[-- Attachment #1: Type: text/plain, Size: 994 bytes --]
Eric Dumazet wrote:
> [PATCH] netfilter: per netns nf_conntrack_cachep
>
> nf_conntrack_cachep is currently shared by all netns instances, but
> because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
>
> If we use a shared slab cache, one object can instantly flight between
> one hash table (netns ONE) to another one (netns TWO), and concurrent
> reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
> can be fooled without notice, because no RCU grace period has to be
> observed between object freeing and its reuse.
>
> We dont have this problem with UDP/TCP slab caches because TCP/UDP
> hashtables are global to the machine (and each object has a pointer to
> its netns).
>
> If we use per netns conntrack hash tables, we also *must* use per netns
> conntrack slab caches, to guarantee an object can not escape from one
> namespace to another one.
Applied with the discussed change to allocate a unique name (attached
again for reference), thanks Eric.
[-- Attachment #2: x --]
[-- Type: text/plain, Size: 5090 bytes --]
commit ab59b19be78aac65cdd599fb5002c9019885e061
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu Feb 4 14:54:05 2010 +0100
netfilter: nf_conntrack: per netns nf_conntrack_cachep
nf_conntrack_cachep is currently shared by all netns instances, but
because of SLAB_DESTROY_BY_RCU special semantics, this is wrong.
If we use a shared slab cache, one object can instantly flight between
one hash table (netns ONE) to another one (netns TWO), and concurrent
reader (doing a lookup in netns ONE, 'finding' an object of netns TWO)
can be fooled without notice, because no RCU grace period has to be
observed between object freeing and its reuse.
We dont have this problem with UDP/TCP slab caches because TCP/UDP
hashtables are global to the machine (and each object has a pointer to
its netns).
If we use per netns conntrack hash tables, we also *must* use per netns
conntrack slab caches, to guarantee an object can not escape from one
namespace to another one.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
[Patrick: added unique slab name allocation]
Signed-off-by: Patrick McHardy <kaber@trash.net>
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index ba1ba0c..aed23b6 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -11,6 +11,7 @@ struct nf_conntrack_ecache;
struct netns_ct {
atomic_t count;
unsigned int expect_count;
+ struct kmem_cache *nf_conntrack_cachep;
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
@@ -28,5 +29,6 @@ struct netns_ct {
#endif
int hash_vmalloc;
int expect_vmalloc;
+ char *slabname;
};
#endif
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 37e2b88..9de4bd4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -63,8 +63,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
-static struct kmem_cache *nf_conntrack_cachep __read_mostly;
-
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;
@@ -572,7 +570,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL) {
pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
@@ -611,7 +609,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
atomic_dec(&net->ct.count);
nf_ct_ext_free(ct);
- kmem_cache_free(nf_conntrack_cachep, ct);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1119,7 +1117,6 @@ static void nf_conntrack_cleanup_init_net(void)
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
- kmem_cache_destroy(nf_conntrack_cachep);
}
static void nf_conntrack_cleanup_net(struct net *net)
@@ -1137,6 +1134,8 @@ static void nf_conntrack_cleanup_net(struct net *net)
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
free_percpu(net->ct.stat);
}
@@ -1272,15 +1271,6 @@ static int nf_conntrack_init_init_net(void)
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
- nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
- sizeof(struct nf_conn),
- 0, SLAB_DESTROY_BY_RCU, NULL);
- if (!nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- ret = -ENOMEM;
- goto err_cache;
- }
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -1302,8 +1292,6 @@ static int nf_conntrack_init_init_net(void)
err_helper:
nf_conntrack_proto_fini();
err_proto:
- kmem_cache_destroy(nf_conntrack_cachep);
-err_cache:
return ret;
}
@@ -1325,6 +1313,21 @@ static int nf_conntrack_init_net(struct net *net)
ret = -ENOMEM;
goto err_stat;
}
+
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ if (!net->ct.slabname) {
+ ret = -ENOMEM;
+ goto err_slabname;
+ }
+
+ net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ ret = -ENOMEM;
+ goto err_cache;
+ }
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
@@ -1352,6 +1355,10 @@ err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+ kfree(net->ct.slabname);
+err_slabname:
free_percpu(net->ct.stat);
err_stat:
return ret;
^ permalink raw reply related [flat|nested] 71+ messages in thread