* [PATCH 0/1] netfilter: fix soft lockup when netlink adds new entries @ 2012-02-21 13:53 Jozsef Kadlecsik 2012-02-21 13:53 ` [PATCH 1/1] " Jozsef Kadlecsik 0 siblings, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-21 13:53 UTC (permalink / raw) To: netfilter-devel; +Cc: Pablo Neira Ayuso, Zambo Marcell, Jozsef Kadlecsik Hi Pablo, Please review the next patch and push forward as a bugfix. Best regards, Jozsef Jozsef Kadlecsik (1): netfilter: fix soft lockup when netlink adds new entries net/netfilter/nf_conntrack_netlink.c | 43 ++++++++++++--------------------- 1 files changed, 16 insertions(+), 27 deletions(-) ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-21 13:53 [PATCH 0/1] netfilter: fix soft lockup when netlink adds new entries Jozsef Kadlecsik @ 2012-02-21 13:53 ` Jozsef Kadlecsik 2012-02-21 14:52 ` Pablo Neira Ayuso 0 siblings, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-21 13:53 UTC (permalink / raw) To: netfilter-devel; +Cc: Pablo Neira Ayuso, Zambo Marcell, Jozsef Kadlecsik Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, soft lockup happens. This is because the nf_conntrack_lock is held while nf_conntrack_alloc is called, which is in turn wants to lock nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup with limiting the holding of the nf_conntrack_lock to the minimum, where it's absolutely required. Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> --- net/netfilter/nf_conntrack_netlink.c | 43 ++++++++++++--------------------- 1 files changed, 16 insertions(+), 27 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b03..cc70517 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1469,7 +1466,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); + spin_lock_bh(&nf_conntrack_lock); nf_conntrack_hash_insert(ct); + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); rcu_read_unlock(); return ct; @@ -1490,6 +1490,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1512,25 +1513,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); + spin_unlock_bh(&nf_conntrack_lock); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1543,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase 
the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1564,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 21+ messages in thread
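To make the reported lockup concrete: when the table is full, the conntrack allocator calls early_drop(), which takes nf_conntrack_lock to evict an old entry, and the ctnetlink caller was already holding that same lock. Below is a minimal user-space model of that pattern, not kernel code; the lock and function names are invented for illustration. An error-checking mutex is used so the second acquisition is reported instead of hanging, whereas the kernel spinlock simply spins until the soft-lockup watchdog complains.

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t table_lock;          /* stands in for nf_conntrack_lock */

    /* Eviction path: wants the table lock, like early_drop() called from
     * the conntrack allocator when the table is full. */
    static void evict_oldest_entry(void)
    {
            int err = pthread_mutex_lock(&table_lock);

            if (err) {
                    printf("second lock attempt: %s\n", strerror(err));
                    return;                     /* a spinlock would spin here forever */
            }
            /* ... drop an old entry ... */
            pthread_mutex_unlock(&table_lock);
    }

    int main(void)
    {
            pthread_mutexattr_t attr;

            pthread_mutexattr_init(&attr);
            /* Error-checking type so the relock is reported, not hung. */
            pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
            pthread_mutex_init(&table_lock, &attr);

            pthread_mutex_lock(&table_lock);    /* caller already holds the lock ... */
            evict_oldest_entry();               /* ... and the allocation path wants it again */
            pthread_mutex_unlock(&table_lock);
            return 0;
    }

The patch above avoids that situation by no longer holding nf_conntrack_lock across nf_conntrack_alloc() at all.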
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-21 13:53 ` [PATCH 1/1] " Jozsef Kadlecsik @ 2012-02-21 14:52 ` Pablo Neira Ayuso 2012-02-21 15:06 ` Jozsef Kadlecsik 0 siblings, 1 reply; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-21 14:52 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell Hi Jozsef, On Tue, Feb 21, 2012 at 02:53:57PM +0100, Jozsef Kadlecsik wrote: > Marcell Zambo and Janos Farago noticed and reported that when > new conntrack entries are added via netlink and the conntrack table > gets full, soft lockup happens. This is because the nf_conntrack_lock > is held while nf_conntrack_alloc is called, which is in turn wants > to lock nf_conntrack_lock while evicting entries from the full table. Good catch. > The patch fixes the soft lockup with limiting the holding of the > nf_conntrack_lock to the minimum, where it's absolutely required. Still one issue, see below. > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > --- > net/netfilter/nf_conntrack_netlink.c | 43 ++++++++++++--------------------- > 1 files changed, 16 insertions(+), 27 deletions(-) > > diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c > index 9307b03..cc70517 100644 > --- a/net/netfilter/nf_conntrack_netlink.c > +++ b/net/netfilter/nf_conntrack_netlink.c > @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > nf_ct_protonum(ct)); > if (helper == NULL) { > rcu_read_unlock(); > - spin_unlock_bh(&nf_conntrack_lock); > #ifdef CONFIG_MODULES > if (request_module("nfct-helper-%s", helpname) < 0) { > - spin_lock_bh(&nf_conntrack_lock); > err = -EOPNOTSUPP; > goto err1; > } > > - spin_lock_bh(&nf_conntrack_lock); > rcu_read_lock(); > helper = __nf_conntrack_helper_find(helpname, > nf_ct_l3num(ct), > @@ -1469,7 +1466,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > tstamp->start = ktime_to_ns(ktime_get_real()); > > add_timer(&ct->timeout); > + spin_lock_bh(&nf_conntrack_lock); > nf_conntrack_hash_insert(ct); > + nf_conntrack_get(&ct->ct_general); > + spin_unlock_bh(&nf_conntrack_lock); > rcu_read_unlock(); > > return ct; > @@ -1490,6 +1490,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > struct nf_conntrack_tuple otuple, rtuple; > struct nf_conntrack_tuple_hash *h = NULL; > struct nfgenmsg *nfmsg = nlmsg_data(nlh); > + struct nf_conn *ct; > u_int8_t u3 = nfmsg->nfgen_family; > u16 zone; > int err; > @@ -1512,25 +1513,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > > spin_lock_bh(&nf_conntrack_lock); > if (cda[CTA_TUPLE_ORIG]) > - h = __nf_conntrack_find(net, zone, &otuple); > + h = nf_conntrack_find_get(net, zone, &otuple); > else if (cda[CTA_TUPLE_REPLY]) > - h = __nf_conntrack_find(net, zone, &rtuple); > + h = nf_conntrack_find_get(net, zone, &rtuple); > + spin_unlock_bh(&nf_conntrack_lock); We still have to keep the lock for the update case. Otherwise we may clash with one update from the kernel. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-21 14:52 ` Pablo Neira Ayuso @ 2012-02-21 15:06 ` Jozsef Kadlecsik 2012-02-21 15:20 ` Pablo Neira Ayuso 2012-02-23 10:12 ` Pablo Neira Ayuso 0 siblings, 2 replies; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-21 15:06 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell Hi Pablo, On Tue, 21 Feb 2012, Pablo Neira Ayuso wrote: [...] > Still one issue, see below. > > > @@ -1512,25 +1513,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > > > > spin_lock_bh(&nf_conntrack_lock); > > if (cda[CTA_TUPLE_ORIG]) > > - h = __nf_conntrack_find(net, zone, &otuple); > > + h = nf_conntrack_find_get(net, zone, &otuple); > > else if (cda[CTA_TUPLE_REPLY]) > > - h = __nf_conntrack_find(net, zone, &rtuple); > > + h = nf_conntrack_find_get(net, zone, &rtuple); > > + spin_unlock_bh(&nf_conntrack_lock); > > We still have to keep the lock for the update case. Otherwise we may > clash with one update from the kernel. By calling nf_conntrack_find_get we are safe to release the lock, because the conntrack entry won't be removed behind us. Later in the patch the lock is re-acquired to serialize the updates: + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); Or do I miss something else here? Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply [flat|nested] 21+ messages in thread
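The guarantee referred to here is reference counting: nf_conntrack_find_get() bumps the entry's use count before the lock is dropped, so even if the entry is later removed from the table it cannot be freed until the caller does nf_ct_put(). A toy user-space model of that lifetime rule (names invented; ct->ct_general.use is the real counter):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
            atomic_int use;                     /* like ct->ct_general.use */
            /* ... tuples, timeout, ... */
    };

    static struct entry *entry_get(struct entry *e)
    {
            atomic_fetch_add(&e->use, 1);       /* what nf_conntrack_find_get() adds */
            return e;
    }

    static void entry_put(struct entry *e)
    {
            if (atomic_fetch_sub(&e->use, 1) == 1)  /* last reference frees, like nf_ct_put() */
                    free(e);
    }

    int main(void)
    {
            struct entry *e = calloc(1, sizeof(*e));

            if (!e)
                    return 1;
            atomic_init(&e->use, 1);            /* reference owned by the "table" */
            entry_get(e);                       /* lookup takes its own reference, lock can be dropped */
            entry_put(e);                       /* table drops its reference: still not freed */
            printf("entry still valid, use = %d\n", atomic_load(&e->use));
            entry_put(e);                       /* our reference was the last one: freed now */
            return 0;
    }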
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-21 15:06 ` Jozsef Kadlecsik @ 2012-02-21 15:20 ` Pablo Neira Ayuso 2012-02-23 10:12 ` Pablo Neira Ayuso 1 sibling, 0 replies; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-21 15:20 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Tue, Feb 21, 2012 at 04:06:59PM +0100, Jozsef Kadlecsik wrote: > Hi Pablo, > > On Tue, 21 Feb 2012, Pablo Neira Ayuso wrote: > [...] > > Still one issue, see below. > > > > > @@ -1512,25 +1513,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > > > > > > spin_lock_bh(&nf_conntrack_lock); > > > if (cda[CTA_TUPLE_ORIG]) > > > - h = __nf_conntrack_find(net, zone, &otuple); > > > + h = nf_conntrack_find_get(net, zone, &otuple); > > > else if (cda[CTA_TUPLE_REPLY]) > > > - h = __nf_conntrack_find(net, zone, &rtuple); > > > + h = nf_conntrack_find_get(net, zone, &rtuple); > > > + spin_unlock_bh(&nf_conntrack_lock); > > > > We still have to keep the lock for the update case. Otherwise we may > > clash with one update from the kernel. > > By calling nf_conntrack_find_get we are safe to release the lock, because > the conntack entry won't be removed behind us. Later in the patch the lock > is re-acquired to serialize the updates: > > + spin_lock_bh(&nf_conntrack_lock); > err = ctnetlink_change_conntrack(ct, cda); > + spin_unlock_bh(&nf_conntrack_lock); > > Or do I miss something else here? Oh, I missed that part of the patch. It's fine. I'll pass it for mainstream. Thanks Jozsef! ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-21 15:06 ` Jozsef Kadlecsik 2012-02-21 15:20 ` Pablo Neira Ayuso @ 2012-02-23 10:12 ` Pablo Neira Ayuso 2012-02-23 12:43 ` Jozsef Kadlecsik 1 sibling, 1 reply; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-23 10:12 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell Hi Jozsef, On Tue, Feb 21, 2012 at 04:06:59PM +0100, Jozsef Kadlecsik wrote: > Or do I miss something else here? I just noticed one problem. With your approach, we may lose a race if one packet inserts the same conntrack entry while we're adding one conntrack, resulting in two conntracks with the same tuples in the table. One possible solution would be to check if it already exists before adding it to the list, but this will add too many extra cycles for each conntrack that is added via ctnetlink. I'm also considering disabling early_drop from ctnetlink and returning -ENOMEM instead. I'm not sure the early drop mechanism makes sense via ctnetlink. If we hit ENOMEM from user-space while adding one new conntrack, we can iterate over the table and delete conntracks based on some criteria, then retry. ^ permalink raw reply [flat|nested] 21+ messages in thread
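The race described here is a check-then-insert window: the lookup and the insert are each done under the lock, but nothing keeps another inserter out between them. A small user-space model of the window, for illustration only; a plain counter stands in for "entries with this tuple" and the sleep merely widens the window:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int entries;                 /* "conntracks with this tuple" in the table */

    static void *add_entry(void *arg)
    {
            int found;

            pthread_mutex_lock(&table_lock);
            found = (entries > 0);      /* lookup, __nf_conntrack_find()-style */
            pthread_mutex_unlock(&table_lock);

            usleep(1000);               /* widen the race window for the demo */

            if (!found) {
                    pthread_mutex_lock(&table_lock);
                    entries++;          /* insert without re-checking */
                    pthread_mutex_unlock(&table_lock);
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, add_entry, NULL);
            pthread_create(&b, NULL, add_entry, NULL);
            pthread_join(a, NULL);
            pthread_join(b, NULL);

            printf("entries with the same tuple: %d\n", entries);  /* often 2 */
            return 0;
    }

The check-and-insert change discussed next closes the window by redoing the lookup under the same lock acquisition that protects the insert.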
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-23 10:12 ` Pablo Neira Ayuso @ 2012-02-23 12:43 ` Jozsef Kadlecsik 2012-02-23 15:46 ` Pablo Neira Ayuso 0 siblings, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-23 12:43 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell Hi Pablo, On Thu, 23 Feb 2012, Pablo Neira Ayuso wrote: > On Tue, Feb 21, 2012 at 04:06:59PM +0100, Jozsef Kadlecsik wrote: > > Or do I miss something else here? > > I just noticed one problem. > > With your approach, we may lose race if one packet inserts same conntrack > entry while we're adding one conntrack. Thus resulting in two conntracks > with the same tuples in the table. Yes, you're right, that race condition is possible. > One possible solution would be to check if it already exists before > adding it to the list, but this will add too many extra cycles for > each conntrack that is added via ctnetlink. Actually, netfilter does the same for normal conntrack entries in __nf_conntrack_confirm. So entries added via ctnetlink would not be penalized if the same checking were added to ctnetlink_create_conntrack in the locked region. Shall I send a patch on top of the previous one? > I'm also considering disabling early_drop from ctnetlink and to return > -ENOMEM instead. Not sure if it makes sense the early drop mechanism > via ctnetlink. If we hit ENOMEM from user-space while adding one new > conntrack, we can iterate over the table and delete conntrack based on > some criteria, then retry. There are advantages to early drop too: it's a reasonable roll of the dice, which can be sufficient in many cases. Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-23 12:43 ` Jozsef Kadlecsik @ 2012-02-23 15:46 ` Pablo Neira Ayuso 2012-02-23 20:44 ` Jozsef Kadlecsik 0 siblings, 1 reply; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-23 15:46 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Thu, Feb 23, 2012 at 01:43:06PM +0100, Jozsef Kadlecsik wrote: > Hi Pablo, > > On Thu, 23 Feb 2012, Pablo Neira Ayuso wrote: > > > On Tue, Feb 21, 2012 at 04:06:59PM +0100, Jozsef Kadlecsik wrote: > > > Or do I miss something else here? > > > > I just noticed one problem. > > > > With your approach, we may lose race if one packet inserts same conntrack > > entry while we're adding one conntrack. Thus resulting in two conntracks > > with the same tuples in the table. > > Yes, you're right, that race condition is possible. > > > One possible solution would be to check if it already exists before > > adding it to the list, but this will add too many extra cycles for > > each conntrack that is added via ctnetlink. > > Actually, netfilter for normal conntrack entries does the same in > __nf_conntrack_confirm. So entries added via ctnetlink would not be > penalized if the same checking were added to ctnetlink_create_conntrack > in the locked region. Shall I send a patch over the previous one? Yes, please. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-23 15:46 ` Pablo Neira Ayuso @ 2012-02-23 20:44 ` Jozsef Kadlecsik 2012-02-24 1:24 ` Pablo Neira Ayuso 0 siblings, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-23 20:44 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell On Thu, 23 Feb 2012, Pablo Neira Ayuso wrote: > On Thu, Feb 23, 2012 at 01:43:06PM +0100, Jozsef Kadlecsik wrote: > > > > On Thu, 23 Feb 2012, Pablo Neira Ayuso wrote: > > > > > On Tue, Feb 21, 2012 at 04:06:59PM +0100, Jozsef Kadlecsik wrote: > > > > Or do I miss something else here? > > > > > > I just noticed one problem. > > > > > > With your approach, we may lose race if one packet inserts same conntrack > > > entry while we're adding one conntrack. Thus resulting in two conntracks > > > with the same tuples in the table. > > > > Yes, you're right, that race condition is possible. > > > > > One possible solution would be to check if it already exists before > > > adding it to the list, but this will add too many extra cycles for > > > each conntrack that is added via ctnetlink. > > > > Actually, netfilter for normal conntrack entries does the same in > > __nf_conntrack_confirm. So entries added via ctnetlink would not be > > penalized if the same checking were added to ctnetlink_create_conntrack > > in the locked region. Shall I send a patch over the previous one? > > Yes, please. OK, here it comes: The previous patch with the title "netfilter: fix soft lockup when netlink adds new entries" introduced a race: conntrack and ctnetlink could insert the same entry twice into the hash table. The patch eliminates the race condition by using the same checking as conntrack confirm does. Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> --- include/net/netfilter/nf_conntrack.h | 2 + net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++++++++++++++ net/netfilter/nf_conntrack_netlink.c | 9 +++---- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8a2b0ae..48bfe75 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -210,6 +210,8 @@ __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern int nf_conntrack_hash_check_insert(struct net *net, u16 zone, + struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_insert_dying_list(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5..3d829d6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -418,6 +418,47 @@ void nf_conntrack_hash_insert(struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +int +nf_conntrack_hash_check_insert(struct net *net, u16 zone, struct nf_conn *ct) +{ + unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == 
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; +} +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index cc70517..379d34e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1465,11 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - spin_lock_bh(&nf_conntrack_lock); - nf_conntrack_hash_insert(ct); - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); + err = nf_conntrack_hash_check_insert(net, zone, ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; -- 1.7.0.4 Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply related [flat|nested] 21+ messages in thread
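In the same toy terms as the race sketch earlier in the thread, the essence of nf_conntrack_hash_check_insert() is that the duplicate check and the insert now happen under a single lock acquisition. The helper below is illustrative only, with a counter standing in for the hash buckets:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int entries;                 /* "conntracks with this tuple" in the table */

    /* Check and insert under one critical section, so a concurrent inserter
     * either sees our entry or we see its entry -- never both are added. */
    static int add_entry_checked(void)
    {
            int err = 0;

            pthread_mutex_lock(&table_lock);
            if (entries > 0)
                    err = -EEXIST;      /* caller destroys its new entry, as the err2 path does */
            else
                    entries++;
            pthread_mutex_unlock(&table_lock);

            return err;
    }

    int main(void)
    {
            printf("first insert:  %d\n", add_entry_checked());   /* 0 */
            printf("second insert: %d\n", add_entry_checked());   /* -EEXIST */
            return 0;
    }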
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-23 20:44 ` Jozsef Kadlecsik @ 2012-02-24 1:24 ` Pablo Neira Ayuso 2012-02-24 6:44 ` Eric Dumazet 2012-02-24 8:06 ` Jozsef Kadlecsik 0 siblings, 2 replies; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 1:24 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Thu, Feb 23, 2012 at 09:44:21PM +0100, Jozsef Kadlecsik wrote: > OK, here it comes: > > The previous patch with the title "netfilter: fix soft lockup > when netlink adds new entries" introduced a race: conntrack and > ctnetlink could insert the same entry twice into the hash table. > The patch eliminates the race condition by using the same checking > as conntrack confirm does. > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > --- > include/net/netfilter/nf_conntrack.h | 2 + > net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++++++++++++++ > net/netfilter/nf_conntrack_netlink.c | 9 +++---- > 3 files changed, 47 insertions(+), 5 deletions(-) > > diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h > index 8a2b0ae..48bfe75 100644 > --- a/include/net/netfilter/nf_conntrack.h > +++ b/include/net/netfilter/nf_conntrack.h > @@ -210,6 +210,8 @@ __nf_conntrack_find(struct net *net, u16 zone, > const struct nf_conntrack_tuple *tuple); > > extern void nf_conntrack_hash_insert(struct nf_conn *ct); > +extern int nf_conntrack_hash_check_insert(struct net *net, u16 zone, > + struct nf_conn *ct); nf_conntrack_hash_insert has no clients anymore after this change. We have two choices here: 1) Expand nf_conntrack_hash_insert to perform the checking inside. You can use the same prototype btw, net and zone can be extracted from the ct. 2) For better code readability (and we save one exported symbol), add the code that you propose for nf_conntrack_hash_check_insert to nf_conntrack_netlink.c. I don't think we would ever have another client of nf_conntrack_hash_insert. I think I prefer option 2. BTW, nf_conntrack_alloc allocation can now be GFP_KERNEL. 
> extern void nf_ct_delete_from_lists(struct nf_conn *ct); > extern void nf_ct_insert_dying_list(struct nf_conn *ct); > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c > index 76613f5..3d829d6 100644 > --- a/net/netfilter/nf_conntrack_core.c > +++ b/net/netfilter/nf_conntrack_core.c > @@ -418,6 +418,47 @@ void nf_conntrack_hash_insert(struct nf_conn *ct) > } > EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); > > +int > +nf_conntrack_hash_check_insert(struct net *net, u16 zone, struct nf_conn *ct) > +{ > + unsigned int hash, repl_hash; > + struct nf_conntrack_tuple_hash *h; > + struct hlist_nulls_node *n; > + > + hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > + repl_hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > + > + spin_lock_bh(&nf_conntrack_lock); > + > + /* See if there's one in the list already, including reverse */ > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, > + &h->tuple) && > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, > + &h->tuple) && > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + > + add_timer(&ct->timeout); > + atomic_inc(&ct->ct_general.use); better nf_conntrack_get? > + __nf_conntrack_hash_insert(ct, hash, repl_hash); > + NF_CT_STAT_INC(net, insert); Good catch, this stats increment was missing. > + spin_unlock_bh(&nf_conntrack_lock); > + > + return 0; > + > +out: > + NF_CT_STAT_INC(net, insert_failed); > + spin_unlock_bh(&nf_conntrack_lock); > + return -EEXIST; > +} > +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); > + > /* Confirm a connection given skb; places it in hash table */ > int > __nf_conntrack_confirm(struct sk_buff *skb) > diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c > index cc70517..379d34e 100644 > --- a/net/netfilter/nf_conntrack_netlink.c > +++ b/net/netfilter/nf_conntrack_netlink.c > @@ -1465,11 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > if (tstamp) > tstamp->start = ktime_to_ns(ktime_get_real()); > > - add_timer(&ct->timeout); Hm, this timer addition out of the lock slipped through the previous patch. I think it's better if we revert the previous patch and we pass a new revamped version including the initial and this change. It will be easier to pass it to -stable. Would you be OK with this? Thanks Jozsef. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 1:24 ` Pablo Neira Ayuso @ 2012-02-24 6:44 ` Eric Dumazet 2012-02-24 10:01 ` Pablo Neira Ayuso 2012-02-24 8:06 ` Jozsef Kadlecsik 1 sibling, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2012-02-24 6:44 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: Jozsef Kadlecsik, netfilter-devel, Zambo Marcell On Friday, 24 February 2012 at 02:24 +0100, Pablo Neira Ayuso wrote: > > BTW, nf_conntrack_alloc allocation can now be GFP_KERNEL. > Without looking at the code, I would say I doubt it's doable ;) Input path is run from softirq handler. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 6:44 ` Eric Dumazet @ 2012-02-24 10:01 ` Pablo Neira Ayuso 2012-02-24 10:11 ` Jozsef Kadlecsik 2012-02-24 11:36 ` Pablo Neira Ayuso 0 siblings, 2 replies; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 10:01 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jozsef Kadlecsik, netfilter-devel, Zambo Marcell On Fri, Feb 24, 2012 at 07:44:23AM +0100, Eric Dumazet wrote: > Le vendredi 24 février 2012 à 02:24 +0100, Pablo Neira Ayuso a écrit : > > > BTW, nf_conntrack_alloc allocation can now be GFP_KERNEL. > > > > Without looking at the code, I would say I doubt its doable ;) > > Input path is run from softirq handler. This is ctnetlink, conntracks are always created from user context :) ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 10:01 ` Pablo Neira Ayuso @ 2012-02-24 10:11 ` Jozsef Kadlecsik 0 siblings, 0 replies; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-24 10:11 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel, Zambo Marcell On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > On Fri, Feb 24, 2012 at 07:44:23AM +0100, Eric Dumazet wrote: > > Le vendredi 24 février 2012 à 02:24 +0100, Pablo Neira Ayuso a écrit : > > > > > BTW, nf_conntrack_alloc allocation can now be GFP_KERNEL. > > > > Without looking at the code, I would say I doubt its doable ;) > > > > Input path is run from softirq handler. > > This is ctnetlink, conntracks are always created from user context :) I'm going to convert the gfp flags to GFP_KERNEL as well. Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 10:01 ` Pablo Neira Ayuso 2012-02-24 10:11 ` Jozsef Kadlecsik @ 2012-02-24 11:36 ` Pablo Neira Ayuso 1 sibling, 0 replies; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 11:36 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jozsef Kadlecsik, netfilter-devel, Zambo Marcell On Fri, Feb 24, 2012 at 11:01:21AM +0100, Pablo Neira Ayuso wrote: > On Fri, Feb 24, 2012 at 07:44:23AM +0100, Eric Dumazet wrote: > > Le vendredi 24 février 2012 à 02:24 +0100, Pablo Neira Ayuso a écrit : > > > > > BTW, nf_conntrack_alloc allocation can now be GFP_KERNEL. > > > > > > > > Without looking at the code, I would say I doubt its doable ;) > > > > Input path is run from softirq handler. > > This is ctnetlink, conntracks are always created from user context :) I noticed we're holding the RCU read side, so we cannot switch to GFP_KERNEL; you were right (for a different reason, though ;-). ^ permalink raw reply [flat|nested] 21+ messages in thread
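Background for the GFP_KERNEL back-and-forth: GFP_KERNEL allocations may sleep, and sleeping is not allowed inside an RCU read-side critical section, so as long as ctnetlink_create_conntrack() runs under rcu_read_lock() the allocations have to stay GFP_ATOMIC. A kernel-style sketch of the constraint follows; it is illustrative only and will not build outside a kernel tree:

    /* Kernel-style sketch only -- not buildable as a standalone program. */
    #include <linux/slab.h>
    #include <linux/rcupdate.h>

    struct foo { int x; };

    static struct foo *alloc_example(void)
    {
            struct foo *p;

            /* Wrong: GFP_KERNEL may sleep, and blocking is not allowed
             * between rcu_read_lock() and rcu_read_unlock(). */
            rcu_read_lock();
            p = kmalloc(sizeof(*p), GFP_KERNEL);
            rcu_read_unlock();
            kfree(p);

            /* One correct option: keep the allocation atomic while inside
             * the read-side section, as the ctnetlink code does. */
            rcu_read_lock();
            p = kmalloc(sizeof(*p), GFP_ATOMIC);
            rcu_read_unlock();

            /* The other option is to allocate with GFP_KERNEL before taking
             * rcu_read_lock() at all. */
            return p;
    }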
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 1:24 ` Pablo Neira Ayuso 2012-02-24 6:44 ` Eric Dumazet @ 2012-02-24 8:06 ` Jozsef Kadlecsik 2012-02-24 9:59 ` Pablo Neira Ayuso 1 sibling, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-24 8:06 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell Hi Pablo, On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > On Thu, Feb 23, 2012 at 09:44:21PM +0100, Jozsef Kadlecsik wrote: > > OK, here it comes: > > > > The previous patch with the title "netfilter: fix soft lockup > > when netlink adds new entries" introduced a race: conntrack and > > ctnetlink could insert the same entry twice into the hash table. > > The patch eliminates the race condition by using the same checking > > as conntrack confirm does. > > > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > > --- > > include/net/netfilter/nf_conntrack.h | 2 + > > net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++++++++++++++ > > net/netfilter/nf_conntrack_netlink.c | 9 +++---- > > 3 files changed, 47 insertions(+), 5 deletions(-) > > > > diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h > > index 8a2b0ae..48bfe75 100644 > > --- a/include/net/netfilter/nf_conntrack.h > > +++ b/include/net/netfilter/nf_conntrack.h > > @@ -210,6 +210,8 @@ __nf_conntrack_find(struct net *net, u16 zone, > > const struct nf_conntrack_tuple *tuple); > > > > extern void nf_conntrack_hash_insert(struct nf_conn *ct); > > +extern int nf_conntrack_hash_check_insert(struct net *net, u16 zone, > > + struct nf_conn *ct); > > nf_conntrack_hash_insert has no clients anymore after this change. > > We have two choices here: > > 1) Expand nf_conntrack_hash_insert to perform the checking inside. You can > use the same prototype btw, net and zone can be extracted from the ct. It's a void function and we need a result: if the insert fails the newly allocated entry must be destroyed and error reported to the user. > 2) For better code readability (and we save one exported symbol), add > the code that you propose for nf_conntrack_hash_check_insert to > nf_conntrack_netlink.c. I don't think we would ever have another client > of nf_conntrack_hash_insert. That'd need exporting hash_conntrack and __nf_conntrack_hash_insert from nf_conntrack_core.c. That was the reason why I instead introduced a new function. 
> > extern void nf_ct_delete_from_lists(struct nf_conn *ct); > > extern void nf_ct_insert_dying_list(struct nf_conn *ct); > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c > > index 76613f5..3d829d6 100644 > > --- a/net/netfilter/nf_conntrack_core.c > > +++ b/net/netfilter/nf_conntrack_core.c > > @@ -418,6 +418,47 @@ void nf_conntrack_hash_insert(struct nf_conn *ct) > > } > > EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); > > > > +int > > +nf_conntrack_hash_check_insert(struct net *net, u16 zone, struct nf_conn *ct) > > +{ > > + unsigned int hash, repl_hash; > > + struct nf_conntrack_tuple_hash *h; > > + struct hlist_nulls_node *n; > > + > > + hash = hash_conntrack(net, zone, > > + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > > + repl_hash = hash_conntrack(net, zone, > > + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > > + > > + spin_lock_bh(&nf_conntrack_lock); > > + > > + /* See if there's one in the list already, including reverse */ > > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) > > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, > > + &h->tuple) && > > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > > + goto out; > > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) > > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, > > + &h->tuple) && > > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > > + goto out; > > + > > + add_timer(&ct->timeout); > > + atomic_inc(&ct->ct_general.use); > > better nf_conntrack_get? OK. > > + __nf_conntrack_hash_insert(ct, hash, repl_hash); > > + NF_CT_STAT_INC(net, insert); > > Good catch, this stats increment was missing. > > > + spin_unlock_bh(&nf_conntrack_lock); > > + > > + return 0; > > + > > +out: > > + NF_CT_STAT_INC(net, insert_failed); > > + spin_unlock_bh(&nf_conntrack_lock); > > + return -EEXIST; > > +} > > +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); > > + > > /* Confirm a connection given skb; places it in hash table */ > > int > > __nf_conntrack_confirm(struct sk_buff *skb) > > diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c > > index cc70517..379d34e 100644 > > --- a/net/netfilter/nf_conntrack_netlink.c > > +++ b/net/netfilter/nf_conntrack_netlink.c > > @@ -1465,11 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > > if (tstamp) > > tstamp->start = ktime_to_ns(ktime_get_real()); > > > > - add_timer(&ct->timeout); > > Hm, this timer addition out of the lock slipped through the previous > patch. > > I think it's better if we revert the previous patch and we pass a new > revamped version including the initial and this change. > > It will be easier to pass it to -stable. > > Would you be OK with this? Yes, that's fine. Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 8:06 ` Jozsef Kadlecsik @ 2012-02-24 9:59 ` Pablo Neira Ayuso 2012-02-24 10:09 ` Jozsef Kadlecsik 0 siblings, 1 reply; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 9:59 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Fri, Feb 24, 2012 at 09:06:37AM +0100, Jozsef Kadlecsik wrote: > Hi Pablo, > > On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > > > On Thu, Feb 23, 2012 at 09:44:21PM +0100, Jozsef Kadlecsik wrote: > > > OK, here it comes: > > > > > > The previous patch with the title "netfilter: fix soft lockup > > > when netlink adds new entries" introduced a race: conntrack and > > > ctnetlink could insert the same entry twice into the hash table. > > > The patch eliminates the race condition by using the same checking > > > as conntrack confirm does. > > > > > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > > > --- > > > include/net/netfilter/nf_conntrack.h | 2 + > > > net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++++++++++++++ > > > net/netfilter/nf_conntrack_netlink.c | 9 +++---- > > > 3 files changed, 47 insertions(+), 5 deletions(-) > > > > > > diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h > > > index 8a2b0ae..48bfe75 100644 > > > --- a/include/net/netfilter/nf_conntrack.h > > > +++ b/include/net/netfilter/nf_conntrack.h > > > @@ -210,6 +210,8 @@ __nf_conntrack_find(struct net *net, u16 zone, > > > const struct nf_conntrack_tuple *tuple); > > > > > > extern void nf_conntrack_hash_insert(struct nf_conn *ct); > > > +extern int nf_conntrack_hash_check_insert(struct net *net, u16 zone, > > > + struct nf_conn *ct); > > > > nf_conntrack_hash_insert has no clients anymore after this change. > > > > We have two choices here: > > > > 1) Expand nf_conntrack_hash_insert to perform the checking inside. You can > > use the same prototype btw, net and zone can be extracted from the ct. > > It's a void function and we need a result: if the insert fails the > newly allocated entry must be destroyed and error reported to the user. > > > 2) For better code readability (and we save one exported symbol), add > > the code that you propose for nf_conntrack_hash_check_insert to > > nf_conntrack_netlink.c. I don't think we would ever have another client > > of nf_conntrack_hash_insert. > > That'd need exporting hash_conntrack and __nf_conntrack_hash_insert from > nf_conntrack_core.c. That was the reason why I instead introduced a new > function. I see. Go ahead then. Thanks Jozsef. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 9:59 ` Pablo Neira Ayuso @ 2012-02-24 10:09 ` Jozsef Kadlecsik 2012-02-24 10:45 ` Jozsef Kadlecsik 0 siblings, 1 reply; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-24 10:09 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > On Fri, Feb 24, 2012 at 09:06:37AM +0100, Jozsef Kadlecsik wrote: > > > > On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > > > > > On Thu, Feb 23, 2012 at 09:44:21PM +0100, Jozsef Kadlecsik wrote: > > > > OK, here it comes: > > > > > > > > The previous patch with the title "netfilter: fix soft lockup > > > > when netlink adds new entries" introduced a race: conntrack and > > > > ctnetlink could insert the same entry twice into the hash table. > > > > The patch eliminates the race condition by using the same checking > > > > as conntrack confirm does. > > > > > > > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > > > > --- > > > > include/net/netfilter/nf_conntrack.h | 2 + > > > > net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++++++++++++++ > > > > net/netfilter/nf_conntrack_netlink.c | 9 +++---- > > > > 3 files changed, 47 insertions(+), 5 deletions(-) > > > > > > > > diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h > > > > index 8a2b0ae..48bfe75 100644 > > > > --- a/include/net/netfilter/nf_conntrack.h > > > > +++ b/include/net/netfilter/nf_conntrack.h > > > > @@ -210,6 +210,8 @@ __nf_conntrack_find(struct net *net, u16 zone, > > > > const struct nf_conntrack_tuple *tuple); > > > > > > > > extern void nf_conntrack_hash_insert(struct nf_conn *ct); > > > > +extern int nf_conntrack_hash_check_insert(struct net *net, u16 zone, > > > > + struct nf_conn *ct); > > > > > > nf_conntrack_hash_insert has no clients anymore after this change. [...] > I see. Go ahead then. OK, and I remove nf_conntrack_hash_insert then, and use a single ct argument because as you noted net and zone can be extracted from the ct. Best regards, Jozsef - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 10:09 ` Jozsef Kadlecsik @ 2012-02-24 10:45 ` Jozsef Kadlecsik 2012-02-24 11:18 ` Pablo Neira Ayuso 2012-02-24 11:33 ` Pablo Neira Ayuso 0 siblings, 2 replies; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-24 10:45 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell On Fri, 24 Feb 2012, Jozsef Kadlecsik wrote: > OK, and I remove nf_conntrack_hash_insert then, and use a single ct > argument because as you noted net and zone can be extracted from the ct. Here comes the second version of the patch, which assumes that the previous one is reverted. Best regards, Jozsef Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, soft lockup happens. This is because the nf_conntrack_lock is held while nf_conntrack_alloc is called, which is in turn wants to lock nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup with limiting the holding of the nf_conntrack_lock to the minimum, where it's absolutely required. It required to extend (and thus change) nf_conntrack_hash_insert so that it makes sure conntrack and ctnetlink do not add the same entry twice to the conntrack table. Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 38 ++++++++++++++++++++-- net/netfilter/nf_conntrack_netlink.c | 58 +++++++++++++-------------------- 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8a2b0ae..ab86036 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -209,7 +209,7 @@ extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); -extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_insert_dying_list(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5..ed86a3b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &net->ct.hash[repl_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; u16 zone; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if 
(nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); /* Confirm a connection given skb; places it in hash table */ int diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b03..6d73501 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1345,7 +1345,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; - ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_KERNEL); if (IS_ERR(ct)) return ERR_PTR(-ENOMEM); @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1391,7 +1388,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } else { struct nf_conn_help *help; - help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (help == NULL) { err = -ENOMEM; goto err2; @@ -1402,7 +1399,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } } else { /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + err = __nf_ct_try_assign_helper(ct, NULL, GFP_KERNEL); if (err < 0) goto err2; } @@ -1413,9 +1410,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } - nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + nf_ct_acct_ext_add(ct, GFP_KERNEL); + nf_ct_tstamp_ext_add(ct, GFP_KERNEL); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_KERNEL); /* we must add conntrack extensions before confirmation. 
*/ ct->status |= IPS_CONFIRMED; @@ -1468,8 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; @@ -1490,6 +1489,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1512,25 +1512,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); + spin_unlock_bh(&nf_conntrack_lock); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1542,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1563,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- 1.7.0.4 - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 10:45 ` Jozsef Kadlecsik @ 2012-02-24 11:18 ` Pablo Neira Ayuso 2012-02-24 11:33 ` Pablo Neira Ayuso 1 sibling, 0 replies; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 11:18 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Fri, Feb 24, 2012 at 11:45:49AM +0100, Jozsef Kadlecsik wrote: > On Fri, 24 Feb 2012, Jozsef Kadlecsik wrote: > > > OK, and I remove nf_conntrack_hash_insert then, and use a single ct > > argument because as you noted net and zone can be extracted from the ct. > > Here comes the second version of the patch, which assumes that the > previous one is reverted. > > Best regards, > Jozsef > > Marcell Zambo and Janos Farago noticed and reported that when > new conntrack entries are added via netlink and the conntrack table > gets full, soft lockup happens. This is because the nf_conntrack_lock > is held while nf_conntrack_alloc is called, which is in turn wants > to lock nf_conntrack_lock while evicting entries from the full table. > > The patch fixes the soft lockup with limiting the holding of the > nf_conntrack_lock to the minimum, where it's absolutely required. > It required to extend (and thus change) nf_conntrack_hash_insert > so that it makes sure conntrack and ctnetlink do not add the same entry > twice to the conntrack table. > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> > --- > include/net/netfilter/nf_conntrack.h | 2 +- > net/netfilter/nf_conntrack_core.c | 38 ++++++++++++++++++++-- > net/netfilter/nf_conntrack_netlink.c | 58 +++++++++++++-------------------- > 3 files changed, 58 insertions(+), 40 deletions(-) > > diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h > index 8a2b0ae..ab86036 100644 > --- a/include/net/netfilter/nf_conntrack.h > +++ b/include/net/netfilter/nf_conntrack.h > @@ -209,7 +209,7 @@ extern struct nf_conntrack_tuple_hash * > __nf_conntrack_find(struct net *net, u16 zone, > const struct nf_conntrack_tuple *tuple); > > -extern void nf_conntrack_hash_insert(struct nf_conn *ct); > +extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); > extern void nf_ct_delete_from_lists(struct nf_conn *ct); > extern void nf_ct_insert_dying_list(struct nf_conn *ct); > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c > index 76613f5..ed86a3b 100644 > --- a/net/netfilter/nf_conntrack_core.c > +++ b/net/netfilter/nf_conntrack_core.c > @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, > &net->ct.hash[repl_hash]); > } > > -void nf_conntrack_hash_insert(struct nf_conn *ct) > +int > +nf_conntrack_hash_check_insert(struct nf_conn *ct) > { > struct net *net = nf_ct_net(ct); > unsigned int hash, repl_hash; > + struct nf_conntrack_tuple_hash *h; > + struct hlist_nulls_node *n; > u16 zone; > > zone = nf_ct_zone(ct); > - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > + hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > + repl_hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > + > + spin_lock_bh(&nf_conntrack_lock); > > + /* See if there's one in the list already, including reverse */ > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, > + &h->tuple) && > + zone == 
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, > + &h->tuple) && > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + > + add_timer(&ct->timeout); > + nf_conntrack_get(&ct->ct_general); > __nf_conntrack_hash_insert(ct, hash, repl_hash); > + NF_CT_STAT_INC(net, insert); > + spin_unlock_bh(&nf_conntrack_lock); > + > + return 0; > + > +out: > + NF_CT_STAT_INC(net, insert_failed); > + spin_unlock_bh(&nf_conntrack_lock); > + return -EEXIST; > } > -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); > +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); > > /* Confirm a connection given skb; places it in hash table */ > int > diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c > index 9307b03..6d73501 100644 > --- a/net/netfilter/nf_conntrack_netlink.c > +++ b/net/netfilter/nf_conntrack_netlink.c > @@ -1345,7 +1345,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > struct nf_conntrack_helper *helper; > struct nf_conn_tstamp *tstamp; > > - ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); > + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_KERNEL); > if (IS_ERR(ct)) > return ERR_PTR(-ENOMEM); > Sorry, I was wrong. I didn't notice that this is called holding rcu_read_lock() inside nfnetlink.c and we cannot use GFP_KERNEL inside read-lock area. I'm going to take the patch and will fix this myself. Thanks Jozsef. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 10:45 ` Jozsef Kadlecsik 2012-02-24 11:18 ` Pablo Neira Ayuso @ 2012-02-24 11:33 ` Pablo Neira Ayuso 2012-02-24 17:05 ` Jozsef Kadlecsik 1 sibling, 1 reply; 21+ messages in thread From: Pablo Neira Ayuso @ 2012-02-24 11:33 UTC (permalink / raw) To: Jozsef Kadlecsik; +Cc: netfilter-devel, Zambo Marcell On Fri, Feb 24, 2012 at 11:45:49AM +0100, Jozsef Kadlecsik wrote: > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c > index 76613f5..ed86a3b 100644 > --- a/net/netfilter/nf_conntrack_core.c > +++ b/net/netfilter/nf_conntrack_core.c > @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, > &net->ct.hash[repl_hash]); > } > > -void nf_conntrack_hash_insert(struct nf_conn *ct) > +int > +nf_conntrack_hash_check_insert(struct nf_conn *ct) > { > struct net *net = nf_ct_net(ct); > unsigned int hash, repl_hash; > + struct nf_conntrack_tuple_hash *h; > + struct hlist_nulls_node *n; > u16 zone; > > zone = nf_ct_zone(ct); > - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > + hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); > + repl_hash = hash_conntrack(net, zone, > + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); > + > + spin_lock_bh(&nf_conntrack_lock); > > + /* See if there's one in the list already, including reverse */ > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, > + &h->tuple) && > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) > + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, > + &h->tuple) && > + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) > + goto out; > + > + add_timer(&ct->timeout); > + nf_conntrack_get(&ct->ct_general); > __nf_conntrack_hash_insert(ct, hash, repl_hash); > + NF_CT_STAT_INC(net, insert); > + spin_unlock_bh(&nf_conntrack_lock); > + > + return 0; > + > +out: > + NF_CT_STAT_INC(net, insert_failed); > + spin_unlock_bh(&nf_conntrack_lock); > + return -EEXIST; > } > -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); > +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); > > /* Confirm a connection given skb; places it in hash table */ > int > diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c > index 9307b03..6d73501 100644 > --- a/net/netfilter/nf_conntrack_netlink.c > +++ b/net/netfilter/nf_conntrack_netlink.c > @@ -1345,7 +1345,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > struct nf_conntrack_helper *helper; > struct nf_conn_tstamp *tstamp; > > - ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); > + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_KERNEL); > if (IS_ERR(ct)) > return ERR_PTR(-ENOMEM); > > @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > nf_ct_protonum(ct)); > if (helper == NULL) { > rcu_read_unlock(); > - spin_unlock_bh(&nf_conntrack_lock); > #ifdef CONFIG_MODULES > if (request_module("nfct-helper-%s", helpname) < 0) { > - spin_lock_bh(&nf_conntrack_lock); > err = -EOPNOTSUPP; > goto err1; > } > > - spin_lock_bh(&nf_conntrack_lock); > rcu_read_lock(); > helper = __nf_conntrack_helper_find(helpname, > nf_ct_l3num(ct), > @@ -1391,7 +1388,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, 
> } else { > struct nf_conn_help *help; > > - help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); > + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); > if (help == NULL) { > err = -ENOMEM; > goto err2; > @@ -1402,7 +1399,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > } > } else { > /* try an implicit helper assignation */ > - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); > + err = __nf_ct_try_assign_helper(ct, NULL, GFP_KERNEL); > if (err < 0) > goto err2; > } > @@ -1413,9 +1410,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > goto err2; > } > > - nf_ct_acct_ext_add(ct, GFP_ATOMIC); > - nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); > - nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); > + nf_ct_acct_ext_add(ct, GFP_KERNEL); > + nf_ct_tstamp_ext_add(ct, GFP_KERNEL); > + nf_ct_ecache_ext_add(ct, 0, 0, GFP_KERNEL); > /* we must add conntrack extensions before confirmation. */ > ct->status |= IPS_CONFIRMED; > > @@ -1468,8 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, > if (tstamp) > tstamp->start = ktime_to_ns(ktime_get_real()); > > - add_timer(&ct->timeout); > - nf_conntrack_hash_insert(ct); > + err = nf_conntrack_hash_check_insert(ct); > + if (err < 0) > + goto err2; > + > rcu_read_unlock(); > > return ct; > @@ -1490,6 +1489,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > struct nf_conntrack_tuple otuple, rtuple; > struct nf_conntrack_tuple_hash *h = NULL; > struct nfgenmsg *nfmsg = nlmsg_data(nlh); > + struct nf_conn *ct; > u_int8_t u3 = nfmsg->nfgen_family; > u16 zone; > int err; > @@ -1512,25 +1512,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > > spin_lock_bh(&nf_conntrack_lock); > if (cda[CTA_TUPLE_ORIG]) > - h = __nf_conntrack_find(net, zone, &otuple); > + h = nf_conntrack_find_get(net, zone, &otuple); > else if (cda[CTA_TUPLE_REPLY]) > - h = __nf_conntrack_find(net, zone, &rtuple); > + h = nf_conntrack_find_get(net, zone, &rtuple); > + spin_unlock_bh(&nf_conntrack_lock); It seems we can remove the spin_lock_bh here. We don't need it anymore. I'm going to send you a new version of the patch based on yours. Sorry, I'm puzzling with this patch 8-) ^ permalink raw reply [flat|nested] 21+ messages in thread
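For context on the remark above: nf_conntrack_find_get() performs its own RCU-protected lookup and only returns an entry after it has successfully taken a reference on it, so the caller no longer needs to hold nf_conntrack_lock around the lookup. Below is a minimal userspace sketch of that "find and get" idea only -- not the kernel implementation; the names (struct entry, entry_get, find_get) are made up for illustration, and the kernel additionally relies on RCU to keep the memory valid while the hash chain is scanned, which this single-threaded sketch does not model.

/* Illustration only: take a reference only while the object is still live
 * (the "inc-not-zero" idea behind returning a usable entry from a lock-free
 * lookup).  Build with: cc -std=c11 find_get_sketch.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int key;
        atomic_int refcnt;              /* object is gone once this hits 0 */
};

/* Grab a reference only if the entry is still live. */
static bool entry_get(struct entry *e)
{
        int old = atomic_load(&e->refcnt);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&e->refcnt, &old, old + 1))
                        return true;
        }
        return false;                   /* already dying, act as if unseen */
}

static void entry_put(struct entry *e)
{
        if (atomic_fetch_sub(&e->refcnt, 1) == 1)
                free(e);                /* last reference dropped */
}

/* Lookup without any table lock: return a referenced entry or NULL. */
static struct entry *find_get(struct entry **table, int n, int key)
{
        for (int i = 0; i < n; i++) {
                struct entry *e = table[i];

                if (e && e->key == key && entry_get(e))
                        return e;       /* caller now owns a reference */
        }
        return NULL;
}

int main(void)
{
        struct entry *e = malloc(sizeof(*e));
        struct entry *table[1] = { e };
        struct entry *found;

        e->key = 42;
        atomic_init(&e->refcnt, 1);     /* the table's own reference */

        found = find_get(table, 1, 42);
        if (found) {
                printf("found key %d\n", found->key);
                entry_put(found);       /* drop the lookup reference */
        }
        entry_put(e);                   /* drop the table's reference */
        return 0;
}

Because the lookup itself takes the reference, the remaining obligation in ctnetlink_new_conntrack() is to drop it again with nf_ct_put() once the entry is no longer needed, which is what the final version of the patch does.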
* Re: [PATCH 1/1] netfilter: fix soft lockup when netlink adds new entries 2012-02-24 11:33 ` Pablo Neira Ayuso @ 2012-02-24 17:05 ` Jozsef Kadlecsik 0 siblings, 0 replies; 21+ messages in thread From: Jozsef Kadlecsik @ 2012-02-24 17:05 UTC (permalink / raw) To: Pablo Neira Ayuso; +Cc: netfilter-devel, Zambo Marcell On Fri, 24 Feb 2012, Pablo Neira Ayuso wrote: > > @@ -1512,25 +1512,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, > > > > spin_lock_bh(&nf_conntrack_lock); > > if (cda[CTA_TUPLE_ORIG]) > > - h = __nf_conntrack_find(net, zone, &otuple); > > + h = nf_conntrack_find_get(net, zone, &otuple); > > else if (cda[CTA_TUPLE_REPLY]) > > - h = __nf_conntrack_find(net, zone, &rtuple); > > + h = nf_conntrack_find_get(net, zone, &rtuple); > > + spin_unlock_bh(&nf_conntrack_lock); > > It seems we can remove the spin_lock_bh here. We don't need it > anymore. > > I'm going to send you a new version of the patch based on yours. > > Sorry, I'm puzzling with this patch 8-) No problem whatsoever. For the sake of simplicity I'm sending the new version: the gfp flags are reverted and the unnecessary locking you noticed is removed. Best regards, Jozsef Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, a soft lockup happens. This is because nf_conntrack_lock is held while nf_conntrack_alloc is called, which in turn wants to take nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup by limiting the holding of nf_conntrack_lock to the minimum, where it is absolutely required. This required extending (and thus changing) nf_conntrack_hash_insert so that it makes sure conntrack and ctnetlink do not add the same entry twice to the conntrack table.
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 38 +++++++++++++++++++++++++--- net/netfilter/nf_conntrack_netlink.c | 46 ++++++++++++---------------------- 3 files changed, 51 insertions(+), 35 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8a2b0ae..ab86036 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -209,7 +209,7 @@ extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); -extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_insert_dying_list(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5..ed86a3b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &net->ct.hash[repl_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; u16 zone; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); /* Confirm a connection given skb; places it in hash table */ int diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b03..30c9d4c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1468,8 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = 
ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; @@ -1490,6 +1489,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1510,27 +1510,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; } - spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1540,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1561,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- 1.7.0.4 - E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary ^ permalink raw reply related [flat|nested] 21+ messages in thread
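To summarize the locking shape this final version settles on: allocation and all extension setup in ctnetlink_create_conntrack() run with no spinlock held (so a full table can evict old entries without the lock recursion that caused the soft lockup), and nf_conntrack_hash_check_insert() takes nf_conntrack_lock only around a duplicate scan of both hash chains plus the actual insertion, returning -EEXIST if a matching entry got there first. A rough userspace sketch of that check-then-insert pattern, with made-up names and a plain pthread mutex standing in for nf_conntrack_lock:

/* Illustration only: allocate outside the lock, then insert under the lock
 * but only after checking for a duplicate.  Build with: cc insert_sketch.c -pthread
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 16

struct entry {
        int key;
        struct entry *next;
};

static struct entry *table[TABLE_SIZE];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned int hash_key(int key)
{
        return (unsigned int)key % TABLE_SIZE;
}

/* Mirror of the check-then-insert done under nf_conntrack_lock:
 * insert only if no entry with the same key is already hashed. */
static int hash_check_insert(struct entry *e)
{
        unsigned int h = hash_key(e->key);
        struct entry *cur;

        pthread_mutex_lock(&table_lock);
        for (cur = table[h]; cur; cur = cur->next) {
                if (cur->key == e->key) {
                        pthread_mutex_unlock(&table_lock);
                        return -EEXIST; /* somebody raced in before us */
                }
        }
        e->next = table[h];
        table[h] = e;
        pthread_mutex_unlock(&table_lock);
        return 0;
}

static struct entry *create_entry(int key)
{
        /* No lock held here: this is the stage where, in the kernel,
         * nf_conntrack_alloc may need to evict entries from a full table
         * and takes the table lock itself -- the source of the lockup. */
        struct entry *e = calloc(1, sizeof(*e));

        if (e)
                e->key = key;
        return e;
}

int main(void)
{
        struct entry *a = create_entry(7);
        struct entry *b = create_entry(7);

        if (!a || !b)
                return 1;
        printf("first insert:  %d\n", hash_check_insert(a));   /* 0 */
        printf("second insert: %d\n", hash_check_insert(b));   /* -EEXIST */
        free(b);                        /* never inserted, just drop it */
        return 0;
}

In the real function, as the nf_conntrack_core.c hunk above shows, the duplicate scan also covers the reply-direction chain, and the successful path additionally arms the timeout timer, takes a reference on the conntrack and bumps the insert statistics before the lock is released.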
end of thread, other threads:[~2012-02-24 17:05 UTC | newest] Thread overview: 21+ messages -- 2012-02-21 13:53 [PATCH 0/1] netfilter: fix soft lockup when netlink adds new entries Jozsef Kadlecsik 2012-02-21 13:53 ` [PATCH 1/1] " Jozsef Kadlecsik 2012-02-21 14:52 ` Pablo Neira Ayuso 2012-02-21 15:06 ` Jozsef Kadlecsik 2012-02-21 15:20 ` Pablo Neira Ayuso 2012-02-23 10:12 ` Pablo Neira Ayuso 2012-02-23 12:43 ` Jozsef Kadlecsik 2012-02-23 15:46 ` Pablo Neira Ayuso 2012-02-23 20:44 ` Jozsef Kadlecsik 2012-02-24 1:24 ` Pablo Neira Ayuso 2012-02-24 6:44 ` Eric Dumazet 2012-02-24 10:01 ` Pablo Neira Ayuso 2012-02-24 10:11 ` Jozsef Kadlecsik 2012-02-24 11:36 ` Pablo Neira Ayuso 2012-02-24 8:06 ` Jozsef Kadlecsik 2012-02-24 9:59 ` Pablo Neira Ayuso 2012-02-24 10:09 ` Jozsef Kadlecsik 2012-02-24 10:45 ` Jozsef Kadlecsik 2012-02-24 11:18 ` Pablo Neira Ayuso 2012-02-24 11:33 ` Pablo Neira Ayuso 2012-02-24 17:05 ` Jozsef Kadlecsik