All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oliver <olipro@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
To: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: netfilter-devel@vger.kernel.org
Subject: Re: [PATCH] death_by_event() does not check IPS_DYING_BIT - race condition against ctnetlink_del_conntrack
Date: Tue, 28 Aug 2012 19:16:39 +0200	[thread overview]
Message-ID: <9829027.jsWF3AolUh@gentoovm> (raw)
In-Reply-To: <20120828105235.GA22669@1984>

[-- Attachment #1: Type: text/plain, Size: 1063 bytes --]

Hi Pablo,

On Tuesday 28 August 2012 12:52:35 you wrote:
> It seems we're hitting death_by_event twice, which should not happen.
> 
> Would you give a try to the following patch?
> 
> Thanks.

I've tried applying the patch against 3.4.10 and found that it doesn't work 
due to having rewritten nf_ct_iterate_cleanup() to take two additional 
arguments and the nf_conntrack_proto.c in your source tree being divergent 
from anything I could find.

So, I've taken a different approach; nf_ct_iterate_cleanup() is a wrapper for 
nf_ct_iterate_cleanup_new() that simply passes 0 for the last two args so as 
to save having to edit every module.

Please take a look at the attached patch and let me know your thoughts; I'd 
Ideally like to have this fix in 3.4 since that's long-term stable.

During testing I found that the kernel is indeed solid and does not panic; 
however, I managed to make conntrackd eat 100% of a CPU core on one of the 
pair and conntrack entries remained unevicted from the kernel until I killed 
the conntrackd process.

Kind Regards,
Oliver

[-- Attachment #2: 0002-netfilter-nf_conntrack-fix-race-in-timer-handling-wi-in-kernel-3.4.patch --]
[-- Type: text/x-patch, Size: 8338 bytes --]

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index ab86036..8f92f77 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -210,8 +210,7 @@ __nf_conntrack_find(struct net *net, u16 zone,
 		    const struct nf_conntrack_tuple *tuple);
 
 extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);
-extern void nf_ct_delete_from_lists(struct nf_conn *ct);
-extern void nf_ct_insert_dying_list(struct nf_conn *ct);
+bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);
 
 extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
 
@@ -278,7 +277,8 @@ extern void nf_ct_untracked_status_or(unsigned long bits);
 
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 extern void
-nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data);
+nf_ct_iterate_cleanup_new(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 pid, int report);
+extern void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data);
 extern void nf_conntrack_free(struct nf_conn *ct);
 extern struct nf_conn *
 nf_conntrack_alloc(struct net *net, u16 zone,
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index a88fb69..2f7b0ac 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -108,7 +108,7 @@ nf_conntrack_eventmask_report(unsigned int eventmask,
 	if (e == NULL)
 		goto out_unlock;
 
-	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
+	if (nf_ct_is_confirmed(ct)) {
 		struct nf_ct_event item = {
 			.ct 	= ct,
 			.pid	= e->pid ? e->pid : pid,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 729f157..bdc9473 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -231,7 +231,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	nf_conntrack_free(ct);
 }
 
-void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 
@@ -243,7 +243,6 @@ void nf_ct_delete_from_lists(struct nf_conn *ct)
 	clean_from_lists(ct);
 	spin_unlock_bh(&nf_conntrack_lock);
 }
-EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
 
 static void death_by_event(unsigned long ul_conntrack)
 {
@@ -257,15 +256,13 @@ static void death_by_event(unsigned long ul_conntrack)
 		add_timer(&ct->timeout);
 		return;
 	}
-	/* we've got the event delivered, now it's dying */
-	set_bit(IPS_DYING_BIT, &ct->status);
 	spin_lock(&nf_conntrack_lock);
 	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 	spin_unlock(&nf_conntrack_lock);
 	nf_ct_put(ct);
 }
 
-void nf_ct_insert_dying_list(struct nf_conn *ct)
+static void nf_ct_insert_dying_list(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 
@@ -280,27 +277,32 @@ void nf_ct_insert_dying_list(struct nf_conn *ct)
 		(random32() % net->ct.sysctl_events_retry_timeout);
 	add_timer(&ct->timeout);
 }
-EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
 
-static void death_by_timeout(unsigned long ul_conntrack)
+bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report)
 {
-	struct nf_conn *ct = (void *)ul_conntrack;
 	struct nf_conn_tstamp *tstamp;
 
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp && tstamp->stop == 0)
 		tstamp->stop = ktime_to_ns(ktime_get_real());
 
-	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
-	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+	if (!test_and_set_bit(IPS_DYING_BIT, &ct->status) &&
+	    unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, pid,
+								report) < 0)) {
 		/* destroy event was not delivered */
 		nf_ct_delete_from_lists(ct);
 		nf_ct_insert_dying_list(ct);
-		return;
+		return false;
 	}
-	set_bit(IPS_DYING_BIT, &ct->status);
 	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
+	return true;
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
 }
 
 /*
@@ -634,11 +636,9 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 	if (!ct)
 		return dropped;
 
-	if (del_timer(&ct->timeout)) {
-		death_by_timeout((unsigned long)ct);
-		/* Check if we indeed killed this entry. Reliable event
-		   delivery may have inserted it into the dying list. */
-		if (test_bit(IPS_DYING_BIT, &ct->status)) {
+	if (!nf_ct_is_dying(ct) && del_timer(&ct->timeout)) {
+		/* Check if we indeed killed this entry */
+		if (nf_ct_delete(ct, 0, 0)) {
 			dropped = 1;
 			NF_CT_STAT_INC_ATOMIC(net, early_drop);
 		}
@@ -1124,8 +1124,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 		}
 	}
 
-	if (del_timer(&ct->timeout)) {
-		ct->timeout.function((unsigned long)ct);
+	if (!nf_ct_is_dying(ct) && del_timer(&ct->timeout)) {
+		nf_ct_delete(ct, 0, 0);
 		return true;
 	}
 	return false;
@@ -1225,8 +1225,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	}
 	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (iter(ct, data))
-			set_bit(IPS_DYING_BIT, &ct->status);
+		iter(ct, data);
 	}
 	spin_unlock_bh(&nf_conntrack_lock);
 	return NULL;
@@ -1236,50 +1235,40 @@ found:
 	return ct;
 }
 
-void nf_ct_iterate_cleanup(struct net *net,
+void nf_ct_iterate_cleanup_new(struct net *net,
 			   int (*iter)(struct nf_conn *i, void *data),
-			   void *data)
+			   void *data, u32 pid, int report)
 {
 	struct nf_conn *ct;
 	unsigned int bucket = 0;
 
 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
-		if (del_timer(&ct->timeout))
-			death_by_timeout((unsigned long)ct);
+		if (!nf_ct_is_dying(ct) && del_timer(&ct->timeout))
+			nf_ct_delete(ct, pid, report);
 		/* ... else the timer will get him soon. */
 
 		nf_ct_put(ct);
 	}
 }
-EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
+EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_new);
 
-struct __nf_ct_flush_report {
-	u32 pid;
-	int report;
-};
+void nf_ct_iterate_cleanup(struct net *net,
+			   int (*iter)(struct nf_conn *i, void *data),
+			   void *data)
+{
+	nf_ct_iterate_cleanup_new(net, iter, data, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
 
-static int kill_report(struct nf_conn *i, void *data)
+static int kill_all(struct nf_conn *i, void *data)
 {
-	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
 	struct nf_conn_tstamp *tstamp;
 
 	tstamp = nf_conn_tstamp_find(i);
 	if (tstamp && tstamp->stop == 0)
 		tstamp->stop = ktime_to_ns(ktime_get_real());
 
-	/* If we fail to deliver the event, death_by_timeout() will retry */
-	if (nf_conntrack_event_report(IPCT_DESTROY, i,
-				      fr->pid, fr->report) < 0)
-		return 1;
-
-	/* Avoid the delivery of the destroy event in death_by_timeout(). */
-	set_bit(IPS_DYING_BIT, &i->status);
-	return 1;
-}
-
-static int kill_all(struct nf_conn *i, void *data)
-{
 	return 1;
 }
 
@@ -1295,11 +1284,7 @@ EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
 
 void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 {
-	struct __nf_ct_flush_report fr = {
-		.pid 	= pid,
-		.report = report,
-	};
-	nf_ct_iterate_cleanup(net, kill_report, &fr);
+	nf_ct_iterate_cleanup_new(net, kill_all, NULL, pid, report);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index ca7e835..2b0c9c1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -967,21 +967,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	if (del_timer(&ct->timeout)) {
-		if (nf_conntrack_event_report(IPCT_DESTROY, ct,
-					      NETLINK_CB(skb).pid,
-					      nlmsg_report(nlh)) < 0) {
-			nf_ct_delete_from_lists(ct);
-			/* we failed to report the event, try later */
-			nf_ct_insert_dying_list(ct);
-			nf_ct_put(ct);
-			return 0;
-		}
-		/* death_by_timeout would report the event again */
-		set_bit(IPS_DYING_BIT, &ct->status);
-		nf_ct_delete_from_lists(ct);
-		nf_ct_put(ct);
-	}
+	if (!nf_ct_is_dying(ct) && del_timer(&ct->timeout))
+		nf_ct_delete(ct, NETLINK_CB(skb).pid, nlmsg_report(nlh));
+
 	nf_ct_put(ct);
 
 	return 0;

  reply	other threads:[~2012-08-28 17:17 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-08-27  9:33 [PATCH] death_by_event() does not check IPS_DYING_BIT - race condition against ctnetlink_del_conntrack Oliver
2012-08-28 10:52 ` Pablo Neira Ayuso
2012-08-28 17:16   ` Oliver [this message]
2012-08-28 23:10     ` Oliver
2012-08-30  0:52       ` Pablo Neira Ayuso
2012-08-30  2:05         ` Oliver
2012-08-30  2:25           ` Pablo Neira Ayuso
     [not found] ` <5427975.6moJlq4F9d@gentoovm>
     [not found]   ` <20120830025009.GA16782@1984>
2012-08-30  3:09     ` Oliver
2012-08-30 10:34       ` Pablo Neira Ayuso
2012-08-30 12:28         ` Oliver
2012-08-30 12:39           ` Oliver
2012-08-30 16:22           ` Pablo Neira Ayuso
2012-08-30 17:49             ` Oliver
2012-08-30 18:39               ` Pablo Neira Ayuso
2012-08-31  0:19                 ` Oliver
2012-08-31  9:27                   ` Pablo Neira Ayuso

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9829027.jsWF3AolUh@gentoovm \
    --to=olipro@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pablo@netfilter.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.