netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] netfilter: nf_tables: fix racy rule deletion
@ 2014-01-25 13:03 Pablo Neira Ayuso
  2014-01-25 13:55 ` Patrick McHardy
  2014-02-05 15:48 ` Patrick McHardy
  0 siblings, 2 replies; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-01-25 13:03 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber, arturo.borrero.glez

We may lose the race if we flush the rule-set (which happens asynchronously
via call_rcu) and then try to remove the table (which userspace assumes
to be empty).

Fix this by restoring synchronous rule and chain deletion. Asynchronous
deletion was introduced some time ago, before we had batch support, when
synchronous rule deletion performance was not good. Now that we have
batch support, we can just postpone the purge of the old rules to a
second step in the commit phase. All object deletions are synchronous
after this patch.

As a side effect, we save memory as we don't need rcu_head per rule
anymore.

Cc: Patrick McHardy <kaber@trash.net>
Reported-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
We still have a bug somewhere else. When creating 10000 rules like:
tcp dport { 22, 23 }, I can see more than 10000 sets.

# ./nft-set-get ip | wc -l
10016

It seems set 511 is not being used. See below:

# ./nft-rule-get
ip filter output 513 512
  [ payload load 1b @ network header + 9 => reg 1 ]
  [ cmp eq reg 1 0x00000006 ]
  [ payload load 2b @ transport header + 2 => reg 1 ]
  [ lookup reg 1 set set510 ]
  [ counter pkts 0 bytes 0 ]

ip filter output 514 513
  [ payload load 1b @ network header + 9 => reg 1 ]
  [ cmp eq reg 1 0x00000006 ]
  [ payload load 2b @ transport header + 2 => reg 1 ]
  [ lookup reg 1 set set512 ]
  [ counter pkts 0 bytes 0 ]

It seems to happen every 512 sets are added. Still investigating, so
this needs a second follow up patch to resolve what Arturo is reporting.

 include/net/netfilter/nf_tables.h |    4 ----
 net/netfilter/nf_tables_api.c     |   40 +++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 57c8ff7..ec5e885 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -320,7 +320,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
  *	struct nft_rule - nf_tables rule
  *
  *	@list: used internally
- *	@rcu_head: used internally for rcu
  *	@handle: rule handle
  *	@genmask: generation mask
  *	@dlen: length of expression data
@@ -328,7 +327,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
  */
 struct nft_rule {
 	struct list_head		list;
-	struct rcu_head			rcu_head;
 	u64				handle:46,
 					genmask:2,
 					dlen:16;
@@ -389,7 +387,6 @@ enum nft_chain_flags {
  *
  *	@rules: list of rules in the chain
  *	@list: used internally
- *	@rcu_head: used internally
  *	@net: net namespace that this chain belongs to
  *	@table: table that this chain belongs to
  *	@handle: chain handle
@@ -401,7 +398,6 @@ enum nft_chain_flags {
 struct nft_chain {
 	struct list_head		rules;
 	struct list_head		list;
-	struct rcu_head			rcu_head;
 	struct net			*net;
 	struct nft_table		*table;
 	u64				handle;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 117bbaa..a74ee39 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1008,10 +1008,8 @@ notify:
 	return 0;
 }
 
-static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
+static void nf_tables_chain_destroy(struct nft_chain *chain)
 {
-	struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head);
-
 	BUG_ON(chain->use > 0);
 
 	if (chain->flags & NFT_BASE_CHAIN) {
@@ -1059,7 +1057,9 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 			       family);
 
 	/* Make sure all rule references are gone before this is released */
-	call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy);
+	synchronize_rcu();
+
+	nf_tables_chain_destroy(chain);
 	return 0;
 }
 
@@ -1521,9 +1521,8 @@ err:
 	return err;
 }
 
-static void nf_tables_rcu_rule_destroy(struct rcu_head *head)
+static void nf_tables_rule_destroy(struct nft_rule *rule)
 {
-	struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head);
 	struct nft_expr *expr;
 
 	/*
@@ -1538,11 +1537,6 @@ static void nf_tables_rcu_rule_destroy(struct rcu_head *head)
 	kfree(rule);
 }
 
-static void nf_tables_rule_destroy(struct nft_rule *rule)
-{
-	call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy);
-}
-
 #define NFT_RULE_MAXEXPRS	128
 
 static struct nft_expr_info *info;
@@ -1809,9 +1803,6 @@ static int nf_tables_commit(struct sk_buff *skb)
 	synchronize_rcu();
 
 	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-		/* Delete this rule from the dirty list */
-		list_del(&rupd->list);
-
 		/* This rule was inactive in the past and just became active.
 		 * Clear the next bit of the genmask since its meaning has
 		 * changed, now it is the future.
@@ -1822,6 +1813,7 @@ static int nf_tables_commit(struct sk_buff *skb)
 					      rupd->chain, rupd->rule,
 					      NFT_MSG_NEWRULE, 0,
 					      rupd->family);
+			list_del(&rupd->list);
 			kfree(rupd);
 			continue;
 		}
@@ -1831,7 +1823,15 @@ static int nf_tables_commit(struct sk_buff *skb)
 		nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
 				      rupd->rule, NFT_MSG_DELRULE, 0,
 				      rupd->family);
+	}
+
+	/* Make sure we don't see any packet traversing old rules */
+	synchronize_rcu();
+
+	/* Now we can safely release unused old rules */
+	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
 		nf_tables_rule_destroy(rupd->rule);
+		list_del(&rupd->list);
 		kfree(rupd);
 	}
 
@@ -1844,20 +1844,26 @@ static int nf_tables_abort(struct sk_buff *skb)
 	struct nft_rule_trans *rupd, *tmp;
 
 	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-		/* Delete all rules from the dirty list */
-		list_del(&rupd->list);
-
 		if (!nft_rule_is_active_next(net, rupd->rule)) {
 			nft_rule_clear(net, rupd->rule);
+			list_del(&rupd->list);
 			kfree(rupd);
 			continue;
 		}
 
 		/* This rule is inactive, get rid of it */
 		list_del_rcu(&rupd->rule->list);
+	}
+
+	/* Make sure we don't see any packet accessing aborted rules */
+	synchronize_rcu();
+
+	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
 		nf_tables_rule_destroy(rupd->rule);
+		list_del(&rupd->list);
 		kfree(rupd);
 	}
+
 	return 0;
 }
 
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 13:03 [PATCH] netfilter: nf_tables: fix racy rule deletion Pablo Neira Ayuso
@ 2014-01-25 13:55 ` Patrick McHardy
  2014-01-25 16:35   ` Patrick McHardy
  2014-02-05 15:48 ` Patrick McHardy
  1 sibling, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2014-01-25 13:55 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> We may lost race if we flush the rule-set (which happens asynchronously
> via call_rcu) and we try to remove the table (that userspace assumes
> to be empty).
> 
> Fix this by recovering synchronous rule and chain deletion. This was
> introduced time ago before we had no batch support, and synchronous
> rule deletion performance was not good. Now that we have the batch
> support, we can just postpone the purge of old rule in a second step
> in the commit phase. All object deletions are synchronous after this
> patch.
> 
> As a side effect, we save memory as we don't need rcu_head per rule
> anymore.
> 
> Cc: Patrick McHardy <kaber@trash.net>
> Reported-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> We still have a bug somewhere else. When creating 10000 rules like:
> tcp dport { 22, 23 }, I can see more than 10000 sets.
> 
> # ./nft-set-get ip | wc -l
> 10016
> 
> It seems set 511 is not being used. See below:
> 
> # ./nft-rule-get
> ip filter output 513 512
>   [ payload load 1b @ network header + 9 => reg 1 ]
>   [ cmp eq reg 1 0x00000006 ]
>   [ payload load 2b @ transport header + 2 => reg 1 ]
>   [ lookup reg 1 set set510 ]
>   [ counter pkts 0 bytes 0 ]
> 
> ip filter output 514 513
>   [ payload load 1b @ network header + 9 => reg 1 ]
>   [ cmp eq reg 1 0x00000006 ]
>   [ payload load 2b @ transport header + 2 => reg 1 ]
>   [ lookup reg 1 set set512 ]
>   [ counter pkts 0 bytes 0 ]
> 
> It seems to happen every 512 sets are added. Still investigating, so
> this needs a second follow up patch to resolve what Arturo is reporting.

Yeah, we seem to have a couple of bugs in nf_tables_set_alloc_name().
I'll fix them up and will then have a look at this patch.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 13:55 ` Patrick McHardy
@ 2014-01-25 16:35   ` Patrick McHardy
  2014-01-25 17:14     ` Patrick McHardy
  0 siblings, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2014-01-25 16:35 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Sat, Jan 25, 2014 at 01:55:52PM +0000, Patrick McHardy wrote:
> On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> > We still have a bug somewhere else. When creating 10000 rules like:
> > tcp dport { 22, 23 }, I can see more than 10000 sets.
> > 
> > # ./nft-set-get ip | wc -l
> > 10016
> > 
> > It seems set 511 is not being used. See below:
> > 
> > # ./nft-rule-get
> > ip filter output 513 512
> >   [ payload load 1b @ network header + 9 => reg 1 ]
> >   [ cmp eq reg 1 0x00000006 ]
> >   [ payload load 2b @ transport header + 2 => reg 1 ]
> >   [ lookup reg 1 set set510 ]
> >   [ counter pkts 0 bytes 0 ]
> > 
> > ip filter output 514 513
> >   [ payload load 1b @ network header + 9 => reg 1 ]
> >   [ cmp eq reg 1 0x00000006 ]
> >   [ payload load 2b @ transport header + 2 => reg 1 ]
> >   [ lookup reg 1 set set512 ]
> >   [ counter pkts 0 bytes 0 ]
> > 
> > It seems to happen every 512 sets are added. Still investigating, so
> > this needs a second follow up patch to resolve what Arturo is reporting.
> 
> Yeah, we seem to have a couple of bugs in nf_tables_set_alloc_name().
> I'll fix them up and will then have a look at this patch.

I can't reproduce the gaps in the name space, but we have an obvious
overflow since we're using BITS_PER_LONG * PAGE_SIZE instead of
BITS_PER_BYTE * PAGE_SIZE.

This shouldn't have affected your test case though since the overflow only
happens for more than 32768 sets.

Another thing is that our name allocation algorithm really sucks. It was
copied from dev_alloc_name(), but network device allocation doesn't happen
on the same scale as we might have. I'm considering switching to something
taking O(1). Basically, the name allocation is only useful for anonymous
sets anyway since in all other cases you need to manually populate them.
So if we switch to a prefix string that can't clash with user defined
names, we can simply use an incrementing 64 bit counter. So my proposal
would be to just use names starting with \0. Alternatively use a handle
instead of a name for anonymous sets.

The second upside is that its not possible anymore for the user to run
into unexpected EEXIST when using setN or mapN as name.

Thoughts?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 16:35   ` Patrick McHardy
@ 2014-01-25 17:14     ` Patrick McHardy
  2014-01-26  8:54       ` Pablo Neira Ayuso
  2014-02-05 23:02       ` Pablo Neira Ayuso
  0 siblings, 2 replies; 10+ messages in thread
From: Patrick McHardy @ 2014-01-25 17:14 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Sat, Jan 25, 2014 at 04:35:33PM +0000, Patrick McHardy wrote:
> On Sat, Jan 25, 2014 at 01:55:52PM +0000, Patrick McHardy wrote:
> > On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> > > We still have a bug somewhere else. When creating 10000 rules like:
> > > tcp dport { 22, 23 }, I can see more than 10000 sets.
> > > 
> > > # ./nft-set-get ip | wc -l
> > > 10016
> > > 
> > > It seems set 511 is not being used. See below:
> > > 
> > > # ./nft-rule-get
> > > ip filter output 513 512
> > >   [ payload load 1b @ network header + 9 => reg 1 ]
> > >   [ cmp eq reg 1 0x00000006 ]
> > >   [ payload load 2b @ transport header + 2 => reg 1 ]
> > >   [ lookup reg 1 set set510 ]
> > >   [ counter pkts 0 bytes 0 ]
> > > 
> > > ip filter output 514 513
> > >   [ payload load 1b @ network header + 9 => reg 1 ]
> > >   [ cmp eq reg 1 0x00000006 ]
> > >   [ payload load 2b @ transport header + 2 => reg 1 ]
> > >   [ lookup reg 1 set set512 ]
> > >   [ counter pkts 0 bytes 0 ]
> > > 
> > > It seems to happen every 512 sets are added. Still investigating, so
> > > this needs a second follow up patch to resolve what Arturo is reporting.
> > 
> > Yeah, we seem to have a couple of bugs in nf_tables_set_alloc_name().
> > I'll fix them up and will then have a look at this patch.
> 
> I can't reproduce the gaps in the name space, but we have an obvious
> overflow since we're using BITS_PER_LONG * PAGE_SIZE instead of BITS_PER_BYTE.
> 
> This shouldn't have affected your test case though since the overflow only
> happens for more than 32768 sets.
> 

As a start, please try this patch. It fixes the overflow, might also
fix your problem.

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9ce3053..e8c7437 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1989,13 +1992,13 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 
 			if (!sscanf(i->name, name, &tmp))
 				continue;
-			if (tmp < 0 || tmp > BITS_PER_LONG * PAGE_SIZE)
+			if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE)
 				continue;
 
 			set_bit(tmp, inuse);
 		}
 
-		n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE);
+		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
 		free_page((unsigned long)inuse);
 	}
 

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 17:14     ` Patrick McHardy
@ 2014-01-26  8:54       ` Pablo Neira Ayuso
  2014-01-26 12:23         ` Patrick McHardy
  2014-02-05 23:02       ` Pablo Neira Ayuso
  1 sibling, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-01-26  8:54 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, arturo.borrero.glez

[-- Attachment #1: Type: text/plain, Size: 4260 bytes --]

On Sat, Jan 25, 2014 at 05:14:51PM +0000, Patrick McHardy wrote:
> On Sat, Jan 25, 2014 at 04:35:33PM +0000, Patrick McHardy wrote:
> > On Sat, Jan 25, 2014 at 01:55:52PM +0000, Patrick McHardy wrote:
> > > On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> > > > We still have a bug somewhere else. When creating 10000 rules like:
> > > > tcp dport { 22, 23 }, I can see more than 10000 sets.
> > > > 
> > > > # ./nft-set-get ip | wc -l
> > > > 10016
> > > > 
> > > > It seems set 511 is not being used. See below:
> > > > 
> > > > # ./nft-rule-get
> > > > ip filter output 513 512
> > > >   [ payload load 1b @ network header + 9 => reg 1 ]
> > > >   [ cmp eq reg 1 0x00000006 ]
> > > >   [ payload load 2b @ transport header + 2 => reg 1 ]
> > > >   [ lookup reg 1 set set510 ]
> > > >   [ counter pkts 0 bytes 0 ]
> > > > 
> > > > ip filter output 514 513
> > > >   [ payload load 1b @ network header + 9 => reg 1 ]
> > > >   [ cmp eq reg 1 0x00000006 ]
> > > >   [ payload load 2b @ transport header + 2 => reg 1 ]
> > > >   [ lookup reg 1 set set512 ]
> > > >   [ counter pkts 0 bytes 0 ]
> > > > 
> > > > It seems to happen every 512 sets are added. Still investigating, so
> > > > this needs a second follow up patch to resolve what Arturo is reporting.
> > > 
> > > Yeah, we seem to have a couple of bugs in nf_tables_set_alloc_name().
> > > I'll fix them up and will then have a look at this patch.
> > 
> > I can't reproduce the gaps in the name space, but we have an obvious
> > overflow since we're using BITS_PER_LONG * PAGE_SIZE instead of BITS_PER_BYTE.
> > 
> > This shouldn't have affected your test case though since the overflow only
> > happens for more than 32768 sets.
> > 
> 
> As a start, please try this patch. It fixes the overflow, might also
> fix your problem.
> 
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 9ce3053..e8c7437 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -1989,13 +1992,13 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
>  
>  			if (!sscanf(i->name, name, &tmp))
>  				continue;
> -			if (tmp < 0 || tmp > BITS_PER_LONG * PAGE_SIZE)
> +			if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE)
>  				continue;
>  
>  			set_bit(tmp, inuse);
>  		}
>  
> -		n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE);
> +		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
>  		free_page((unsigned long)inuse);
>  	}
>  

Tested this patch, it works fine here, I hit -EMFILE with 32768 sets
with no crashes.

The problem I was reporting was different though: I found a bug in the
batching code of libmnl. The mnl_nlmsg_batch_next function was not
accounting for the last message when it did not fit in the batch.

With my patch + libmnl patch I can perform:

nft -f pablo-lots-test; nft flush table filter; nft delete chain filter output; nft delete table filter

without seeing unused anonymous sets left attached to the table and
-EBUSY problems in that table.

> Another thing is that our name allocation algorithm really sucks. It
> was copied from dev_alloc_name(), but network device allocation doesn't
> happen on the same scale as we might have. I'm considering switching to
> something taking O(1). Basically, the name allocation is only useful for
> anonymous sets anyway since in all other cases you need to manually populate
> them. So if we switch to a prefix string that can't clash with user defined
> names, we can simply use an incrementing 64 bit counter. So my
> proposal would be to just use names starting with \0. Alternatively use a
> handle instead of a name for anonymous sets.
>
> The second upside is that its not possible anymore for the user to run
> into unexpected EEXIST when using setN or mapN as name.
>
> Thoughts?

I like the u64 handle for anonymous sets, it's similar to what we do
with other objects, we get O(1) handle allocation.

I think we can allow both u64 and set%d, map%d.  The kernel can check
if the handle is available first, if not check if the name looks like
set%d, map%d (so the maximum number of sets limitation only
applies to that case). Userspace only needs to send both set%d and the
u64 handle.

Would you be OK with that?

[-- Attachment #2: libmnl.patch --]
[-- Type: text/x-diff, Size: 533 bytes --]

diff --git a/src/nlmsg.c b/src/nlmsg.c
index fdb7af8..0a414a7 100644
--- a/src/nlmsg.c
+++ b/src/nlmsg.c
@@ -484,14 +484,15 @@ EXPORT_SYMBOL(mnl_nlmsg_batch_stop);
 bool mnl_nlmsg_batch_next(struct mnl_nlmsg_batch *b)
 {
 	struct nlmsghdr *nlh = b->cur;
+	bool ret = true;
 
 	if (b->buflen + nlh->nlmsg_len > b->limit) {
 		b->overflow = true;
-		return false;
+		ret = false;
 	}
 	b->cur = b->buf + b->buflen + nlh->nlmsg_len;
 	b->buflen += nlh->nlmsg_len;
-	return true;
+	return ret;
 }
 EXPORT_SYMBOL(mnl_nlmsg_batch_next);
 

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-26  8:54       ` Pablo Neira Ayuso
@ 2014-01-26 12:23         ` Patrick McHardy
  0 siblings, 0 replies; 10+ messages in thread
From: Patrick McHardy @ 2014-01-26 12:23 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Sun, Jan 26, 2014 at 09:54:46AM +0100, Pablo Neira Ayuso wrote:
> On Sat, Jan 25, 2014 at 05:14:51PM +0000, Patrick McHardy wrote:
> > As a start, please try this patch. It fixes the overflow, might also
> > fix your problem.
> > ...
> 
> Tested this patch, it works fine here, I hit -EMFILE with 32768 sets
> with no crashes.

Thanks. I got another patch for this. Just RFC for now since I prefer
to get rid of this completely.

> The problem I was reporting was different though, I found a bug in the
> batching code of libmnl. The mnl_nlmsg_batch_next function was not
> accounting the last message not fitting in the batch.
> 
> With my patch + libmnl patch I can perform:
> 
> nft -f pablo-lots-test; nft flush table filter; nft delete chain filter output; nft delete table filter
> 
> without seeing unused anonymous sets left attached to the table and
> -EBUSY problems in that table.

Excellent.

> > Another thing is that our name allocation algorithm really sucks. It
> > was copied from dev_alloc_name(), but network device allocation doesn't
> > happen on the same scale as we might have. I'm considering switching to
> > something taking O(1). Basically, the name allocation is only useful for
> > anonymous sets anyway since in all other cases you need to manually populate
> > them. So if we switch to a prefix string that can't clash with user defined
> > names, we can simply use an incrementing 64 bit counter. So my
> > proposal would be to just use names starting with \0. Alternatively use a
> > handle instead of a name for anonymous sets.
> >
> > The second upside is that its not possible anymore for the user to run
> > into unexpected EEXIST when using setN or mapN as name.
> >
> > Thoughts?
> 
> I like the u64 handle for anonymous sets, it's similar to what we do
> with other objects, we get O(1) handle allocation.
> 
> I think we can allow both u64 and set%d, map%d.  The kernel can check
> if the handle is available first, if not check if the name looks like
> set%d, map%d (so the the maximum number of sets limitation only
> applies to that case). Userspace only needs to send both set%d and the
> u64 handle.
> 
> Would you be OK with that?

Yes, that was my thought as well. We can kill it off later if we want,
no need to keep compatibility with this very early version of nftables
for long.

I'll look into it once I've managed to tame my constantly growing TODO-list :)


commit 06d7a2f84bf1360a07768418f6c80b6476439d23
Author: Patrick McHardy <kaber@trash.net>
Date:   Sat Jan 25 18:24:17 2014 +0000

    netfilter: nf_tables: handle more than 8 * PAGE_SIZE set name allocations
    
    We currently have a limit of 8 * PAGE_SIZE anonymous sets. Lift that limit
    by continuing the scan if the entire page is exhausted.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e8c7437..f6b869b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1976,7 +1976,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 	const struct nft_set *i;
 	const char *p;
 	unsigned long *inuse;
-	unsigned int n = 0;
+	unsigned int n = 0, min = 0;
 
 	p = strnchr(name, IFNAMSIZ, '%');
 	if (p != NULL) {
@@ -1986,23 +1986,28 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 		inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
 		if (inuse == NULL)
 			return -ENOMEM;
-
+cont:
 		list_for_each_entry(i, &ctx->table->sets, list) {
 			int tmp;
 
 			if (!sscanf(i->name, name, &tmp))
 				continue;
-			if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE)
+			if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
 				continue;
 
-			set_bit(tmp, inuse);
+			set_bit(tmp - min, inuse);
 		}
 
 		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
+		if (n >= BITS_PER_BYTE * PAGE_SIZE) {
+			min += BITS_PER_BYTE * PAGE_SIZE;
+			memset(inuse, 0, PAGE_SIZE);
+			goto cont;
+		}
 		free_page((unsigned long)inuse);
 	}
 
-	snprintf(set->name, sizeof(set->name), name, n);
+	snprintf(set->name, sizeof(set->name), name, min + n);
 	list_for_each_entry(i, &ctx->table->sets, list) {
 		if (!strcmp(set->name, i->name))
 			return -ENFILE;

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 13:03 [PATCH] netfilter: nf_tables: fix racy rule deletion Pablo Neira Ayuso
  2014-01-25 13:55 ` Patrick McHardy
@ 2014-02-05 15:48 ` Patrick McHardy
  2014-02-05 16:38   ` Pablo Neira Ayuso
  1 sibling, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2014-02-05 15:48 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> As a side effect, we save memory as we don't need rcu_head per rule
> anymore.

We can also save some memory for now unnecessary families in the private
structs since we have the context available during destruction again.

> @@ -1809,9 +1803,6 @@ static int nf_tables_commit(struct sk_buff *skb)
>  	synchronize_rcu();
>  
>  	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> -		/* Delete this rule from the dirty list */
> -		list_del(&rupd->list);
> -
>  		/* This rule was inactive in the past and just became active.
>  		 * Clear the next bit of the genmask since its meaning has
>  		 * changed, now it is the future.
> @@ -1822,6 +1813,7 @@ static int nf_tables_commit(struct sk_buff *skb)
>  					      rupd->chain, rupd->rule,
>  					      NFT_MSG_NEWRULE, 0,
>  					      rupd->family);
> +			list_del(&rupd->list);
>  			kfree(rupd);
>  			continue;
>  		}
> @@ -1831,7 +1823,15 @@ static int nf_tables_commit(struct sk_buff *skb)
>  		nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
>  				      rupd->rule, NFT_MSG_DELRULE, 0,
>  				      rupd->family);
> +	}
> +
> +	/* Make sure we don't see any packet traversing old rules */
> +	synchronize_rcu();
> +
> +	/* Now we can safely release unused old rules */
> +	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
>  		nf_tables_rule_destroy(rupd->rule);
> +		list_del(&rupd->list);
>  		kfree(rupd);
>  	}
>  
> @@ -1844,20 +1844,26 @@ static int nf_tables_abort(struct sk_buff *skb)
>  	struct nft_rule_trans *rupd, *tmp;
>  
>  	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> -		/* Delete all rules from the dirty list */
> -		list_del(&rupd->list);
> -
>  		if (!nft_rule_is_active_next(net, rupd->rule)) {
>  			nft_rule_clear(net, rupd->rule);
> +			list_del(&rupd->list);
>  			kfree(rupd);
>  			continue;
>  		}
>  
>  		/* This rule is inactive, get rid of it */
>  		list_del_rcu(&rupd->rule->list);
> +	}
> +
> +	/* Make sure we don't see any packet accessing aborted rules */
> +	synchronize_rcu();
> +
> +	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
>  		nf_tables_rule_destroy(rupd->rule);
> +		list_del(&rupd->list);
>  		kfree(rupd);
>  	}

I have to admit this all seems slightly confusing to me; we now have three
synchronize_rcu()s in this function, are all those really needed?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-02-05 15:48 ` Patrick McHardy
@ 2014-02-05 16:38   ` Pablo Neira Ayuso
  2014-02-05 16:48     ` Patrick McHardy
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-02-05 16:38 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, arturo.borrero.glez

On Wed, Feb 05, 2014 at 03:48:46PM +0000, Patrick McHardy wrote:
> On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> > As a side effect, we save memory as we don't need rcu_head per rule
> > anymore.
> 
> We can also save some memory for now unnecessary families in the private
> structs since we have the context available during destruction again.

Right, that was only needed when we were using call_rcu. I'll
revisit that.

> > @@ -1809,9 +1803,6 @@ static int nf_tables_commit(struct sk_buff *skb)
> >  	synchronize_rcu();
> >  
> >  	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> > -		/* Delete this rule from the dirty list */
> > -		list_del(&rupd->list);
> > -
> >  		/* This rule was inactive in the past and just became active.
> >  		 * Clear the next bit of the genmask since its meaning has
> >  		 * changed, now it is the future.
> > @@ -1822,6 +1813,7 @@ static int nf_tables_commit(struct sk_buff *skb)
> >  					      rupd->chain, rupd->rule,
> >  					      NFT_MSG_NEWRULE, 0,
> >  					      rupd->family);
> > +			list_del(&rupd->list);
> >  			kfree(rupd);
> >  			continue;
> >  		}
> > @@ -1831,7 +1823,15 @@ static int nf_tables_commit(struct sk_buff *skb)
> >  		nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
> >  				      rupd->rule, NFT_MSG_DELRULE, 0,
> >  				      rupd->family);
> > +	}
> > +
> > +	/* Make sure we don't see any packet traversing old rules */
> > +	synchronize_rcu();
> > +
> > +	/* Now we can safely release unused old rules */
> > +	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> >  		nf_tables_rule_destroy(rupd->rule);
> > +		list_del(&rupd->list);
> >  		kfree(rupd);
> >  	}
> >  
> > @@ -1844,20 +1844,26 @@ static int nf_tables_abort(struct sk_buff *skb)
> >  	struct nft_rule_trans *rupd, *tmp;
> >  
> >  	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> > -		/* Delete all rules from the dirty list */
> > -		list_del(&rupd->list);
> > -
> >  		if (!nft_rule_is_active_next(net, rupd->rule)) {
> >  			nft_rule_clear(net, rupd->rule);
> > +			list_del(&rupd->list);
> >  			kfree(rupd);
> >  			continue;
> >  		}
> >  
> >  		/* This rule is inactive, get rid of it */
> >  		list_del_rcu(&rupd->rule->list);
> > +	}
> > +
> > +	/* Make sure we don't see any packet accessing aborted rules */
> > +	synchronize_rcu();
> > +
> > +	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
> >  		nf_tables_rule_destroy(rupd->rule);
> > +		list_del(&rupd->list);
> >  		kfree(rupd);
> >  	}
> 
> I have to admit this all seems slightly confusing to me, we now have three
> synhronize_rcu()s in this function, are all those really needed?

There are only two to separate the different stages. To my
understanding, the first one ensures that all packets have left the
previous generation before we start purging old rules. Then, the
second one makes sure that no packets are still checking the genmask
of the old rules that have just been deleted, so we can safely
release them.

Before this patch, we only needed one since we were using call_rcu
after deleting the rules from the list.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-02-05 16:38   ` Pablo Neira Ayuso
@ 2014-02-05 16:48     ` Patrick McHardy
  0 siblings, 0 replies; 10+ messages in thread
From: Patrick McHardy @ 2014-02-05 16:48 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arturo.borrero.glez

On Wed, Feb 05, 2014 at 05:38:06PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Feb 05, 2014 at 03:48:46PM +0000, Patrick McHardy wrote:
> > On Sat, Jan 25, 2014 at 02:03:51PM +0100, Pablo Neira Ayuso wrote:
> > > As a side effect, we save memory as we don't need rcu_head per rule
> > > anymore.
> > 
> > We can also save some memory for now unnecessary families in the private
> > structs since we have the context available during destruction again.
> 
> Right, that was only needed with when we were using call_rcu. I'll
> revisit that.

I already have a patch which does this for expressions which can now
use pkt->hook_ops->pf queued. Since its quite similar, I'll just add
it to my patch once your patch is in the tree.

> > I have to admit this all seems slightly confusing to me, we now have three
> > synhronize_rcu()s in this function, are all those really needed?
> 
> There are only two to separate the different stages. To my
> understanding, the first one ensures that all packets has left the
> previous generation before we start purging out old rules. Then, the
> second one makes sure that no packets are still checking the old rule
> genmask that have just been deleted, so we can safely release it.
> 
> Before this patch, we only needed one since we were using call_rcu
> after deleting the rules from the list.

I'll have another look now, thanks.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_tables: fix racy rule deletion
  2014-01-25 17:14     ` Patrick McHardy
  2014-01-26  8:54       ` Pablo Neira Ayuso
@ 2014-02-05 23:02       ` Pablo Neira Ayuso
  1 sibling, 0 replies; 10+ messages in thread
From: Pablo Neira Ayuso @ 2014-02-05 23:02 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, arturo.borrero.glez

> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -1989,13 +1992,13 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
>  
>  			if (!sscanf(i->name, name, &tmp))
>  				continue;
> -			if (tmp < 0 || tmp > BITS_PER_LONG * PAGE_SIZE)
> +			if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE)
>  				continue;
>  
>  			set_bit(tmp, inuse);
>  		}
>  
> -		n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE);
> +		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
>  		free_page((unsigned long)inuse);
>  	}
> 

Enqueued this chunk as fix for David, thanks Patrick.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-02-05 23:02 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-01-25 13:03 [PATCH] netfilter: nf_tables: fix racy rule deletion Pablo Neira Ayuso
2014-01-25 13:55 ` Patrick McHardy
2014-01-25 16:35   ` Patrick McHardy
2014-01-25 17:14     ` Patrick McHardy
2014-01-26  8:54       ` Pablo Neira Ayuso
2014-01-26 12:23         ` Patrick McHardy
2014-02-05 23:02       ` Pablo Neira Ayuso
2014-02-05 15:48 ` Patrick McHardy
2014-02-05 16:38   ` Pablo Neira Ayuso
2014-02-05 16:48     ` Patrick McHardy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).