From: Pablo Neira Ayuso <pablo@netfilter.org>
To: Florian Westphal <fw@strlen.de>
Cc: netfilter-devel@vger.kernel.org
Subject: Re: [PATCH nf] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure
Date: Thu, 28 Sep 2023 15:38:12 +0200 [thread overview]
Message-ID: <ZRWBxJBxQ4z7QYVo@calendula> (raw)
In-Reply-To: <20230928131247.3391-1-fw@strlen.de>
On Thu, Sep 28, 2023 at 03:12:44PM +0200, Florian Westphal wrote:
> nft_rbtree_gc_elem() walks back and removes the end interval element that
> comes before the expired element.
>
> There is a small chance that we've cached this element as 'rbe_ge'.
> If this happens, we hold and test a pointer that has been queued for
> freeing.
>
> It also causes spurious insertion failures:
>
> $ cat nft-test.20230921-143934.826.dMBwHt/test-testcases-sets-0044interval_overlap_0.1/testout.log
> Error: Could not process rule: File exists
> add element t s { 0 - 2 }
> ^^^^^^
> Failed to insert 0 - 2 given:
> table ip t {
> set s {
> type inet_service
> flags interval,timeout
> timeout 2s
> gc-interval 2s
> }
> }
>
> The set (rbtree) is empty. The 'failure' doesn't happen on next attempt.
>
> Reason is that when we try to insert, the tree may hold an expired
> element that collides with the range we're adding.
> While we do evict/erase this element, we can trip over this check:
>
> if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
> return -ENOTEMPTY;
>
> rbe_ge was erased by the synchronous gc, we should not have done this
> check. Next attempt won't find it, so retry results in successful
> insertion.
>
> Restart in-kernel to avoid such spurious errors.
>
> Such restarts are rare, unless userspace intentionally adds very large
> numbers of elements with very short timeouts while setting a huge
> gc interval.
>
> Even in this case, this cannot loop forever; on each retry an existing
> element has been removed.
>
> As the caller is holding the transaction mutex, it's impossible
> for a second entity to add more expiring elements to the tree.
>
> After this it also becomes feasible to remove the async gc worker
> and perform all garbage collection from the commit path.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> Changes since v1:
> - restart entire insertion process in case we remove
> element that we held as lesser/greater overlap detection
> edge candidate.
Thanks, I am still looking at finding a way to move this to .commit,
if no better solution, then let's get this in for the next round.
> net/netfilter/nft_set_rbtree.c | 46 +++++++++++++++++++++-------------
> 1 file changed, 29 insertions(+), 17 deletions(-)
>
> diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
> index 487572dcd614..2660ceab3759 100644
> --- a/net/netfilter/nft_set_rbtree.c
> +++ b/net/netfilter/nft_set_rbtree.c
> @@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
> rb_erase(&rbe->node, &priv->root);
> }
>
> -static int nft_rbtree_gc_elem(const struct nft_set *__set,
> - struct nft_rbtree *priv,
> - struct nft_rbtree_elem *rbe,
> - u8 genmask)
> +static const struct nft_rbtree_elem *
> +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
> + struct nft_rbtree_elem *rbe, u8 genmask)
> {
> struct nft_set *set = (struct nft_set *)__set;
> struct rb_node *prev = rb_prev(&rbe->node);
> @@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
>
> gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
> if (!gc)
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
>
> /* search for end interval coming before this element.
> * end intervals don't carry a timeout extension, they
> @@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
> prev = rb_prev(prev);
> }
>
> + rbe_prev = NULL;
> if (prev) {
> rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
> nft_rbtree_gc_remove(net, set, priv, rbe_prev);
> @@ -272,7 +272,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
> */
> gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
> if (WARN_ON_ONCE(!gc))
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
>
> nft_trans_gc_elem_add(gc, rbe_prev);
> }
> @@ -280,13 +280,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
> nft_rbtree_gc_remove(net, set, priv, rbe);
> gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
> if (WARN_ON_ONCE(!gc))
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
>
> nft_trans_gc_elem_add(gc, rbe);
>
> nft_trans_gc_queue_sync_done(gc);
>
> - return 0;
> + return rbe_prev;
> }
>
> static bool nft_rbtree_update_first(const struct nft_set *set,
> @@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> struct nft_rbtree *priv = nft_set_priv(set);
> u8 cur_genmask = nft_genmask_cur(net);
> u8 genmask = nft_genmask_next(net);
> - int d, err;
> + int d;
>
> /* Descend the tree to search for an existing element greater than the
> * key value to insert that is greater than the new element. This is the
> @@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> */
> if (nft_set_elem_expired(&rbe->ext) &&
> nft_set_elem_active(&rbe->ext, cur_genmask)) {
> - err = nft_rbtree_gc_elem(set, priv, rbe, genmask);
> - if (err < 0)
> - return err;
> + const struct nft_rbtree_elem *removed_end;
> +
> + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask);
> + if (IS_ERR(removed_end))
> + return PTR_ERR(removed_end);
> +
> + if (removed_end == rbe_le || removed_end == rbe_ge)
> + return -EAGAIN;
>
> continue;
> }
> @@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> struct nft_rbtree_elem *rbe = elem->priv;
> int err;
>
> - write_lock_bh(&priv->lock);
> - write_seqcount_begin(&priv->count);
> - err = __nft_rbtree_insert(net, set, rbe, ext);
> - write_seqcount_end(&priv->count);
> - write_unlock_bh(&priv->lock);
> + do {
> + if (fatal_signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + write_lock_bh(&priv->lock);
> + write_seqcount_begin(&priv->count);
> + err = __nft_rbtree_insert(net, set, rbe, ext);
> + write_seqcount_end(&priv->count);
> + write_unlock_bh(&priv->lock);
> + } while (err == -EAGAIN);
>
> return err;
> }
> --
> 2.41.0
>
next prev parent reply other threads:[~2023-09-28 13:38 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-28 13:12 [PATCH nf] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure Florian Westphal
2023-09-28 13:38 ` Pablo Neira Ayuso [this message]
2023-09-28 13:49 ` Florian Westphal
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ZRWBxJBxQ4z7QYVo@calendula \
--to=pablo@netfilter.org \
--cc=fw@strlen.de \
--cc=netfilter-devel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).