netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: gregkh@linuxfoundation.org, sashal@kernel.org, stable@vger.kernel.org
Subject: [PATCH -stable,4.19.x 06/40] netfilter: nft_set_rbtree: Switch to node list walk for overlap detection
Date: Thu, 13 Jun 2024 03:01:35 +0200	[thread overview]
Message-ID: <20240613010209.104423-7-pablo@netfilter.org> (raw)
In-Reply-To: <20240613010209.104423-1-pablo@netfilter.org>

commit c9e6978e2725a7d4b6cd23b2facd3f11422c0643 upstream.

...instead of a tree descent, which became overly complicated in an
attempt to cover cases where expired or inactive elements would affect
comparisons with the new element being inserted.

Further, it turned out that it's probably impossible to cover all those
cases, as inactive nodes might entirely hide subtrees consisting of a
complete interval plus a node that makes the current insertion not
overlap.

To speed up the overlap check, descent the tree to find a greater
element that is closer to the key value to insert. Then walk down the
node list for overlap detection. Starting the overlap check from
rb_first() unconditionally is slow, it takes 10 times longer due to the
full linear traversal of the list.

Moreover, perform garbage collection of expired elements when walking
down the node list to avoid bogus overlap reports.

For the insertion operation itself, this essentially reverts back to the
implementation before commit 7c84d41416d8 ("netfilter: nft_set_rbtree:
Detect partial overlaps on insertion"), except that cases of complete
overlap are already handled in the overlap detection phase itself, which
slightly simplifies the loop to find the insertion point.

Based on initial patch from Stefano Brivio, including text from the
original patch description too.

Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 223 +++++++++++++++++++++++++++++----
 1 file changed, 198 insertions(+), 25 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 43c9cd5d6078..f8d98547df7a 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -41,10 +41,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
 	return !nft_rbtree_interval_end(rbe);
 }
 
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
-			     const struct nft_rbtree_elem *interval)
+static int nft_rbtree_cmp(const struct nft_set *set,
+			  const struct nft_rbtree_elem *e1,
+			  const struct nft_rbtree_elem *e2)
 {
-	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+	return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+		      set->klen);
 }
 
 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -55,7 +57,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
 	u8 genmask = nft_genmask_cur(net);
 	const struct rb_node *parent;
-	const void *this;
 	int d;
 
 	parent = rcu_dereference_raw(priv->root.rb_node);
@@ -65,12 +66,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 
-		this = nft_set_ext_key(&rbe->ext);
-		d = memcmp(this, key, set->klen);
+		d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
 		if (d < 0) {
 			parent = rcu_dereference_raw(parent->rb_left);
 			if (interval &&
-			    nft_rbtree_equal(set, this, interval) &&
+			    !nft_rbtree_cmp(set, rbe, interval) &&
 			    nft_rbtree_interval_end(rbe) &&
 			    nft_rbtree_interval_start(interval))
 				continue;
@@ -217,43 +217,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
 	return rbe;
 }
 
+static int nft_rbtree_gc_elem(const struct nft_set *__set,
+			      struct nft_rbtree *priv,
+			      struct nft_rbtree_elem *rbe)
+{
+	struct nft_set *set = (struct nft_set *)__set;
+	struct rb_node *prev = rb_prev(&rbe->node);
+	struct nft_rbtree_elem *rbe_prev;
+	struct nft_set_gc_batch *gcb;
+
+	gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
+	if (!gcb)
+		return -ENOMEM;
+
+	/* search for expired end interval coming before this element. */
+	do {
+		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+		if (nft_rbtree_interval_end(rbe_prev))
+			break;
+
+		prev = rb_prev(prev);
+	} while (prev != NULL);
+
+	rb_erase(&rbe_prev->node, &priv->root);
+	rb_erase(&rbe->node, &priv->root);
+	atomic_sub(2, &set->nelems);
+
+	nft_set_gc_batch_add(gcb, rbe);
+	nft_set_gc_batch_complete(gcb);
+
+	return 0;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+				    struct nft_rbtree_elem *rbe,
+				    struct rb_node *first)
+{
+	struct nft_rbtree_elem *first_elem;
+
+	first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+	/* this element is closest to where the new element is to be inserted:
+	 * update the first element for the node list path.
+	 */
+	if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+		return true;
+
+	return false;
+}
+
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
 			       struct nft_set_ext **ext)
 {
+	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+	struct rb_node *node, *parent, **p, *first = NULL;
 	struct nft_rbtree *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_rbtree_elem *rbe;
-	struct rb_node *parent, **p;
-	int d;
+	int d, err;
 
+	/* Descend the tree to search for an existing element greater than the
+	 * key value to insert that is greater than the new element. This is the
+	 * first element to walk the ordered elements to find possible overlap.
+	 */
 	parent = NULL;
 	p = &priv->root.rb_node;
 	while (*p != NULL) {
 		parent = *p;
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-		d = memcmp(nft_set_ext_key(&rbe->ext),
-			   nft_set_ext_key(&new->ext),
-			   set->klen);
-		if (d < 0)
+		d = nft_rbtree_cmp(set, rbe, new);
+
+		if (d < 0) {
 			p = &parent->rb_left;
-		else if (d > 0)
+		} else if (d > 0) {
+			if (!first ||
+			    nft_rbtree_update_first(set, rbe, first))
+				first = &rbe->node;
+
 			p = &parent->rb_right;
-		else {
-			if (nft_rbtree_interval_end(rbe) &&
-			    nft_rbtree_interval_start(new)) {
+		} else {
+			if (nft_rbtree_interval_end(rbe))
 				p = &parent->rb_left;
-			} else if (nft_rbtree_interval_start(rbe) &&
-				   nft_rbtree_interval_end(new)) {
+			else
 				p = &parent->rb_right;
-			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
-				*ext = &rbe->ext;
-				return -EEXIST;
-			} else {
-				p = &parent->rb_left;
+		}
+	}
+
+	if (!first)
+		first = rb_first(&priv->root);
+
+	/* Detect overlap by going through the list of valid tree nodes.
+	 * Values stored in the tree are in reversed order, starting from
+	 * highest to lowest value.
+	 */
+	for (node = first; node != NULL; node = rb_next(node)) {
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+		if (!nft_set_elem_active(&rbe->ext, genmask))
+			continue;
+
+		/* perform garbage collection to avoid bogus overlap reports. */
+		if (nft_set_elem_expired(&rbe->ext)) {
+			err = nft_rbtree_gc_elem(set, priv, rbe);
+			if (err < 0)
+				return err;
+
+			continue;
+		}
+
+		d = nft_rbtree_cmp(set, rbe, new);
+		if (d == 0) {
+			/* Matching end element: no need to look for an
+			 * overlapping greater or equal element.
+			 */
+			if (nft_rbtree_interval_end(rbe)) {
+				rbe_le = rbe;
+				break;
+			}
+
+			/* first element that is greater or equal to key value. */
+			if (!rbe_ge) {
+				rbe_ge = rbe;
+				continue;
+			}
+
+			/* this is a closer more or equal element, update it. */
+			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+				rbe_ge = rbe;
+				continue;
 			}
+
+			/* element is equal to key value, make sure flags are
+			 * the same, an existing more or equal start element
+			 * must not be replaced by more or equal end element.
+			 */
+			if ((nft_rbtree_interval_start(new) &&
+			     nft_rbtree_interval_start(rbe_ge)) ||
+			    (nft_rbtree_interval_end(new) &&
+			     nft_rbtree_interval_end(rbe_ge))) {
+				rbe_ge = rbe;
+				continue;
+			}
+		} else if (d > 0) {
+			/* annotate element greater than the new element. */
+			rbe_ge = rbe;
+			continue;
+		} else if (d < 0) {
+			/* annotate element less than the new element. */
+			rbe_le = rbe;
+			break;
 		}
 	}
+
+	/* - new start element matching existing start element: full overlap
+	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+	 */
+	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+		*ext = &rbe_ge->ext;
+		return -EEXIST;
+	}
+
+	/* - new end element matching existing end element: full overlap
+	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+	 */
+	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+		*ext = &rbe_le->ext;
+		return -EEXIST;
+	}
+
+	/* - new start element with existing closest, less or equal key value
+	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
+	 *   Anonymous sets allow for two consecutive start element since they
+	 *   are constant, skip them to avoid bogus overlap reports.
+	 */
+	if (!nft_set_is_anonymous(set) && rbe_le &&
+	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+		return -ENOTEMPTY;
+
+	/* - new end element with existing closest, less or equal key value
+	 *   being a end element: partial overlap, reported as -ENOTEMPTY.
+	 */
+	if (rbe_le &&
+	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
+		return -ENOTEMPTY;
+
+	/* - new end element with existing closest, greater or equal key value
+	 *   being an end element: partial overlap, reported as -ENOTEMPTY
+	 */
+	if (rbe_ge &&
+	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+		return -ENOTEMPTY;
+
+	/* Accepted element: pick insertion point depending on key value */
+	parent = NULL;
+	p = &priv->root.rb_node;
+	while (*p != NULL) {
+		parent = *p;
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+		d = nft_rbtree_cmp(set, rbe, new);
+
+		if (d < 0)
+			p = &parent->rb_left;
+		else if (d > 0)
+			p = &parent->rb_right;
+		else if (nft_rbtree_interval_end(rbe))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+
 	rb_link_node_rcu(&new->node, parent, p);
 	rb_insert_color(&new->node, &priv->root);
 	return 0;
-- 
2.30.2


  parent reply	other threads:[~2024-06-13  1:02 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-13  1:01 [PATCH -stable,4.19.x 00/40] Netfilter fixes for -stable Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 01/40] netfilter: nf_tables: pass context to nft_set_destroy() Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 02/40] netfilter: nftables: rename set element data activation/deactivation functions Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 03/40] netfilter: nf_tables: drop map element references from preparation phase Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 04/40] netfilter: nft_set_rbtree: allow loose matching of closing element in interval Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 05/40] netfilter: nft_set_rbtree: Add missing expired checks Pablo Neira Ayuso
2024-06-13  1:01 ` Pablo Neira Ayuso [this message]
2024-06-13  1:01 ` [PATCH -stable,4.19.x 07/40] netfilter: nft_set_rbtree: fix null deref on element insertion Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 08/40] netfilter: nft_set_rbtree: fix overlap expiration walk Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 09/40] netfilter: nf_tables: don't skip expired elements during walk Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 10/40] netfilter: nf_tables: GC transaction API to avoid race with control plane Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 11/40] netfilter: nf_tables: adapt set backend to use GC transaction API Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 12/40] netfilter: nf_tables: remove busy mark and gc batch API Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 13/40] netfilter: nf_tables: fix GC transaction races with netns and netlink event exit path Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 14/40] netfilter: nf_tables: GC transaction race with netns dismantle Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 15/40] netfilter: nf_tables: GC transaction race with abort path Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 16/40] netfilter: nf_tables: defer gc run if previous batch is still pending Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 17/40] netfilter: nft_set_rbtree: skip sync GC for new elements in this transaction Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 18/40] netfilter: nft_set_rbtree: use read spinlock to avoid datapath contention Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 19/40] netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 20/40] netfilter: nf_tables: fix memleak when more than 255 elements expired Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 21/40] netfilter: nf_tables: unregister flowtable hooks on netns exit Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 22/40] netfilter: nf_tables: double hook unregistration in netns path Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 23/40] netfilter: nftables: update table flags from the commit phase Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 24/40] netfilter: nf_tables: fix table flag updates Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 25/40] netfilter: nf_tables: disable toggling dormant table state more than once Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 26/40] netfilter: nf_tables: bogus EBUSY when deleting flowtable after flush (for 4.19) Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 27/40] netfilter: nft_dynset: fix timeouts later than 23 days Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 28/40] netfilter: nftables: exthdr: fix 4-byte stack OOB write Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 29/40] netfilter: nft_dynset: report EOPNOTSUPP on missing set feature Pablo Neira Ayuso
2024-06-13  1:01 ` [PATCH -stable,4.19.x 30/40] netfilter: nft_dynset: relax superfluous check on set updates Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 31/40] netfilter: nf_tables: mark newset as dead on transaction abort Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 32/40] netfilter: nf_tables: skip dead set elements in netlink dump Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 33/40] netfilter: nf_tables: validate NFPROTO_* family Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 34/40] netfilter: nft_set_rbtree: skip end interval element from gc Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 35/40] netfilter: nf_tables: set dormant flag on hook register failure Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 36/40] netfilter: nf_tables: allow NFPROTO_INET in nft_(match/target)_validate() Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 37/40] netfilter: nf_tables: do not compare internal table flags on updates Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 38/40] netfilter: nf_tables: mark set as dead when unbinding anonymous set with timeout Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 39/40] netfilter: nf_tables: reject new basechain after table flag update Pablo Neira Ayuso
2024-06-13  1:02 ` [PATCH -stable,4.19.x 40/40] netfilter: nf_tables: discard table flag update with pending basechain deletion Pablo Neira Ayuso
2024-06-13  6:43 ` [PATCH -stable,4.19.x 00/40] Netfilter fixes for -stable Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240613010209.104423-7-pablo@netfilter.org \
    --to=pablo@netfilter.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=sashal@kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).