All of lore.kernel.org
 help / color / mirror / Atom feed
From: Florian Westphal <fw@strlen.de>
To: <netdev@vger.kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>,
	<netfilter-devel@vger.kernel.org>,
	pablo@netfilter.org
Subject: [PATCH net-next 7/9] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Date: Wed, 28 Jan 2026 16:41:53 +0100	[thread overview]
Message-ID: <20260128154155.32143-8-fw@strlen.de> (raw)
In-Reply-To: <20260128154155.32143-1-fw@strlen.de>

From: Scott Mitchell <scott.k.mitch1@gmail.com>

The current implementation uses a linear list to find queued packets by
ID when processing verdicts from userspace. With large queue depths and
out-of-order verdicting, this O(n) lookup becomes a significant
bottleneck, causing userspace verdict processing to dominate CPU time.

Replace the linear search with a hash table for O(1) average-case
packet lookup by ID. A global rhashtable spanning all network
namespaces attributes hash bucket memory to kernel but is subject to
fixed upper bound.

Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_queue.h |   3 +
 net/netfilter/nfnetlink_queue.c  | 146 ++++++++++++++++++++++++-------
 2 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 4aeffddb7586..e6803831d6af 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -6,11 +6,13 @@
 #include <linux/ipv6.h>
 #include <linux/jhash.h>
 #include <linux/netfilter.h>
+#include <linux/rhashtable-types.h>
 #include <linux/skbuff.h>
 
 /* Each queued (to userspace) skbuff has one of these. */
 struct nf_queue_entry {
 	struct list_head	list;
+	struct rhash_head	hash_node;
 	struct sk_buff		*skb;
 	unsigned int		id;
 	unsigned int		hook_index;	/* index in hook_entries->hook[] */
@@ -20,6 +22,7 @@ struct nf_queue_entry {
 #endif
 	struct nf_hook_state	state;
 	u16			size; /* sizeof(entry) + saved route keys */
+	u16			queue_num;
 
 	/* extra space to store route keys */
 };
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8fa0807973c9..671b52c652ef 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -30,6 +30,8 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/list.h>
 #include <linux/cgroup-defs.h>
+#include <linux/rhashtable.h>
+#include <linux/jhash.h>
 #include <net/gso.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
@@ -47,6 +49,8 @@
 #endif
 
 #define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_HASH_MIN     1024
+#define NFQNL_HASH_MAX     1048576
 
 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
  * includes the header length. Thus, the maximum packet length that we
@@ -56,6 +60,26 @@
  */
 #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
 
+/* Composite key for packet lookup: (net, queue_num, packet_id) */
+struct nfqnl_packet_key {
+	possible_net_t net;
+	u32 packet_id;
+	u16 queue_num;
+} __aligned(sizeof(u32));  /* jhash2 requires 32-bit alignment */
+
+/* Global rhashtable - one for entire system, all netns */
+static struct rhashtable nfqnl_packet_map __read_mostly;
+
+/* Helper to initialize composite key */
+static inline void nfqnl_init_key(struct nfqnl_packet_key *key,
+				  struct net *net, u32 packet_id, u16 queue_num)
+{
+	memset(key, 0, sizeof(*key));
+	write_pnet(&key->net, net);
+	key->packet_id = packet_id;
+	key->queue_num = queue_num;
+}
+
 struct nfqnl_instance {
 	struct hlist_node hlist;		/* global list of queues */
 	struct rcu_head rcu;
@@ -100,6 +124,39 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num)
 	return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
 }
 
+/* Extract composite key from nf_queue_entry for hashing */
+static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct nf_queue_entry *entry = data;
+	struct nfqnl_packet_key key;
+
+	nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num);
+
+	return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed);
+}
+
+/* Compare stack-allocated key against entry */
+static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg,
+				  const void *obj)
+{
+	const struct nfqnl_packet_key *key = arg->key;
+	const struct nf_queue_entry *entry = obj;
+
+	return !net_eq(entry->state.net, read_pnet(&key->net)) ||
+	       entry->queue_num != key->queue_num ||
+	       entry->id != key->packet_id;
+}
+
+static const struct rhashtable_params nfqnl_rhashtable_params = {
+	.head_offset = offsetof(struct nf_queue_entry, hash_node),
+	.key_len = sizeof(struct nfqnl_packet_key),
+	.obj_hashfn = nfqnl_packet_obj_hashfn,
+	.obj_cmpfn = nfqnl_packet_obj_cmpfn,
+	.automatic_shrinking = true,
+	.min_size = NFQNL_HASH_MIN,
+	.max_size = NFQNL_HASH_MAX,
+};
+
 static struct nfqnl_instance *
 instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
 {
@@ -188,33 +245,45 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
 	spin_unlock(&q->instances_lock);
 }
 
-static inline void
+static int
 __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
 {
-       list_add_tail(&entry->list, &queue->queue_list);
-       queue->queue_total++;
+	int err;
+
+	entry->queue_num = queue->queue_num;
+
+	err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node,
+				     nfqnl_rhashtable_params);
+	if (unlikely(err))
+		return err;
+
+	list_add_tail(&entry->list, &queue->queue_list);
+	queue->queue_total++;
+
+	return 0;
 }
 
 static void
 __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
 {
+	rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node,
+			       nfqnl_rhashtable_params);
 	list_del(&entry->list);
 	queue->queue_total--;
 }
 
 static struct nf_queue_entry *
-find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id,
+		   struct net *net)
 {
-	struct nf_queue_entry *entry = NULL, *i;
+	struct nfqnl_packet_key key;
+	struct nf_queue_entry *entry;
 
-	spin_lock_bh(&queue->lock);
+	nfqnl_init_key(&key, net, id, queue->queue_num);
 
-	list_for_each_entry(i, &queue->queue_list, list) {
-		if (i->id == id) {
-			entry = i;
-			break;
-		}
-	}
+	spin_lock_bh(&queue->lock);
+	entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key,
+				       nfqnl_rhashtable_params);
 
 	if (entry)
 		__dequeue_entry(queue, entry);
@@ -404,8 +473,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 	spin_lock_bh(&queue->lock);
 	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
 		if (!cmpfn || cmpfn(entry, data)) {
-			list_del(&entry->list);
-			queue->queue_total--;
+			__dequeue_entry(queue, entry);
 			nfqnl_reinject(entry, NF_DROP);
 		}
 	}
@@ -885,23 +953,23 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
 	if (nf_ct_drop_unconfirmed(entry))
 		goto err_out_free_nskb;
 
-	if (queue->queue_total >= queue->queue_maxlen) {
-		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
-			failopen = 1;
-			err = 0;
-		} else {
-			queue->queue_dropped++;
-			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
-					     queue->queue_total);
-		}
-		goto err_out_free_nskb;
-	}
+	if (queue->queue_total >= queue->queue_maxlen)
+		goto err_out_queue_drop;
+
 	entry->id = ++queue->id_sequence;
 	*packet_id_ptr = htonl(entry->id);
 
+	/* Insert into hash BEFORE unicast. If failure don't send to userspace. */
+	err = __enqueue_entry(queue, entry);
+	if (unlikely(err))
+		goto err_out_queue_drop;
+
 	/* nfnetlink_unicast will either free the nskb or add it to a socket */
 	err = nfnetlink_unicast(nskb, net, queue->peer_portid);
 	if (err < 0) {
+		/* Unicast failed - remove entry we just inserted */
+		__dequeue_entry(queue, entry);
+
 		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
 			failopen = 1;
 			err = 0;
@@ -911,11 +979,22 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
 		goto err_out_unlock;
 	}
 
-	__enqueue_entry(queue, entry);
-
 	spin_unlock_bh(&queue->lock);
 	return 0;
 
+err_out_queue_drop:
+	if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+		failopen = 1;
+		err = 0;
+	} else {
+		queue->queue_dropped++;
+
+		if (queue->queue_total >= queue->queue_maxlen)
+			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
+					     queue->queue_total);
+		else
+			net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
+	}
 err_out_free_nskb:
 	kfree_skb(nskb);
 err_out_unlock:
@@ -1427,7 +1506,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
 
 	verdict = ntohl(vhdr->verdict);
 
-	entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+	entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net);
 	if (entry == NULL)
 		return -ENOENT;
 
@@ -1774,10 +1853,14 @@ static int __init nfnetlink_queue_init(void)
 {
 	int status;
 
+	status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params);
+	if (status < 0)
+		return status;
+
 	status = register_pernet_subsys(&nfnl_queue_net_ops);
 	if (status < 0) {
 		pr_err("failed to register pernet ops\n");
-		goto out;
+		goto cleanup_rhashtable;
 	}
 
 	netlink_register_notifier(&nfqnl_rtnl_notifier);
@@ -1802,7 +1885,8 @@ static int __init nfnetlink_queue_init(void)
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 	unregister_pernet_subsys(&nfnl_queue_net_ops);
-out:
+cleanup_rhashtable:
+	rhashtable_destroy(&nfqnl_packet_map);
 	return status;
 }
 
@@ -1814,6 +1898,8 @@ static void __exit nfnetlink_queue_fini(void)
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 	unregister_pernet_subsys(&nfnl_queue_net_ops);
 
+	rhashtable_destroy(&nfqnl_packet_map);
+
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
-- 
2.52.0


  parent reply	other threads:[~2026-01-28 15:42 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-28 15:41 [PATCH net-next 0/9] netfilter: updates for net-next Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 1/9] netfilter: Add ctx pointer in nf_flow_skb_encap_protocol/nf_flow_ip4_tunnel_proto signature Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 2/9] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 3/9] netfilter: flowtable: Add IP6IP6 rx sw acceleration Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 4/9] netfilter: flowtable: Add IP6IP6 tx " Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 5/9] selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 6/9] netfilter: xt_time: use is_leap_year() helper Florian Westphal
2026-01-28 15:41 ` Florian Westphal [this message]
2026-01-28 15:41 ` [PATCH net-next 8/9] netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation Florian Westphal
2026-01-28 15:41 ` [PATCH net-next 9/9] selftests: netfilter: nft_queue.sh: add udp fraglist gro test case Florian Westphal
2026-01-29  5:03 ` [PATCH net-next 0/9] netfilter: updates for net-next Jakub Kicinski
2026-01-29  8:56   ` Florian Westphal
2026-01-29 10:08     ` Florian Westphal
2026-01-29 10:40       ` Paolo Abeni

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260128154155.32143-8-fw@strlen.de \
    --to=fw@strlen.de \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pablo@netfilter.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.