* [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
@ 2025-11-13 9:26 Scott Mitchell
2025-11-13 10:25 ` Eric Dumazet
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Scott Mitchell @ 2025-11-13 9:26 UTC (permalink / raw)
To: pablo
Cc: kadlec, fw, phil, davem, edumazet, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
The current implementation uses a linear list to find queued packets by
ID when processing verdicts from userspace. With large queue depths and
out-of-order verdicting, this O(n) lookup becomes a significant
bottleneck, causing userspace verdict processing to dominate CPU time.
Replace the linear search with a hash table for O(1) average-case
packet lookup by ID. The hash table size is configurable via the new
NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
two to enable efficient bitwise masking instead of modulo operations.
Unpatched kernels silently ignore the new attribute, maintaining
backward compatibility.
The existing list data structure is retained for operations requiring
linear iteration (e.g. flush, device down events). Hot fields
(queue_hash_mask, queue_hash pointer) are placed in the same cache line
as the spinlock and packet counters for optimal memory access patterns.
Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>
---
Changes in v2:
- Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
tables with vmalloc fallback (Florian Westphal)
- Remove incorrect comment about concurrent resizes - nfnetlink subsystem
mutex already serializes config operations (Florian Westphal)
- Fix style: remove unnecessary braces around single-line if (Florian Westphal)
include/net/netfilter/nf_queue.h | 1 +
.../uapi/linux/netfilter/nfnetlink_queue.h | 1 +
net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++--
3 files changed, 123 insertions(+), 8 deletions(-)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 4aeffddb7586..3d0def310523 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -11,6 +11,7 @@
/* Each queued (to userspace) skbuff has one of these. */
struct nf_queue_entry {
struct list_head list;
+ struct hlist_node hash_node;
struct sk_buff *skb;
unsigned int id;
unsigned int hook_index; /* index in hook_entries->hook[] */
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index efcb7c044a74..bc296a17e5aa 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -107,6 +107,7 @@ enum nfqnl_attr_config {
NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
NFQA_CFG_MASK, /* identify which flags to change */
NFQA_CFG_FLAGS, /* value of these flags (__u32) */
+ NFQA_CFG_HASH_SIZE, /* __u32 hash table size (rounded to power of 2) */
__NFQA_CFG_MAX
};
#define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8b7b39d8a109..f076609cac32 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -46,7 +46,10 @@
#include <net/netfilter/nf_conntrack.h>
#endif
-#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_MIN_HASH_SIZE 16
+#define NFQNL_DEFAULT_HASH_SIZE 1024
+#define NFQNL_MAX_HASH_SIZE 131072
/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
* includes the header length. Thus, the maximum packet length that we
@@ -65,6 +68,7 @@ struct nfqnl_instance {
unsigned int copy_range;
unsigned int queue_dropped;
unsigned int queue_user_dropped;
+ unsigned int queue_hash_size;
u_int16_t queue_num; /* number of this queue */
@@ -77,6 +81,8 @@ struct nfqnl_instance {
spinlock_t lock ____cacheline_aligned_in_smp;
unsigned int queue_total;
unsigned int id_sequence; /* 'sequence' of pkt ids */
+ unsigned int queue_hash_mask;
+ struct hlist_head *queue_hash;
struct list_head queue_list; /* packets in queue */
};
@@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
return net_generic(net, nfnl_queue_net_id);
}
+static inline unsigned int
+nfqnl_packet_hash(u32 id, unsigned int mask)
+{
+ return hash_32(id, 32) & mask;
+}
+
+static inline u32
+nfqnl_normalize_hash_size(u32 hash_size)
+{
+ /* Must be power of two for queue_hash_mask to work correctly.
+ * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
+ */
+ BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
+ !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
+ !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
+ NFQNL_MAX_HASH_SIZE > 1U << 31);
+
+ if (!hash_size)
+ return NFQNL_DEFAULT_HASH_SIZE;
+
+ /* Clamp to valid range before power of two to avoid overflow */
+ if (hash_size <= NFQNL_MIN_HASH_SIZE)
+ return NFQNL_MIN_HASH_SIZE;
+
+ if (hash_size >= NFQNL_MAX_HASH_SIZE)
+ return NFQNL_MAX_HASH_SIZE;
+
+ if (!is_power_of_2(hash_size))
+ hash_size = roundup_pow_of_two(hash_size);
+
+ return hash_size;
+}
+
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
@@ -114,13 +153,56 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
return NULL;
}
+static int
+nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
+{
+ struct hlist_head *new_hash, *old_hash;
+ struct nf_queue_entry *entry;
+ unsigned int h, hash_mask;
+
+ hash_size = nfqnl_normalize_hash_size(hash_size);
+ if (hash_size == inst->queue_hash_size)
+ return 0;
+
+ new_hash = kvcalloc(hash_size, sizeof(*new_hash), GFP_KERNEL_ACCOUNT);
+ if (!new_hash)
+ return -ENOMEM;
+
+ hash_mask = hash_size - 1;
+
+ for (h = 0; h < hash_size; h++)
+ INIT_HLIST_HEAD(&new_hash[h]);
+
+ spin_lock_bh(&inst->lock);
+
+ list_for_each_entry(entry, &inst->queue_list, list) {
+ /* No hlist_del() since old_hash will be freed and we hold lock */
+ h = nfqnl_packet_hash(entry->id, hash_mask);
+ hlist_add_head(&entry->hash_node, &new_hash[h]);
+ }
+
+ old_hash = inst->queue_hash;
+ inst->queue_hash_size = hash_size;
+ inst->queue_hash_mask = hash_mask;
+ inst->queue_hash = new_hash;
+
+ spin_unlock_bh(&inst->lock);
+
+ kvfree(old_hash);
+
+ return 0;
+}
+
static struct nfqnl_instance *
-instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
+ u32 hash_size)
{
struct nfqnl_instance *inst;
unsigned int h;
int err;
+ hash_size = nfqnl_normalize_hash_size(hash_size);
+
spin_lock(&q->instances_lock);
if (instance_lookup(q, queue_num)) {
err = -EEXIST;
@@ -133,11 +215,24 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
goto out_unlock;
}
+ inst->queue_hash = kvcalloc(hash_size, sizeof(*inst->queue_hash),
+ GFP_KERNEL_ACCOUNT);
+ if (!inst->queue_hash) {
+ kfree(inst);
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (h = 0; h < hash_size; h++)
+ INIT_HLIST_HEAD(&inst->queue_hash[h]);
+
inst->queue_num = queue_num;
inst->peer_portid = portid;
inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
inst->copy_range = NFQNL_MAX_COPY_RANGE;
inst->copy_mode = NFQNL_COPY_NONE;
+ inst->queue_hash_size = hash_size;
+ inst->queue_hash_mask = hash_size - 1;
spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
@@ -154,6 +249,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
return inst;
out_free:
+ kvfree(inst->queue_hash);
kfree(inst);
out_unlock:
spin_unlock(&q->instances_lock);
@@ -172,6 +268,7 @@ instance_destroy_rcu(struct rcu_head *head)
rcu_read_lock();
nfqnl_flush(inst, NULL, 0);
rcu_read_unlock();
+ kvfree(inst->queue_hash);
kfree(inst);
module_put(THIS_MODULE);
}
@@ -194,13 +291,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
static inline void
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
- list_add_tail(&entry->list, &queue->queue_list);
- queue->queue_total++;
+ unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
+
+ hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
+ list_add_tail(&entry->list, &queue->queue_list);
+ queue->queue_total++;
}
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
+ hlist_del(&entry->hash_node);
list_del(&entry->list);
queue->queue_total--;
}
@@ -209,10 +310,11 @@ static struct nf_queue_entry *
find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
{
struct nf_queue_entry *entry = NULL, *i;
+ unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
spin_lock_bh(&queue->lock);
- list_for_each_entry(i, &queue->queue_list, list) {
+ hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
if (i->id == id) {
entry = i;
break;
@@ -407,8 +509,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_lock_bh(&queue->lock);
list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue->queue_total--;
+ __dequeue_entry(queue, entry);
nfqnl_reinject(entry, NF_DROP);
}
}
@@ -1483,6 +1584,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
[NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
[NFQA_CFG_MASK] = { .type = NLA_U32 },
[NFQA_CFG_FLAGS] = { .type = NLA_U32 },
+ [NFQA_CFG_HASH_SIZE] = { .type = NLA_U32 },
};
static const struct nf_queue_handler nfqh = {
@@ -1495,11 +1597,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
{
struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+ u32 hash_size = 0;
struct nfqnl_msg_config_cmd *cmd = NULL;
struct nfqnl_instance *queue;
__u32 flags = 0, mask = 0;
int ret = 0;
+ if (nfqa[NFQA_CFG_HASH_SIZE])
+ hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
+
if (nfqa[NFQA_CFG_CMD]) {
cmd = nla_data(nfqa[NFQA_CFG_CMD]);
@@ -1559,11 +1665,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
goto err_out_unlock;
}
queue = instance_create(q, queue_num,
- NETLINK_CB(skb).portid);
+ NETLINK_CB(skb).portid, hash_size);
if (IS_ERR(queue)) {
ret = PTR_ERR(queue);
goto err_out_unlock;
}
+ hash_size = 0; /* avoid resize later in this function */
break;
case NFQNL_CFG_CMD_UNBIND:
if (!queue) {
@@ -1586,6 +1693,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
goto err_out_unlock;
}
+ if (hash_size > 0) {
+ ret = nfqnl_hash_resize(queue, hash_size);
+ if (ret)
+ goto err_out_unlock;
+ }
+
if (nfqa[NFQA_CFG_PARAMS]) {
struct nfqnl_msg_config_params *params =
nla_data(nfqa[NFQA_CFG_PARAMS]);
--
2.39.5 (Apple Git-154)
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 9:26 [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table Scott Mitchell
@ 2025-11-13 10:25 ` Eric Dumazet
2025-11-13 15:30 ` Scott Mitchell
2025-11-13 19:40 ` David Laight
2025-11-13 12:19 ` Florian Westphal
2025-11-13 13:50 ` [syzbot ci] " syzbot ci
2 siblings, 2 replies; 8+ messages in thread
From: Eric Dumazet @ 2025-11-13 10:25 UTC (permalink / raw)
To: Scott Mitchell
Cc: pablo, kadlec, fw, phil, davem, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
>
> The current implementation uses a linear list to find queued packets by
> ID when processing verdicts from userspace. With large queue depths and
> out-of-order verdicting, this O(n) lookup becomes a significant
> bottleneck, causing userspace verdict processing to dominate CPU time.
>
> Replace the linear search with a hash table for O(1) average-case
> packet lookup by ID. The hash table size is configurable via the new
> NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
> NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
> two to enable efficient bitwise masking instead of modulo operations.
> Unpatched kernels silently ignore the new attribute, maintaining
> backward compatibility.
>
> The existing list data structure is retained for operations requiring
> linear iteration (e.g. flush, device down events). Hot fields
> (queue_hash_mask, queue_hash pointer) are placed in the same cache line
> as the spinlock and packet counters for optimal memory access patterns.
>
> Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>
> ---
> Changes in v2:
> - Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
> tables with vmalloc fallback (Florian Westphal)
> - Remove incorrect comment about concurrent resizes - nfnetlink subsystem
> mutex already serializes config operations (Florian Westphal)
> - Fix style: remove unnecessary braces around single-line if (Florian Westphal)
>
> include/net/netfilter/nf_queue.h | 1 +
> .../uapi/linux/netfilter/nfnetlink_queue.h | 1 +
> net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++--
> 3 files changed, 123 insertions(+), 8 deletions(-)
>
> diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> index 4aeffddb7586..3d0def310523 100644
> --- a/include/net/netfilter/nf_queue.h
> +++ b/include/net/netfilter/nf_queue.h
> @@ -11,6 +11,7 @@
> /* Each queued (to userspace) skbuff has one of these. */
> struct nf_queue_entry {
> struct list_head list;
> + struct hlist_node hash_node;
> struct sk_buff *skb;
> unsigned int id;
> unsigned int hook_index; /* index in hook_entries->hook[] */
> diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
> index efcb7c044a74..bc296a17e5aa 100644
> --- a/include/uapi/linux/netfilter/nfnetlink_queue.h
> +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
> @@ -107,6 +107,7 @@ enum nfqnl_attr_config {
> NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> NFQA_CFG_MASK, /* identify which flags to change */
> NFQA_CFG_FLAGS, /* value of these flags (__u32) */
> + NFQA_CFG_HASH_SIZE, /* __u32 hash table size (rounded to power of 2) */
> __NFQA_CFG_MAX
> };
> #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index 8b7b39d8a109..f076609cac32 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> @@ -46,7 +46,10 @@
> #include <net/netfilter/nf_conntrack.h>
> #endif
>
> -#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_MIN_HASH_SIZE 16
> +#define NFQNL_DEFAULT_HASH_SIZE 1024
> +#define NFQNL_MAX_HASH_SIZE 131072
>
> /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
> * includes the header length. Thus, the maximum packet length that we
> @@ -65,6 +68,7 @@ struct nfqnl_instance {
> unsigned int copy_range;
> unsigned int queue_dropped;
> unsigned int queue_user_dropped;
> + unsigned int queue_hash_size;
>
>
> u_int16_t queue_num; /* number of this queue */
> @@ -77,6 +81,8 @@ struct nfqnl_instance {
> spinlock_t lock ____cacheline_aligned_in_smp;
> unsigned int queue_total;
> unsigned int id_sequence; /* 'sequence' of pkt ids */
> + unsigned int queue_hash_mask;
> + struct hlist_head *queue_hash;
> struct list_head queue_list; /* packets in queue */
> };
>
> @@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
> return net_generic(net, nfnl_queue_net_id);
> }
>
> +static inline unsigned int
> +nfqnl_packet_hash(u32 id, unsigned int mask)
> +{
> + return hash_32(id, 32) & mask;
> +}
> +
(Resent in plaintext for the lists, sorry for duplicates)
I do not think this is an efficient hash function.
queue->id_sequence is monotonically increasing (controlled by the
kernel : __nfqnl_enqueue_packet(), not user space).
I would use return (id & mask) so that we have better use of cpu
caches and hardware prefetchers,
in case a cpu receives a batch of ~64 packets from a busy network device.
Your hash function would require 8x more cache line misses.
> +static inline u32
> +nfqnl_normalize_hash_size(u32 hash_size)
> +{
> + /* Must be power of two for queue_hash_mask to work correctly.
> + * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
> + */
> + BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
> + !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
> + !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
> + NFQNL_MAX_HASH_SIZE > 1U << 31);
> +
> + if (!hash_size)
> + return NFQNL_DEFAULT_HASH_SIZE;
> +
> + /* Clamp to valid range before power of two to avoid overflow */
> + if (hash_size <= NFQNL_MIN_HASH_SIZE)
> + return NFQNL_MIN_HASH_SIZE;
> +
> + if (hash_size >= NFQNL_MAX_HASH_SIZE)
> + return NFQNL_MAX_HASH_SIZE;
> +
> + if (!is_power_of_2(hash_size))
> + hash_size = roundup_pow_of_two(hash_size);
> +
> + return hash_size;
> +}
> +
> static inline u_int8_t instance_hashfn(u_int16_t queue_num)
> {
> return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> @@ -114,13 +153,56 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
> return NULL;
> }
>
> +static int
> +nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
> +{
> + struct hlist_head *new_hash, *old_hash;
> + struct nf_queue_entry *entry;
> + unsigned int h, hash_mask;
> +
> + hash_size = nfqnl_normalize_hash_size(hash_size);
> + if (hash_size == inst->queue_hash_size)
> + return 0;
> +
> + new_hash = kvcalloc(hash_size, sizeof(*new_hash), GFP_KERNEL_ACCOUNT);
> + if (!new_hash)
> + return -ENOMEM;
> +
> + hash_mask = hash_size - 1;
> +
> + for (h = 0; h < hash_size; h++)
> + INIT_HLIST_HEAD(&new_hash[h]);
> +
> + spin_lock_bh(&inst->lock);
> +
> + list_for_each_entry(entry, &inst->queue_list, list) {
> + /* No hlist_del() since old_hash will be freed and we hold lock */
> + h = nfqnl_packet_hash(entry->id, hash_mask);
> + hlist_add_head(&entry->hash_node, &new_hash[h]);
> + }
> +
> + old_hash = inst->queue_hash;
> + inst->queue_hash_size = hash_size;
> + inst->queue_hash_mask = hash_mask;
> + inst->queue_hash = new_hash;
> +
> + spin_unlock_bh(&inst->lock);
> +
> + kvfree(old_hash);
> +
> + return 0;
> +}
> +
> static struct nfqnl_instance *
> -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
> + u32 hash_size)
> {
> struct nfqnl_instance *inst;
> unsigned int h;
> int err;
>
> + hash_size = nfqnl_normalize_hash_size(hash_size);
> +
> spin_lock(&q->instances_lock);
> if (instance_lookup(q, queue_num)) {
> err = -EEXIST;
> @@ -133,11 +215,24 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> goto out_unlock;
> }
>
> + inst->queue_hash = kvcalloc(hash_size, sizeof(*inst->queue_hash),
> + GFP_KERNEL_ACCOUNT);
> + if (!inst->queue_hash) {
> + kfree(inst);
> + err = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + for (h = 0; h < hash_size; h++)
> + INIT_HLIST_HEAD(&inst->queue_hash[h]);
> +
> inst->queue_num = queue_num;
> inst->peer_portid = portid;
> inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> inst->copy_range = NFQNL_MAX_COPY_RANGE;
> inst->copy_mode = NFQNL_COPY_NONE;
> + inst->queue_hash_size = hash_size;
> + inst->queue_hash_mask = hash_size - 1;
> spin_lock_init(&inst->lock);
> INIT_LIST_HEAD(&inst->queue_list);
>
> @@ -154,6 +249,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> return inst;
>
> out_free:
> + kvfree(inst->queue_hash);
> kfree(inst);
> out_unlock:
> spin_unlock(&q->instances_lock);
> @@ -172,6 +268,7 @@ instance_destroy_rcu(struct rcu_head *head)
> rcu_read_lock();
> nfqnl_flush(inst, NULL, 0);
> rcu_read_unlock();
> + kvfree(inst->queue_hash);
> kfree(inst);
> module_put(THIS_MODULE);
> }
> @@ -194,13 +291,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
> static inline void
> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> {
> - list_add_tail(&entry->list, &queue->queue_list);
> - queue->queue_total++;
> + unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
> +
> + hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
> + list_add_tail(&entry->list, &queue->queue_list);
> + queue->queue_total++;
> }
>
> static void
> __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> {
> + hlist_del(&entry->hash_node);
> list_del(&entry->list);
> queue->queue_total--;
> }
> @@ -209,10 +310,11 @@ static struct nf_queue_entry *
> find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> {
> struct nf_queue_entry *entry = NULL, *i;
> + unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
>
> spin_lock_bh(&queue->lock);
>
> - list_for_each_entry(i, &queue->queue_list, list) {
> + hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
> if (i->id == id) {
> entry = i;
> break;
> @@ -407,8 +509,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> spin_lock_bh(&queue->lock);
> list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> if (!cmpfn || cmpfn(entry, data)) {
> - list_del(&entry->list);
> - queue->queue_total--;
> + __dequeue_entry(queue, entry);
> nfqnl_reinject(entry, NF_DROP);
> }
> }
> @@ -1483,6 +1584,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
> [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
> [NFQA_CFG_MASK] = { .type = NLA_U32 },
> [NFQA_CFG_FLAGS] = { .type = NLA_U32 },
> + [NFQA_CFG_HASH_SIZE] = { .type = NLA_U32 },
> };
>
> static const struct nf_queue_handler nfqh = {
> @@ -1495,11 +1597,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> {
> struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
> u_int16_t queue_num = ntohs(info->nfmsg->res_id);
> + u32 hash_size = 0;
> struct nfqnl_msg_config_cmd *cmd = NULL;
> struct nfqnl_instance *queue;
> __u32 flags = 0, mask = 0;
> int ret = 0;
>
> + if (nfqa[NFQA_CFG_HASH_SIZE])
> + hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
> +
> if (nfqa[NFQA_CFG_CMD]) {
> cmd = nla_data(nfqa[NFQA_CFG_CMD]);
>
> @@ -1559,11 +1665,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> goto err_out_unlock;
> }
> queue = instance_create(q, queue_num,
> - NETLINK_CB(skb).portid);
> + NETLINK_CB(skb).portid, hash_size);
> if (IS_ERR(queue)) {
> ret = PTR_ERR(queue);
> goto err_out_unlock;
> }
> + hash_size = 0; /* avoid resize later in this function */
> break;
> case NFQNL_CFG_CMD_UNBIND:
> if (!queue) {
> @@ -1586,6 +1693,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> goto err_out_unlock;
> }
>
> + if (hash_size > 0) {
> + ret = nfqnl_hash_resize(queue, hash_size);
> + if (ret)
> + goto err_out_unlock;
> + }
> +
> if (nfqa[NFQA_CFG_PARAMS]) {
> struct nfqnl_msg_config_params *params =
> nla_data(nfqa[NFQA_CFG_PARAMS]);
> --
> 2.39.5 (Apple Git-154)
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 9:26 [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table Scott Mitchell
2025-11-13 10:25 ` Eric Dumazet
@ 2025-11-13 12:19 ` Florian Westphal
2025-11-13 15:32 ` Scott Mitchell
2025-11-13 13:50 ` [syzbot ci] " syzbot ci
2 siblings, 1 reply; 8+ messages in thread
From: Florian Westphal @ 2025-11-13 12:19 UTC (permalink / raw)
To: Scott Mitchell
Cc: pablo, kadlec, phil, davem, edumazet, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>
Didn't notice this before, these two should match:
scripts/checkpatch.pl netfilter-nfnetlink_queue-optimize-verdict-lookup-wi.patch
WARNING: From:/Signed-off-by: email address mismatch: 'From: Scott Mitchell <scott.k.mitch1@gmail.com>' != 'Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>'
^ permalink raw reply [flat|nested] 8+ messages in thread
* [syzbot ci] Re: netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 9:26 [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table Scott Mitchell
2025-11-13 10:25 ` Eric Dumazet
2025-11-13 12:19 ` Florian Westphal
@ 2025-11-13 13:50 ` syzbot ci
2 siblings, 0 replies; 8+ messages in thread
From: syzbot ci @ 2025-11-13 13:50 UTC (permalink / raw)
To: coreteam, davem, edumazet, fw, horms, kadlec, kuba, linux-kernel,
netdev, netfilter-devel, pabeni, pablo, phil, scott.k.mitch1,
scott_mitchell
Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
https://lore.kernel.org/all/20251113092606.91406-1-scott_mitchell@apple.com
* [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
and found the following issue:
BUG: sleeping function called from invalid context in instance_create
Full report is available here:
https://ci.syzbot.org/series/001a6a6c-7e1b-46e8-995d-5b6d650af320
***
BUG: sleeping function called from invalid context in instance_create
tree: nf-next
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/netfilter/nf-next.git
base: 0d0eb186421d0886ac466008235f6d9eedaf918e
arch: amd64
compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config: https://ci.syzbot.org/builds/69a4254b-f4aa-45a7-a4bd-2d23940887f3/config
C repro: https://ci.syzbot.org/findings/8074062c-0aee-4734-a3d1-587b80676bf1/c_repro
syz repro: https://ci.syzbot.org/findings/8074062c-0aee-4734-a3d1-587b80676bf1/syz_repro
netlink: 'syz.0.17': attribute type 6 has an invalid length.
BUG: sleeping function called from invalid context at ./include/linux/sched/mm.h:321
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 5968, name: syz.0.17
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
3 locks held by syz.0.17/5968:
#0: ffffffff99cc1c30 (nfnl_subsys_queue){+.+.}-{4:4}, at: nfnl_lock net/netfilter/nfnetlink.c:98 [inline]
#0: ffffffff99cc1c30 (nfnl_subsys_queue){+.+.}-{4:4}, at: nfnetlink_rcv_msg+0x9dc/0x1130 net/netfilter/nfnetlink.c:295
#1: ffffffff8df3d2e0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:331 [inline]
#1: ffffffff8df3d2e0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:867 [inline]
#1: ffffffff8df3d2e0 (rcu_read_lock){....}-{1:3}, at: nfqnl_recv_config+0x222/0xf90 net/netfilter/nfnetlink_queue.c:1653
#2: ffff888112297d18 (&q->instances_lock){+.+.}-{3:3}, at: spin_lock include/linux/spinlock.h:351 [inline]
#2: ffff888112297d18 (&q->instances_lock){+.+.}-{3:3}, at: instance_create+0x121/0x740 net/netfilter/nfnetlink_queue.c:206
Preemption disabled at:
[<0000000000000000>] 0x0
CPU: 1 UID: 0 PID: 5968 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
__might_resched+0x495/0x610 kernel/sched/core.c:8927
might_alloc include/linux/sched/mm.h:321 [inline]
slab_pre_alloc_hook mm/slub.c:4913 [inline]
slab_alloc_node mm/slub.c:5248 [inline]
__do_kmalloc_node mm/slub.c:5633 [inline]
__kvmalloc_node_noprof+0x149/0x910 mm/slub.c:7089
kvmalloc_array_node_noprof include/linux/slab.h:1122 [inline]
instance_create+0x203/0x740 net/netfilter/nfnetlink_queue.c:218
nfqnl_recv_config+0x660/0xf90 net/netfilter/nfnetlink_queue.c:1667
nfnetlink_rcv_msg+0xb4d/0x1130 net/netfilter/nfnetlink.c:302
netlink_rcv_skb+0x208/0x470 net/netlink/af_netlink.c:2550
nfnetlink_rcv+0x282/0x2590 net/netfilter/nfnetlink.c:669
netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline]
netlink_unicast+0x82f/0x9e0 net/netlink/af_netlink.c:1344
netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1894
sock_sendmsg_nosec net/socket.c:727 [inline]
__sock_sendmsg+0x21c/0x270 net/socket.c:742
____sys_sendmsg+0x505/0x830 net/socket.c:2630
___sys_sendmsg+0x21f/0x2a0 net/socket.c:2684
__sys_sendmsg net/socket.c:2716 [inline]
__do_sys_sendmsg net/socket.c:2721 [inline]
__se_sys_sendmsg net/socket.c:2719 [inline]
__x64_sys_sendmsg+0x19b/0x260 net/socket.c:2719
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f7a65f8f6c9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffde4361588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007f7a661e5fa0 RCX: 00007f7a65f8f6c9
RDX: 0000000000008010 RSI: 00002000000000c0 RDI: 0000000000000003
RBP: 00007f7a66011f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f7a661e5fa0 R14: 00007f7a661e5fa0 R15: 0000000000000003
</TASK>
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 10:25 ` Eric Dumazet
@ 2025-11-13 15:30 ` Scott Mitchell
2025-11-13 19:40 ` David Laight
1 sibling, 0 replies; 8+ messages in thread
From: Scott Mitchell @ 2025-11-13 15:30 UTC (permalink / raw)
To: Eric Dumazet
Cc: pablo, kadlec, fw, phil, davem, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
On Thu, Nov 13, 2025 at 2:25 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> >
> > The current implementation uses a linear list to find queued packets by
> > ID when processing verdicts from userspace. With large queue depths and
> > out-of-order verdicting, this O(n) lookup becomes a significant
> > bottleneck, causing userspace verdict processing to dominate CPU time.
> >
> > Replace the linear search with a hash table for O(1) average-case
> > packet lookup by ID. The hash table size is configurable via the new
> > NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
> > NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
> > two to enable efficient bitwise masking instead of modulo operations.
> > Unpatched kernels silently ignore the new attribute, maintaining
> > backward compatibility.
> >
> > The existing list data structure is retained for operations requiring
> > linear iteration (e.g. flush, device down events). Hot fields
> > (queue_hash_mask, queue_hash pointer) are placed in the same cache line
> > as the spinlock and packet counters for optimal memory access patterns.
> >
> > Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>
> > ---
> > Changes in v2:
> > - Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
> > tables with vmalloc fallback (Florian Westphal)
> > - Remove incorrect comment about concurrent resizes - nfnetlink subsystem
> > mutex already serializes config operations (Florian Westphal)
> > - Fix style: remove unnecessary braces around single-line if (Florian Westphal)
> >
> > include/net/netfilter/nf_queue.h | 1 +
> > .../uapi/linux/netfilter/nfnetlink_queue.h | 1 +
> > net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++--
> > 3 files changed, 123 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> > index 4aeffddb7586..3d0def310523 100644
> > --- a/include/net/netfilter/nf_queue.h
> > +++ b/include/net/netfilter/nf_queue.h
> > @@ -11,6 +11,7 @@
> > /* Each queued (to userspace) skbuff has one of these. */
> > struct nf_queue_entry {
> > struct list_head list;
> > + struct hlist_node hash_node;
> > struct sk_buff *skb;
> > unsigned int id;
> > unsigned int hook_index; /* index in hook_entries->hook[] */
> > diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > index efcb7c044a74..bc296a17e5aa 100644
> > --- a/include/uapi/linux/netfilter/nfnetlink_queue.h
> > +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > @@ -107,6 +107,7 @@ enum nfqnl_attr_config {
> > NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> > NFQA_CFG_MASK, /* identify which flags to change */
> > NFQA_CFG_FLAGS, /* value of these flags (__u32) */
> > + NFQA_CFG_HASH_SIZE, /* __u32 hash table size (rounded to power of 2) */
> > __NFQA_CFG_MAX
> > };
> > #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> > index 8b7b39d8a109..f076609cac32 100644
> > --- a/net/netfilter/nfnetlink_queue.c
> > +++ b/net/netfilter/nfnetlink_queue.c
> > @@ -46,7 +46,10 @@
> > #include <net/netfilter/nf_conntrack.h>
> > #endif
> >
> > -#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_MIN_HASH_SIZE 16
> > +#define NFQNL_DEFAULT_HASH_SIZE 1024
> > +#define NFQNL_MAX_HASH_SIZE 131072
> >
> > /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
> > * includes the header length. Thus, the maximum packet length that we
> > @@ -65,6 +68,7 @@ struct nfqnl_instance {
> > unsigned int copy_range;
> > unsigned int queue_dropped;
> > unsigned int queue_user_dropped;
> > + unsigned int queue_hash_size;
> >
> >
> > u_int16_t queue_num; /* number of this queue */
> > @@ -77,6 +81,8 @@ struct nfqnl_instance {
> > spinlock_t lock ____cacheline_aligned_in_smp;
> > unsigned int queue_total;
> > unsigned int id_sequence; /* 'sequence' of pkt ids */
> > + unsigned int queue_hash_mask;
> > + struct hlist_head *queue_hash;
> > struct list_head queue_list; /* packets in queue */
> > };
> >
> > @@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
> > return net_generic(net, nfnl_queue_net_id);
> > }
> >
> > +static inline unsigned int
> > +nfqnl_packet_hash(u32 id, unsigned int mask)
> > +{
> > + return hash_32(id, 32) & mask;
> > +}
> > +
>
> (Resent in plaintext for the lists, sorry for duplicates)
>
> I do not think this is an efficient hash function.
>
> queue->id_sequence is monotonically increasing (controlled by the
> kernel : __nfqnl_enqueue_packet(), not user space).
>
> I would use return (id & mask) so that we have better use of cpu
> caches and hardware prefetchers,
> in case a cpu receives a batch of ~64 packets from a busy network device.
>
> Your hash function would require 8x more cache line misses.
>
Nice improvement, done!
>
> > +static inline u32
> > +nfqnl_normalize_hash_size(u32 hash_size)
> > +{
> > + /* Must be power of two for queue_hash_mask to work correctly.
> > + * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
> > + */
> > + BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
> > + NFQNL_MAX_HASH_SIZE > 1U << 31);
> > +
> > + if (!hash_size)
> > + return NFQNL_DEFAULT_HASH_SIZE;
> > +
> > + /* Clamp to valid range before power of two to avoid overflow */
> > + if (hash_size <= NFQNL_MIN_HASH_SIZE)
> > + return NFQNL_MIN_HASH_SIZE;
> > +
> > + if (hash_size >= NFQNL_MAX_HASH_SIZE)
> > + return NFQNL_MAX_HASH_SIZE;
> > +
> > + if (!is_power_of_2(hash_size))
> > + hash_size = roundup_pow_of_two(hash_size);
> > +
> > + return hash_size;
> > +}
> > +
> > static inline u_int8_t instance_hashfn(u_int16_t queue_num)
> > {
> > return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> > @@ -114,13 +153,56 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
> > return NULL;
> > }
> >
> > +static int
> > +nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
> > +{
> > + struct hlist_head *new_hash, *old_hash;
> > + struct nf_queue_entry *entry;
> > + unsigned int h, hash_mask;
> > +
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > + if (hash_size == inst->queue_hash_size)
> > + return 0;
> > +
> > + new_hash = kvcalloc(hash_size, sizeof(*new_hash), GFP_KERNEL_ACCOUNT);
> > + if (!new_hash)
> > + return -ENOMEM;
> > +
> > + hash_mask = hash_size - 1;
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&new_hash[h]);
> > +
> > + spin_lock_bh(&inst->lock);
> > +
> > + list_for_each_entry(entry, &inst->queue_list, list) {
> > + /* No hlist_del() since old_hash will be freed and we hold lock */
> > + h = nfqnl_packet_hash(entry->id, hash_mask);
> > + hlist_add_head(&entry->hash_node, &new_hash[h]);
> > + }
> > +
> > + old_hash = inst->queue_hash;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_mask;
> > + inst->queue_hash = new_hash;
> > +
> > + spin_unlock_bh(&inst->lock);
> > +
> > + kvfree(old_hash);
> > +
> > + return 0;
> > +}
> > +
> > static struct nfqnl_instance *
> > -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
> > + u32 hash_size)
> > {
> > struct nfqnl_instance *inst;
> > unsigned int h;
> > int err;
> >
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > +
> > spin_lock(&q->instances_lock);
> > if (instance_lookup(q, queue_num)) {
> > err = -EEXIST;
> > @@ -133,11 +215,24 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > goto out_unlock;
> > }
> >
> > + inst->queue_hash = kvcalloc(hash_size, sizeof(*inst->queue_hash),
> > + GFP_KERNEL_ACCOUNT);
> > + if (!inst->queue_hash) {
> > + kfree(inst);
> > + err = -ENOMEM;
> > + goto out_unlock;
> > + }
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&inst->queue_hash[h]);
> > +
> > inst->queue_num = queue_num;
> > inst->peer_portid = portid;
> > inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> > inst->copy_range = NFQNL_MAX_COPY_RANGE;
> > inst->copy_mode = NFQNL_COPY_NONE;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_size - 1;
> > spin_lock_init(&inst->lock);
> > INIT_LIST_HEAD(&inst->queue_list);
> >
> > @@ -154,6 +249,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > return inst;
> >
> > out_free:
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > out_unlock:
> > spin_unlock(&q->instances_lock);
> > @@ -172,6 +268,7 @@ instance_destroy_rcu(struct rcu_head *head)
> > rcu_read_lock();
> > nfqnl_flush(inst, NULL, 0);
> > rcu_read_unlock();
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > module_put(THIS_MODULE);
> > }
> > @@ -194,13 +291,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
> > static inline void
> > __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > - list_add_tail(&entry->list, &queue->queue_list);
> > - queue->queue_total++;
> > + unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
> > +
> > + hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
> > + list_add_tail(&entry->list, &queue->queue_list);
> > + queue->queue_total++;
> > }
> >
> > static void
> > __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > + hlist_del(&entry->hash_node);
> > list_del(&entry->list);
> > queue->queue_total--;
> > }
> > @@ -209,10 +310,11 @@ static struct nf_queue_entry *
> > find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> > {
> > struct nf_queue_entry *entry = NULL, *i;
> > + unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
> >
> > spin_lock_bh(&queue->lock);
> >
> > - list_for_each_entry(i, &queue->queue_list, list) {
> > + hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
> > if (i->id == id) {
> > entry = i;
> > break;
> > @@ -407,8 +509,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> > spin_lock_bh(&queue->lock);
> > list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> > if (!cmpfn || cmpfn(entry, data)) {
> > - list_del(&entry->list);
> > - queue->queue_total--;
> > + __dequeue_entry(queue, entry);
> > nfqnl_reinject(entry, NF_DROP);
> > }
> > }
> > @@ -1483,6 +1584,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
> > [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
> > [NFQA_CFG_MASK] = { .type = NLA_U32 },
> > [NFQA_CFG_FLAGS] = { .type = NLA_U32 },
> > + [NFQA_CFG_HASH_SIZE] = { .type = NLA_U32 },
> > };
> >
> > static const struct nf_queue_handler nfqh = {
> > @@ -1495,11 +1597,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > {
> > struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
> > u_int16_t queue_num = ntohs(info->nfmsg->res_id);
> > + u32 hash_size = 0;
> > struct nfqnl_msg_config_cmd *cmd = NULL;
> > struct nfqnl_instance *queue;
> > __u32 flags = 0, mask = 0;
> > int ret = 0;
> >
> > + if (nfqa[NFQA_CFG_HASH_SIZE])
> > + hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
> > +
> > if (nfqa[NFQA_CFG_CMD]) {
> > cmd = nla_data(nfqa[NFQA_CFG_CMD]);
> >
> > @@ -1559,11 +1665,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> > queue = instance_create(q, queue_num,
> > - NETLINK_CB(skb).portid);
> > + NETLINK_CB(skb).portid, hash_size);
> > if (IS_ERR(queue)) {
> > ret = PTR_ERR(queue);
> > goto err_out_unlock;
> > }
> > + hash_size = 0; /* avoid resize later in this function */
> > break;
> > case NFQNL_CFG_CMD_UNBIND:
> > if (!queue) {
> > @@ -1586,6 +1693,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> >
> > + if (hash_size > 0) {
> > + ret = nfqnl_hash_resize(queue, hash_size);
> > + if (ret)
> > + goto err_out_unlock;
> > + }
> > +
> > if (nfqa[NFQA_CFG_PARAMS]) {
> > struct nfqnl_msg_config_params *params =
> > nla_data(nfqa[NFQA_CFG_PARAMS]);
> > --
> > 2.39.5 (Apple Git-154)
> >
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 12:19 ` Florian Westphal
@ 2025-11-13 15:32 ` Scott Mitchell
0 siblings, 0 replies; 8+ messages in thread
From: Scott Mitchell @ 2025-11-13 15:32 UTC (permalink / raw)
To: Florian Westphal
Cc: pablo, kadlec, phil, davem, edumazet, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
On Thu, Nov 13, 2025 at 4:19 AM Florian Westphal <fw@strlen.de> wrote:
>
> Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> > Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>
>
> Didn't notice this before, these two should match:
>
> scripts/checkpatch.pl netfilter-nfnetlink_queue-optimize-verdict-lookup-wi.patch
> WARNING: From:/Signed-off-by: email address mismatch: 'From: Scott Mitchell <scott.k.mitch1@gmail.com>' != 'Signed-off-by: Scott Mitchell <scott_mitchell@apple.com>'
Good catch, will fix in v3 (coming shortly).
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 10:25 ` Eric Dumazet
2025-11-13 15:30 ` Scott Mitchell
@ 2025-11-13 19:40 ` David Laight
2025-11-13 20:56 ` Scott Mitchell
1 sibling, 1 reply; 8+ messages in thread
From: David Laight @ 2025-11-13 19:40 UTC (permalink / raw)
To: Eric Dumazet
Cc: Scott Mitchell, pablo, kadlec, fw, phil, davem, kuba, pabeni,
horms, netfilter-devel, coreteam, netdev, linux-kernel,
Scott Mitchell
On Thu, 13 Nov 2025 02:25:24 -0800
Eric Dumazet <edumazet@google.com> wrote:
> On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
....
> I do not think this is an efficient hash function.
>
> queue->id_sequence is monotonically increasing (controlled by the
> kernel : __nfqnl_enqueue_packet(), not user space).
If id_sequence is allocated by the kernel, is there any requirement
that the values be sequential rather than just unique?
If they don't need to be sequential then the kernel can pick an 'id' value
such that 'id & mask' is unique for all 'live' id values.
Then the hash becomes 'perfect' and degenerates into a simple array lookup.
Just needs a bit of housekeeping...
David
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2025-11-13 19:40 ` David Laight
@ 2025-11-13 20:56 ` Scott Mitchell
0 siblings, 0 replies; 8+ messages in thread
From: Scott Mitchell @ 2025-11-13 20:56 UTC (permalink / raw)
To: David Laight
Cc: Eric Dumazet, pablo, kadlec, fw, phil, davem, kuba, pabeni, horms,
netfilter-devel, coreteam, netdev, linux-kernel, Scott Mitchell
On Thu, Nov 13, 2025 at 11:40 AM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Thu, 13 Nov 2025 02:25:24 -0800
> Eric Dumazet <edumazet@google.com> wrote:
>
> > On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> ....
> > I do not think this is an efficient hash function.
> >
> > queue->id_sequence is monotonically increasing (controlled by the
> > kernel : __nfqnl_enqueue_packet(), not user space).
>
> If id_sequence is allocated by the kernel, is there any requirement
> that the values be sequential rather than just unique?
I will defer to the maintainers for the authoritative answer, but the
NFQNL_MSG_VERDICT_BATCH API semantics rely on sequential IDs
(nfqnl_recv_verdict_batch applies the same verdict to all IDs <= max id).
New options could be added to opt in to a different ID generation
behavior (e.g. the user acknowledging that NFQNL_MSG_VERDICT_BATCH isn't
used), but it is not clear this would always be beneficial, since
"unique for all packets" depends on the size of the map relative to the
number of un-verdicted packets. Packets can be verdicted out of order,
which would require additional tracking/searching to find the "next unused ID".
>
> If they don't need to be sequential then the kernel can pick an 'id' value
> such that 'id & mask' is unique for all 'live' id values.
> Then the hash becomes 'perfect' and degenerates into a simple array lookup.
> Just needs a bit of housekeeping...
>
> David
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-11-13 20:56 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-13 9:26 [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup with hash table Scott Mitchell
2025-11-13 10:25 ` Eric Dumazet
2025-11-13 15:30 ` Scott Mitchell
2025-11-13 19:40 ` David Laight
2025-11-13 20:56 ` Scott Mitchell
2025-11-13 12:19 ` Florian Westphal
2025-11-13 15:32 ` Scott Mitchell
2025-11-13 13:50 ` [syzbot ci] " syzbot ci
This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).