From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, fw@strlen.de,
horms@kernel.org, steffen.klassert@secunet.com,
antony.antony@secunet.com
Subject: [PATCH net-next,RFC 8/8] net: add dev_queue_xmit_list() and use it
Date: Tue, 17 Mar 2026 12:29:17 +0100 [thread overview]
Message-ID: <20260317112917.4170466-9-pablo@netfilter.org> (raw)
In-Reply-To: <20260317112917.4170466-1-pablo@netfilter.org>
Add listified skb tx path and use it to implement the flowtable TX
datapath. Use the dev_dst_drop() and dev_noqueue_xmit_list() helper
functions to build dev_queue_xmit_list().
100dfa74cad9 ("net: dev_queue_xmit() llist adoption") requires reversing
the skb list and then splicing this list to the last pending
skbuff for transmission.
A few notes:
- I removed:
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
return -1;
This is only possible if skb->sk is set; if my assumption is not correct, this
can be checked from the flowtable path.
Reducing the size of dev_queue_xmit_list() is convenient, to focus only
on speeding up what can really be sped up, so let's return -1 if
either:
- qdisc is not empty
OR
- qdisc is not work-conserving (TCQ_F_CAN_BYPASS is not set)
Then, the flowtable falls back to call dev_queue_xmit() for each single
skbuff.
Co-developed-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netdevice.h | 2 +
net/core/dev.c | 157 +++++++++++++++++++++++++++++++
net/netfilter/nf_flow_table_ip.c | 18 ++--
3 files changed, 169 insertions(+), 8 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c0174aa1037f..34747e9b85d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3401,6 +3401,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
return ret;
}
+int dev_queue_xmit_list(struct sk_buff *skb);
+
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8f5bef5a715c..8f114f5af537 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4920,6 +4920,163 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
}
EXPORT_SYMBOL(__dev_direct_xmit);
+static int dev_queue_xmit_skb_list(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
+ spinlock_t *root_lock = qdisc_lock(q);
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
+ int rc = -1;
+
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+ qdisc_run_begin(q)) {
+ /* Retest nolock_qdisc_is_empty() within the protection
+ * of q->seqlock to protect from racing with requeuing.
+ */
+ if (unlikely(!nolock_qdisc_is_empty(q))) {
+ to_free2 = qdisc_run_end(q);
+ goto free_skbs;
+ }
+
+ if (sch_direct_xmit(skb, q, dev, txq, NULL, false) &&
+ !nolock_qdisc_is_empty(q))
+ __qdisc_run(q);
+
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
+ }
+ }
+
+ /* Transform skb list to llist in reverse order to splice this batch
+ * into the defer_list. The next field of skb chain and llist use the
+ * memory layout.
+ */
+ ll_list = llist_reverse_order(&skb->ll_node);
+
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
+ */
+ first_n = READ_ONCE(q->defer_list.first);
+ do {
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > READ_ONCE(net_hotdata.qdisc_max_burst))) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_BURST_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ /* Splice using last skb in the reverse list. */
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, ll_list));
+
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
+ */
+ if (first_n)
+ return NET_XMIT_SUCCESS;
+
+ spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because we clear defer_count not atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over qdisc_max_burst.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ goto unlock;
+ }
+
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
+ /*
+ * This is a work-conserving queue; there are no old skbs
+ * waiting to be sent out; and the qdisc is not running -
+ * xmit the skb directly.
+ */
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
+ qdisc_bstats_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
+ __qdisc_run(q);
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ }
+unlock:
+ spin_unlock(root_lock);
+
+free_skbs:
+ tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
+ return rc;
+}
+
+int dev_queue_xmit_list(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ struct sk_buff *iter;
+ struct Qdisc *q;
+ int rc;
+
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
+ */
+ rcu_read_lock_bh();
+
+ /* Intentionally, no egress hooks here. This is called from the ingress
+ * path, which should have already classified packets before calling
+ * this function.
+ */
+
+ txq = netdev_tx_queue_mapping(dev, skb);
+ if (!txq)
+ txq = netdev_core_pick_tx(dev, skb, NULL);
+
+ q = rcu_dereference_bh(txq->qdisc);
+
+ iter = skb;
+ while (iter) {
+ dev_dst_drop(dev, iter);
+ skb_copy_queue_mapping(iter, skb);
+ iter = iter->next;
+ }
+
+ if (q->enqueue) {
+ rc = dev_queue_xmit_skb_list(skb, q, dev, txq);
+ goto out;
+ }
+
+ rc = dev_noqueue_xmit_list(skb, dev, txq);
+ rcu_read_unlock_bh();
+
+ if (rc < 0) {
+ dev_core_stats_tx_dropped_inc(dev);
+ kfree_skb_list(skb);
+ }
+ return rc;
+out:
+ rcu_read_unlock_bh();
+ return rc;
+}
+EXPORT_SYMBOL(dev_queue_xmit_list);
+
/*************************************************************************
* Receiver routines
*************************************************************************/
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 98b5d5e022c8..3d2d02be0f0d 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -863,14 +863,16 @@ static void nf_flow_neigh_xmit_list(struct sk_buff *skb, struct net_device *outd
iter = iter->next;
}
- iter = skb;
- while (iter) {
- struct sk_buff *next;
-
- next = iter->next;
- iter->next = NULL;
- dev_queue_xmit(iter);
- iter = next;
+ if (dev_queue_xmit_list(skb) == -1) {
+ iter = skb;
+ while (iter) {
+ struct sk_buff *next;
+
+ next = iter->next;
+ iter->next = NULL;
+ dev_queue_xmit(iter);
+ iter = next;
+ }
}
}
--
2.47.3
next prev parent reply other threads:[~2026-03-17 11:29 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-17 11:29 [PATCH net-next,RFC 0/8] netfilter: flowtable bulking Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 1/8] netfilter: flowtable: Add basic bulking infrastructure for early ingress hook Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 2/8] netfilter: flowtable: Add IPv6 " Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 3/8] netfilter: nf_tables: add flowtable early_ingress support Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 4/8] netfilter: nf_tables: add nft_set_pktinfo_ingress() Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 5/8] netfilter: nf_tables: add early ingress chain Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 6/8] net: add dev_dst_drop() helper function Pablo Neira Ayuso
2026-03-17 11:29 ` [PATCH net-next,RFC 7/8] net: add dev_noqueue_xmit_list() " Pablo Neira Ayuso
2026-03-17 11:29 ` Pablo Neira Ayuso [this message]
2026-03-17 11:39 ` [PATCH net-next,RFC 0/8] netfilter: flowtable bulking Pablo Neira Ayuso
2026-03-19 6:15 ` Qingfang Deng
2026-03-19 11:28 ` Steffen Klassert
2026-03-19 12:18 ` Felix Fietkau
2026-03-20 6:49 ` Steffen Klassert
2026-03-20 8:50 ` Felix Fietkau
2026-03-20 9:00 ` Steffen Klassert
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260317112917.4170466-9-pablo@netfilter.org \
--to=pablo@netfilter.org \
--cc=antony.antony@secunet.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=steffen.klassert@secunet.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox