From: hawk@kernel.org
To: netdev@vger.kernel.org
Cc: Simon Schippers <simon.schippers@tu-dortmund.de>,
Jesper Dangaard Brouer <hawk@kernel.org>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
John Fastabend <john.fastabend@gmail.com>,
Stanislav Fomichev <sdf@fomichev.me>,
linux-kernel@vger.kernel.org, bpf@vger.kernel.org
Subject: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
Date: Wed, 27 May 2026 15:54:16 +0200 [thread overview]
Message-ID: <20260527135418.1166665-6-hawk@kernel.org> (raw)
In-Reply-To: <20260527135418.1166665-1-hawk@kernel.org>
From: Simon Schippers <simon.schippers@tu-dortmund.de>
Per-packet BQL completion forces DQL to converge on limit=2, causing
excessive NAPI scheduling overhead and qdisc requeues.
Accumulate BQL completions and flush them when a configurable time
threshold is exceeded, letting DQL discover a limit that bounds actual
queuing delay to the configured interval. Coalescing state persists
across NAPI polls in struct veth_rq so completions can accumulate
beyond a single budget=64 cycle.
Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
setting tx-usecs to 0 disables coalescing and falls back to per-packet
completion.
ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
drivers/net/veth.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 98 insertions(+), 2 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index d5675d9d5236..743d17b37223 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -28,6 +28,7 @@
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <linux/skbuff_ref.h>
+#include <linux/sched/clock.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
@@ -44,6 +45,8 @@
#define VETH_XDP_TX_BULK_SIZE 16
#define VETH_XDP_BATCH 16
+#define VETH_BQL_COAL_TX_USECS 100 /* default tx-usecs for BQL batching */
+
struct veth_stats {
u64 rx_drops;
/* xdp */
@@ -62,6 +65,11 @@ struct veth_rq_stats {
struct u64_stats_sync syncp;
};
+/* Accumulator for coalesced BQL completions. One instance is embedded in
+ * each struct veth_rq, so the pending count and the window start carry
+ * over from one NAPI poll to the next instead of being reset per poll.
+ */
+struct veth_bql_state {
+ u64 time; /* sched_clock() when current coalescing window started */
+ int n_bql; /* BQL completions batched in the current window */
+};
+
struct veth_rq {
struct napi_struct xdp_napi;
struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
@@ -69,6 +77,7 @@ struct veth_rq {
struct bpf_prog __rcu *xdp_prog;
struct xdp_mem_info xdp_mem;
struct veth_rq_stats stats;
+ struct veth_bql_state bql_state;
bool rx_notify_masked;
struct ptr_ring xdp_ring;
struct xdp_rxq_info xdp_rxq;
@@ -81,6 +90,7 @@ struct veth_priv {
struct bpf_prog *_xdp_prog;
struct veth_rq *rq;
unsigned int requested_headroom;
+ unsigned int tx_coal_usecs; /* BQL completion coalescing */
};
struct veth_xdp_tx_bq {
@@ -265,7 +275,30 @@ static void veth_get_channels(struct net_device *dev,
static int veth_set_channels(struct net_device *dev,
struct ethtool_channels *ch);
+/* ethtool get_coalesce handler: report the configured BQL completion
+ * coalescing interval as tx-usecs.
+ *
+ * Only ec->tx_coalesce_usecs is filled in; kernel_coal and extack are not
+ * used. Always returns 0.
+ */
+static int veth_get_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct veth_priv *priv = netdev_priv(dev);
+
+ ec->tx_coalesce_usecs = priv->tx_coal_usecs;
+ return 0;
+}
+
+/* ethtool set_coalesce handler: store the requested tx-usecs as the BQL
+ * completion coalescing interval. A value of 0 makes the receive path
+ * complete every packet individually instead of batching.
+ *
+ * Only ec->tx_coalesce_usecs is consumed; kernel_coal and extack are not
+ * used. Always returns 0.
+ */
+static int veth_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct veth_priv *priv = netdev_priv(dev);
+
+ /* veth_xdp_rcv() reads this field in the NAPI poll path and no lock
+ * is visible there, so mark the store to keep it a single untorn
+ * write that the compiler cannot split or repeat.
+ * NOTE(review): the reader should use a matching READ_ONCE().
+ */
+ WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+ return 0;
+}
+
static const struct ethtool_ops veth_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_strings = veth_get_strings,
@@ -275,6 +308,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
.get_channels = veth_get_channels,
.set_channels = veth_set_channels,
+ .get_coalesce = veth_get_coalesce,
+ .set_coalesce = veth_set_coalesce,
};
/* general routines */
@@ -937,13 +972,45 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
return NULL;
}
+/* Report every completion batched in @state to @peer_txq with a single
+ * netdev_tx_completed_queue() call (n_bql packets, n_bql * VETH_BQL_UNIT
+ * bytes), then open a new coalescing window by clearing the count and
+ * restamping the window start with the current sched_clock().
+ *
+ * The callers visible in this patch only invoke it when n_bql is non-zero.
+ */
+static void veth_bql_complete(struct veth_bql_state *state,
+ struct netdev_queue *peer_txq)
+{
+ netdev_tx_completed_queue(peer_txq, state->n_bql,
+ state->n_bql * VETH_BQL_UNIT);
+ state->n_bql = 0;
+ state->time = sched_clock();
+}
+
+/* Flush the batched completions only when at least one is pending and the
+ * current window is at least @coalescing_ns nanoseconds old according to
+ * sched_clock(); otherwise leave @state untouched.
+ *
+ * The comparison relies on state->time not being ahead of the local
+ * sched_clock(); veth_xdp_rcv() clamps it before calling in.
+ */
+static void veth_bql_maybe_complete(struct veth_bql_state *state,
+ struct netdev_queue *peer_txq,
+ u64 coalescing_ns)
+{
+ if (state->n_bql && sched_clock() >= state->time + coalescing_ns)
+ veth_bql_complete(state, peer_txq);
+}
+
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats,
struct netdev_queue *peer_txq)
{
+ struct veth_bql_state *state = &rq->bql_state;
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
+ struct veth_priv *priv;
+ u64 bql_flush_ns;
+
+ priv = netdev_priv(rq->dev);
+ bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
+
+ /* Clamp the stored timestamp in case we migrated to a CPU whose
+ * sched_clock() lags behind; this prevents the deadline from never firing.
+ */
+ state->time = min(state->time, sched_clock());
+
+ /* Flush completions that timed out since the previous NAPI poll. */
+ if (peer_txq && bql_flush_ns)
+ veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
for (i = 0; i < budget; i++) {
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
@@ -972,8 +1039,16 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct sk_buff *skb = veth_ptr_to_skb(ptr);
stats->xdp_bytes += skb->len;
- if (peer_txq && bql_charged)
- netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
+ if (peer_txq && bql_charged) {
+ if (!bql_flush_ns) {
+ netdev_tx_completed_queue(peer_txq, 1,
+ VETH_BQL_UNIT);
+ } else {
+ state->n_bql++;
+ veth_bql_maybe_complete(state, peer_txq,
+ bql_flush_ns);
+ }
+ }
skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
if (skb) {
@@ -989,6 +1064,18 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
if (n_xdpf)
veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
+ /* If the ring is now empty and the peer TX queue is stalled by DQL
+ * backpressure, release completions immediately to unblock it.
+ */
+ if (peer_txq && state->n_bql && __ptr_ring_empty(&rq->xdp_ring)) {
+ /* Pairs with smp_wmb() in __ptr_ring_produce(); ensure ring
+ * emptiness is observed before reading peer_txq->state.
+ */
+ smp_rmb();
+ if (test_bit(__QUEUE_STATE_STACK_XOFF, &peer_txq->state))
+ veth_bql_complete(state, peer_txq);
+ }
+
u64_stats_update_begin(&rq->stats.syncp);
rq->stats.vs.xdp_redirect += stats->xdp_redirect;
rq->stats.vs.xdp_bytes += stats->xdp_bytes;
@@ -1093,6 +1180,9 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
napi_enable(&rq->xdp_napi);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
+
+ rq->bql_state.time = sched_clock();
+ rq->bql_state.n_bql = 0;
}
return 0;
@@ -1134,6 +1224,8 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
struct veth_rq *rq = &priv->rq[i];
rq->rx_notify_masked = false;
+ rq->bql_state.n_bql = 0;
+ rq->bql_state.time = 0;
ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
}
@@ -1813,6 +1905,8 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
static void veth_setup(struct net_device *dev)
{
+ struct veth_priv *priv = netdev_priv(dev);
+
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
@@ -1838,6 +1932,8 @@ static void veth_setup(struct net_device *dev)
dev->max_mtu = ETH_MAX_MTU;
dev->watchdog_timeo = msecs_to_jiffies(16000);
+ priv->tx_coal_usecs = VETH_BQL_COAL_TX_USECS;
+
dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES;
dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
--
2.43.0
prev parent reply other threads:[~2026-05-27 13:54 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-27 13:54 [PATCH net-next v6 0/5] veth: add Byte Queue Limits (BQL) support hawk
2026-05-27 13:54 ` [PATCH net-next v6 1/5] net: add dev->bql flag to allow BQL sysfs for IFF_NO_QUEUE devices hawk
2026-05-27 13:54 ` [PATCH net-next v6 2/5] veth: implement Byte Queue Limits (BQL) for latency reduction hawk
2026-05-27 13:54 ` [PATCH net-next v6 3/5] veth: add tx_timeout watchdog as BQL safety net hawk
2026-05-27 13:54 ` [PATCH net-next v6 4/5] net: sched: add timeout count to NETDEV WATCHDOG message hawk
2026-05-27 13:54 ` hawk [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260527135418.1166665-6-hawk@kernel.org \
--to=hawk@kernel.org \
--cc=andrew+netdev@lunn.ch \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=john.fastabend@gmail.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=sdf@fomichev.me \
--cc=simon.schippers@tu-dortmund.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox