* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-27 13:54 ` [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs hawk
@ 2026-05-28 7:46 ` Jonas Köppeler
2026-06-01 12:00 ` Simon Schippers
2026-05-29 14:51 ` Simon Schippers
` (3 subsequent siblings)
4 siblings, 1 reply; 25+ messages in thread
From: Jonas Köppeler @ 2026-05-28 7:46 UTC (permalink / raw)
To: hawk, netdev
Cc: Simon Schippers, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
John Fastabend, Stanislav Fomichev, linux-kernel, bpf
On 5/27/26 3:54 PM, hawk@kernel.org wrote:
> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>
> Per-packet BQL completion forces DQL to converge on limit=2, causing
> excessive NAPI scheduling overhead and qdisc requeues.
>
> Accumulate BQL completions and flush them when a configurable time
> threshold is exceeded, letting DQL discover a limit that bounds actual
> queuing delay to the configured interval. Coalescing state persists
> across NAPI polls in struct veth_rq so completions can accumulate
> beyond a single budget=64 cycle.
>
> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
> setting tx-usecs to 0 disables coalescing and falls back to per-packet
> completion.
>
> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>
> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
> ---
> drivers/net/veth.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 98 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index d5675d9d5236..743d17b37223 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -28,6 +28,7 @@
> #include <linux/bpf_trace.h>
> #include <linux/net_tstamp.h>
> #include <linux/skbuff_ref.h>
> +#include <linux/sched/clock.h>
> #include <net/page_pool/helpers.h>
>
> #define DRV_NAME "veth"
> @@ -44,6 +45,8 @@
> #define VETH_XDP_TX_BULK_SIZE 16
> #define VETH_XDP_BATCH 16
>
> +#define VETH_BQL_COAL_TX_USECS 100 /* default tx-usecs for BQL batching */
> +
> struct veth_stats {
> u64 rx_drops;
> /* xdp */
> @@ -62,6 +65,11 @@ struct veth_rq_stats {
> struct u64_stats_sync syncp;
> };
>
> +struct veth_bql_state {
> + u64 time; /* sched_clock() when current coalescing window started */
> + int n_bql; /* BQL completions batched in the current window */
> +};
> +
> struct veth_rq {
> struct napi_struct xdp_napi;
> struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
> @@ -69,6 +77,7 @@ struct veth_rq {
> struct bpf_prog __rcu *xdp_prog;
> struct xdp_mem_info xdp_mem;
> struct veth_rq_stats stats;
> + struct veth_bql_state bql_state;
> bool rx_notify_masked;
> struct ptr_ring xdp_ring;
> struct xdp_rxq_info xdp_rxq;
> @@ -81,6 +90,7 @@ struct veth_priv {
> struct bpf_prog *_xdp_prog;
> struct veth_rq *rq;
> unsigned int requested_headroom;
> + unsigned int tx_coal_usecs; /* BQL completion coalescing */
> };
>
> struct veth_xdp_tx_bq {
> @@ -265,7 +275,30 @@ static void veth_get_channels(struct net_device *dev,
> static int veth_set_channels(struct net_device *dev,
> struct ethtool_channels *ch);
>
> +static int veth_get_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + ec->tx_coalesce_usecs = priv->tx_coal_usecs;
> + return 0;
> +}
> +
> +static int veth_set_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + priv->tx_coal_usecs = ec->tx_coalesce_usecs;
> + return 0;
> +}
> +
> static const struct ethtool_ops veth_ethtool_ops = {
> + .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
> .get_drvinfo = veth_get_drvinfo,
> .get_link = ethtool_op_get_link,
> .get_strings = veth_get_strings,
> @@ -275,6 +308,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
> .get_ts_info = ethtool_op_get_ts_info,
> .get_channels = veth_get_channels,
> .set_channels = veth_set_channels,
> + .get_coalesce = veth_get_coalesce,
> + .set_coalesce = veth_set_coalesce,
> };
>
> /* general routines */
> @@ -937,13 +972,45 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> return NULL;
> }
>
> +static void veth_bql_complete(struct veth_bql_state *state,
> + struct netdev_queue *peer_txq)
> +{
> + netdev_tx_completed_queue(peer_txq, state->n_bql,
> + state->n_bql * VETH_BQL_UNIT);
> + state->n_bql = 0;
> + state->time = sched_clock();
> +}
> +
> +static void veth_bql_maybe_complete(struct veth_bql_state *state,
> + struct netdev_queue *peer_txq,
> + u64 coalescing_ns)
> +{
> + if (state->n_bql && sched_clock() >= state->time + coalescing_ns)
> + veth_bql_complete(state, peer_txq);
> +}
> +
> static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct veth_xdp_tx_bq *bq,
> struct veth_stats *stats,
> struct netdev_queue *peer_txq)
> {
> + struct veth_bql_state *state = &rq->bql_state;
> int i, done = 0, n_xdpf = 0;
> void *xdpf[VETH_XDP_BATCH];
> + struct veth_priv *priv;
> + u64 bql_flush_ns;
> +
> + priv = netdev_priv(rq->dev);
> + bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
> +
> + /* Clamp stored timestamp in case we migrated to a CPU with a behind
> + * sched_clock(); prevents the deadline from never firing.
> + */
> + state->time = min(state->time, sched_clock());
> +
> + /* Flush completions that timed out since the previous NAPI poll. */
> + if (peer_txq && bql_flush_ns)
> + veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>
> for (i = 0; i < budget; i++) {
> void *ptr = __ptr_ring_consume(&rq->xdp_ring);
> @@ -972,8 +1039,16 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct sk_buff *skb = veth_ptr_to_skb(ptr);
>
> stats->xdp_bytes += skb->len;
> - if (peer_txq && bql_charged)
> - netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
> + if (peer_txq && bql_charged) {
> + if (!bql_flush_ns) {
> + netdev_tx_completed_queue(peer_txq, 1,
> + VETH_BQL_UNIT);
> + } else {
> + state->n_bql++;
> + veth_bql_maybe_complete(state, peer_txq,
> + bql_flush_ns);
> + }
> + }
>
> skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
> if (skb) {
> @@ -989,6 +1064,18 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> if (n_xdpf)
> veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
>
> + /* If the ring is now empty and the peer TX queue is stalled by DQL
> + * backpressure, release completions immediately to unblock it.
> + */
> + if (peer_txq && state->n_bql && __ptr_ring_empty(&rq->xdp_ring)) {
> + /* Pairs with smp_wmb() in __ptr_ring_produce(); ensure ring
> + * emptiness is observed before reading peer_txq->state.
> + */
> + smp_rmb();
> + if (test_bit(__QUEUE_STATE_STACK_XOFF, &peer_txq->state))
> + veth_bql_complete(state, peer_txq);
> + }
> +
> u64_stats_update_begin(&rq->stats.syncp);
> rq->stats.vs.xdp_redirect += stats->xdp_redirect;
> rq->stats.vs.xdp_bytes += stats->xdp_bytes;
> @@ -1093,6 +1180,9 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
>
> napi_enable(&rq->xdp_napi);
> rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
> +
> + rq->bql_state.time = sched_clock();
> + rq->bql_state.n_bql = 0;
> }
>
> return 0;
> @@ -1134,6 +1224,8 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
> struct veth_rq *rq = &priv->rq[i];
>
> rq->rx_notify_masked = false;
> + rq->bql_state.n_bql = 0;
> + rq->bql_state.time = 0;
> ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
> }
>
> @@ -1813,6 +1905,8 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
>
> static void veth_setup(struct net_device *dev)
> {
> + struct veth_priv *priv = netdev_priv(dev);
> +
> ether_setup(dev);
>
> dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> @@ -1838,6 +1932,8 @@ static void veth_setup(struct net_device *dev)
> dev->max_mtu = ETH_MAX_MTU;
> dev->watchdog_timeo = msecs_to_jiffies(16000);
>
> + priv->tx_coal_usecs = VETH_BQL_COAL_TX_USECS;
> +
> dev->hw_features = VETH_FEATURES;
> dev->hw_enc_features = VETH_FEATURES;
> dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-28 7:46 ` Jonas Köppeler
@ 2026-06-01 12:00 ` Simon Schippers
2026-06-01 14:03 ` Jonas Köppeler
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-01 12:00 UTC (permalink / raw)
To: Jonas Köppeler, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 5/28/26 09:46, Jonas Köppeler wrote:
> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>
>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>> excessive NAPI scheduling overhead and qdisc requeues.
>>
>> Accumulate BQL completions and flush them when a configurable time
>> threshold is exceeded, letting DQL discover a limit that bounds actual
>> queuing delay to the configured interval. Coalescing state persists
>> across NAPI polls in struct veth_rq so completions can accumulate
>> beyond a single budget=64 cycle.
>>
>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>> completion.
>>
>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>
>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>
> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>
Thanks for your testing!
However, I have issues reproducing.
I run bare metal (without virtme) with v6 + your pktgen patch
and I am on the branch pktgen-and-benchmark, commit
"results: add veth-bql measurements":
1. ping fails with 100% packet loss ~20% of the times with --pktgen.
When this happens the avg ping of this run is mistakenly set
to 0.0 ms, which distorts the results.
I fixed it locally by rerunning when this happens.
2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
I see that this is because of qdisc drops.
I also tried pfifo and sfq but with the same result.
I spent quite some time on it but I do not know a fix.
Do you have an idea?
Thanks!
The raw log:
sudo ./veth_bql_test.sh --pktgen --duration 2 --qdisc fq_codel --no-bpftrace --tx-usecs 100 --nrules 10000
INFO: Setting up veth pair with GRO
INFO: Threaded NAPI enabled
INFO: Installing qdisc: fq_codel
INFO: Loaded 10000 iptables rules in consumer NS
INFO: kernel: 7.1.0-rc4-patched-20260307+
INFO: BQL sysfs found: /sys/class/net/veth_bql0/queues/tx-0/byte_queue_limits
INFO: ethtool tx-usecs set to 100 on veth_bql1 (rx side)
INFO: Starting ping to 10.99.0.2 (5/s) to measure latency under load
INFO: Starting pktgen queue_xmit on veth_bql0 (threads=1 pkt_size=64)
[5s] BQL inflight=1 limit=17 watchdog=0
[5s] qdisc fq_codel pkts=27417 drops=6591520 requeues=14115 backlog=0 qlen=0 overlimits=0
[5s] softnet: processed=27960 time_squeeze=0 multi-CPU(6): cpu0(+5) cpu1(+121) cpu2(+33) cpu3(+116) cpu4(+27641) cpu5(+44)
INFO: Ping loss: 0% packet loss
INFO: Ping summary: rtt min/avg/max/mdev = 0.127/1.818/2.703/0.761 ms
INFO: pktgen results (thread 0):
Params: count 0 min_pkt_size: 64 max_pkt_size: 64
frags: 0 delay: 0 clone_skb: 0 ifname: veth_bql0@0
flows: 0 flowlen: 0
queue_map_min: 0 queue_map_max: 0
dst_min: 10.99.0.2 dst_max:
src_min: src_max:
src_mac: 0e:aa:4e:05:95:89 dst_mac: c2:e5:a3:4c:2a:7f
udp_src_min: 9 udp_src_max: 9 udp_dst_min: 9999 udp_dst_max: 9999
src_mac_count: 0 dst_mac_count: 0
xmit_mode: xmit_queue
Flags: NO_TIMESTAMP QUEUE_MAP_CPU SHARED
Current:
pkts-sofar: 6516031 errors: 102854
started: 12246566004us stopped: 12248535234us idle: 0us
seq_num: 6516032 cur_dst_mac_offset: 0 cur_src_mac_offset: 0
cur_saddr: 10.99.0.1 cur_daddr: 10.99.0.2
cur_udp_dst: 9999 cur_udp_src: 9
cur_queue_map: 0
flows: 0
Result: OK: 1969229(c1969229+d0) usec, 6516031 (64byte,0frags)
3308924pps 1694Mb/sec (1694169088bps) errors: 102854
TEST: veth_bql [ OK ]
INFO: Results: /home/simon/repos/veth-backpressure-performance-testing/results/selftests/2026-06-01T13-24-51
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-01 12:00 ` Simon Schippers
@ 2026-06-01 14:03 ` Jonas Köppeler
2026-06-01 16:16 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Jonas Köppeler @ 2026-06-01 14:03 UTC (permalink / raw)
To: Simon Schippers, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/1/26 2:00 PM, Simon Schippers wrote:
> On 5/28/26 09:46, Jonas Köppeler wrote:
>> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>
>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>
>>> Accumulate BQL completions and flush them when a configurable time
>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>> queuing delay to the configured interval. Coalescing state persists
>>> across NAPI polls in struct veth_rq so completions can accumulate
>>> beyond a single budget=64 cycle.
>>>
>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>> completion.
>>>
>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>
>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>>
> Thanks for your testing!
>
> However, I have issues reproducing.
> I run bare metal (without virtme) with v6 + your pktgen patch
> and I am on the branch pktgen-and-benchmark, commit
> "results: add veth-bql measurements":
>
> 1. ping fails with 100% packet loss ~20% of the times with --pktgen.
> When this happens the avg ping of this run is mistakenly set
> to 0.0 ms, which distorts the results.
> I fixed it locally by rerunning when this happens.
>
> 2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
> I see that this is because of qdisc drops.
> I also tried pfifo and sfq but with the same result.
> I spent quite some time on it but I do not know a fix.
>
> Do you have an idea?
> Thanks!
Hi,
yes there are some changes missing in the test script.
I have pushed it now, sorry. This should fix 1.
Regarding 2.: do not look at the pktgen output, in the
new version you will see something like "goodput",
which is the number you should look for.
Pktgen will report at what speed it enqueued packets in
the qdisc.
Let me know if it worked.
Best,
Jonas
> The raw log:
>
> sudo ./veth_bql_test.sh --pktgen --duration 2 --qdisc fq_codel --no-bpftrace --tx-usecs 100 --nrules 10000
> INFO: Setting up veth pair with GRO
> INFO: Threaded NAPI enabled
> INFO: Installing qdisc: fq_codel
> INFO: Loaded 10000 iptables rules in consumer NS
> INFO: kernel: 7.1.0-rc4-patched-20260307+
> INFO: BQL sysfs found: /sys/class/net/veth_bql0/queues/tx-0/byte_queue_limits
> INFO: ethtool tx-usecs set to 100 on veth_bql1 (rx side)
> INFO: Starting ping to 10.99.0.2 (5/s) to measure latency under load
> INFO: Starting pktgen queue_xmit on veth_bql0 (threads=1 pkt_size=64)
> [5s] BQL inflight=1 limit=17 watchdog=0
> [5s] qdisc fq_codel pkts=27417 drops=6591520 requeues=14115 backlog=0 qlen=0 overlimits=0
> [5s] softnet: processed=27960 time_squeeze=0 multi-CPU(6): cpu0(+5) cpu1(+121) cpu2(+33) cpu3(+116) cpu4(+27641) cpu5(+44)
> INFO: Ping loss: 0% packet loss
> INFO: Ping summary: rtt min/avg/max/mdev = 0.127/1.818/2.703/0.761 ms
> INFO: pktgen results (thread 0):
> Params: count 0 min_pkt_size: 64 max_pkt_size: 64
> frags: 0 delay: 0 clone_skb: 0 ifname: veth_bql0@0
> flows: 0 flowlen: 0
> queue_map_min: 0 queue_map_max: 0
> dst_min: 10.99.0.2 dst_max:
> src_min: src_max:
> src_mac: 0e:aa:4e:05:95:89 dst_mac: c2:e5:a3:4c:2a:7f
> udp_src_min: 9 udp_src_max: 9 udp_dst_min: 9999 udp_dst_max: 9999
> src_mac_count: 0 dst_mac_count: 0
> xmit_mode: xmit_queue
> Flags: NO_TIMESTAMP QUEUE_MAP_CPU SHARED
> Current:
> pkts-sofar: 6516031 errors: 102854
> started: 12246566004us stopped: 12248535234us idle: 0us
> seq_num: 6516032 cur_dst_mac_offset: 0 cur_src_mac_offset: 0
> cur_saddr: 10.99.0.1 cur_daddr: 10.99.0.2
> cur_udp_dst: 9999 cur_udp_src: 9
> cur_queue_map: 0
> flows: 0
> Result: OK: 1969229(c1969229+d0) usec, 6516031 (64byte,0frags)
> 3308924pps 1694Mb/sec (1694169088bps) errors: 102854
> TEST: veth_bql [ OK ]
> INFO: Results: /home/simon/repos/veth-backpressure-performance-testing/results/selftests/2026-06-01T13-24-51
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-01 14:03 ` Jonas Köppeler
@ 2026-06-01 16:16 ` Simon Schippers
2026-06-02 7:24 ` Jonas Köppeler
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-01 16:16 UTC (permalink / raw)
To: Jonas Köppeler, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/1/26 16:03, Jonas Köppeler wrote:
> On 6/1/26 2:00 PM, Simon Schippers wrote:
>> On 5/28/26 09:46, Jonas Köppeler wrote:
>>> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>
>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>
>>>> Accumulate BQL completions and flush them when a configurable time
>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>> queuing delay to the configured interval. Coalescing state persists
>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>> beyond a single budget=64 cycle.
>>>>
>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>> completion.
>>>>
>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>
>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>>>
>> Thanks for your testing!
>>
>> However, I have issues reproducing.
>> I run bare metal (without virtme) with v6 + your pktgen patch
>> and I am on the branch pktgen-and-benchmark, commit
>> "results: add veth-bql measurements":
>>
>> 1. ping fails with 100% packet loss ~20% of the times with --pktgen.
>> When this happens the avg ping of this run is mistakenly set
>> to 0.0 ms, which distorts the results.
>> I fixed it locally by rerunning when this happens.
>>
>> 2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
>> I see that this is because of qdisc drops.
>> I also tried pfifo and sfq but with the same result.
>> I spent quite some time on it but I do not know a fix.
>>
>> Do you have an idea?
>> Thanks!
>
> Hi,
> yes there are some changes missing in the test script.
> I have pushed it now, sorry. This should fix 1.
I pulled it and ran...
sudo ./veth_bql_sweep.sh --runs 1 --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
... but still 8/32=1/4 of the pings are zero, I do not see
a pattern.
I grabbed the logs from /tmp and this is what a failing
ping looks like:
PING 10.99.0.2 (10.99.0.2) 56(84) bytes of data.
--- 10.99.0.2 ping statistics ---
97 packets transmitted, 0 received, 100% packet loss, time 19967ms
Feels like a race or something..
Can you reproduce with the exact command?
I think you need --runs 1, else it just averages over multiple
runs.
> Regarding 2.: do not look at the pktgen output, in the
> new version you will see something like "goodput",
> which is the number you should look for.
> Pktgen will report at what speed it enqueued packets in
> the qdisc.
Exactly. Now it works. Had a single outlier but apart from that
everything is fine.
Thanks,
Simon
>
> Let me know if it worked.
> Best,
> Jonas
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-01 16:16 ` Simon Schippers
@ 2026-06-02 7:24 ` Jonas Köppeler
2026-06-02 15:37 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Jonas Köppeler @ 2026-06-02 7:24 UTC (permalink / raw)
To: Simon Schippers, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/1/26 6:16 PM, Simon Schippers wrote:
> On 6/1/26 16:03, Jonas Köppeler wrote:
>> On 6/1/26 2:00 PM, Simon Schippers wrote:
>>> On 5/28/26 09:46, Jonas Köppeler wrote:
>>>> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>
>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>
>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>> beyond a single budget=64 cycle.
>>>>>
>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>> completion.
>>>>>
>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>
>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>>>>
>>> Thanks for your testing!
>>>
>>> However, I have issues reproducing.
>>> I run bare metal (without virtme) with v6 + your pktgen patch
>>> and I am on the branch pktgen-and-benchmark, commit
>>> "results: add veth-bql measurements":
>>>
>>> 1. ping fails with 100% packet loss ~20% of the times with --pktgen.
>>> When this happens the avg ping of this run is mistakenly set
>>> to 0.0 ms, which distorts the results.
>>> I fixed it locally by rerunning when this happens.
>>>
>>> 2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
>>> I see that this is because of qdisc drops.
>>> I also tried pfifo and sfq but with the same result.
>>> I spent quite some time on it but I do not know a fix.
>>>
>>> Do you have an idea?
>>> Thanks!
>> Hi,
>> yes there are some changes missing in the test script.
>> I have pushed it now, sorry. This should fix 1.
> I pulled it and ran...
>
> sudo ./veth_bql_sweep.sh --runs 1 --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
>
> ... but still 8/32=1/4 of the pings are zero, I do not see
> a pattern.
>
>
> I grabbed the logs from /tmp and this is what a failing
> ping looks like:
>
> PING 10.99.0.2 (10.99.0.2) 56(84) bytes of data.
>
> --- 10.99.0.2 ping statistics ---
> 97 packets transmitted, 0 received, 100% packet loss, time 19967ms
>
>
> Feels like a race or something..
> Can you reproduce with the exact command?
> I think you need --runs 1, else it just averages over multiple
> runs.
Sorry, no I could not reproduce this. I used the exact same
command as you did, and I am using net-next/main + v6 patches.
I have 0% ping loss across all tests. Does the ping loss
happen regardless of the qdisc?
>
>> Regarding 2.: do not look at the pktgen output, in the
>> new version you will see something like "goodput",
>> which is the number you should look for.
>> Pktgen will report at what speed it enqueued packets in
>> the qdisc.
> Exactly. Now it works. Had a single outlier but apart from that
> everything is fine.
>
> Thanks,
> Simon
>
>> Let me know if it worked.
>> Best,
>> Jonas
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-02 7:24 ` Jonas Köppeler
@ 2026-06-02 15:37 ` Simon Schippers
2026-06-03 8:28 ` Jonas Köppeler
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-02 15:37 UTC (permalink / raw)
To: Jonas Köppeler, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/2/26 09:24, Jonas Köppeler wrote:
> On 6/1/26 6:16 PM, Simon Schippers wrote:
>
>> On 6/1/26 16:03, Jonas Köppeler wrote:
>>> On 6/1/26 2:00 PM, Simon Schippers wrote:
>>>> On 5/28/26 09:46, Jonas Köppeler wrote:
>>>>> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>
>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>
>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>> beyond a single budget=64 cycle.
>>>>>>
>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>> completion.
>>>>>>
>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>
>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>>>>>
>>>> Thanks for your testing!
>>>>
>>>> However, I have issues reproducing.
>>>> I run bare metal (without virtme) with v6 + your pktgen patch
>>>> and I am on the branch pktgen-and-benchmark, commit
>>>> "results: add veth-bql measurements":
>>>>
>>>> 1. ping fails with 100% packet loss ~20% of the times with --pktgen.
>>>> When this happens the avg ping of this run is mistakenly set
>>>> to 0.0 ms, which distorts the results.
>>>> I fixed it locally by rerunning when this happens.
>>>>
>>>> 2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
>>>> I see that this is because of qdisc drops.
>>>> I also tried pfifo and sfq but with the same result.
>>>> I spent quite some time on it but I do not know a fix.
>>>>
>>>> Do you have an idea?
>>>> Thanks!
>>> Hi,
>>> yes there are some changes missing in the test script.
>>> I have pushed it now, sorry. This should fix 1.
>> I pulled it and ran...
>>
>> sudo ./veth_bql_sweep.sh --runs 1 --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
>>
>> ... but still 8/32=1/4 of the pings are zero, I do not see
>> a pattern.
>>
>>
>> I grabbed the logs from /tmp and this is what a failing
>> ping looks like:
>>
>> PING 10.99.0.2 (10.99.0.2) 56(84) bytes of data.
>>
>> --- 10.99.0.2 ping statistics ---
>> 97 packets transmitted, 0 received, 100% packet loss, time 19967ms
>>
>>
>> Feels like a race or something..
>> Can you reproduce with the exact command?
>> I think you need --runs 1, else it just averages over multiple
>> runs.
>
> Sorry, no I could not reproduce this. I used the exact same
> command as you did, and I am using net-next/main + v6 patches.
> I have 0% ping loss across all tests. Does the ping loss
> happen regardless of the qdisc?
>
Yes, it happens for each qdisc I tested.
As a fix I changed the script to rerun if this happens.
With that I ran the benchmark and also created a script to have
the result as an ASCII table.
I think it would make sense to include something like this in
the commit message.
Throughput (pps)
==================================================
nrules | 0us | 100us | 1000us | 10000us || stock
-------+-------+-------+--------+---------++------
0 | 1.65M | 1.75M | 1.74M | 1.74M || 1.73M
100 | 684K | 755K | 730K | 728K || 744K
1000 | 119K | 126K | 126K | 125K || 126K
10000 | 13K | 12K | 13K | 13K || 13K
Ping RTT ms (avg)
==================================================
nrules | 0us | 100us | 1000us | 10000us || stock
-------+-------+-------+--------+---------++------
0 | 0.016 | 0.138 | 0.137 | 0.135 || 0.133
100 | 0.029 | 0.185 | 0.310 | 0.315 || 0.310
1000 | 0.137 | 0.321 | 1.66 | 1.81 || 1.78
10000 | 1.22 | 1.87 | 3.02 | 16.0 || 17.2
>>
>>> Regarding 2.: do not look at the pktgen output, in the
>>> new version you will see something like "goodput",
>>> which is the number you should look for.
>>> Pktgen will report at what speed it enqueued packets in
>>> the qdisc.
>> Exactly. Now it works. Had a single outlier but apart from that
>> everything is fine.
>>
>> Thanks,
>> Simon
>>
>>> Let me know if it worked.
>>> Best,
>>> Jonas
>>>
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-02 15:37 ` Simon Schippers
@ 2026-06-03 8:28 ` Jonas Köppeler
0 siblings, 0 replies; 25+ messages in thread
From: Jonas Köppeler @ 2026-06-03 8:28 UTC (permalink / raw)
To: Simon Schippers, hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/2/26 17:37, Simon Schippers wrote:
> On 6/2/26 09:24, Jonas Köppeler wrote:
>> On 6/1/26 6:16 PM, Simon Schippers wrote:
>>
>>> On 6/1/26 16:03, Jonas Köppeler wrote:
>>>> On 6/1/26 2:00 PM, Simon Schippers wrote:
>>>>> On 5/28/26 09:46, Jonas Köppeler wrote:
>>>>>> On 5/27/26 3:54 PM, hawk@kernel.org wrote:
>>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>
>>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>>
>>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>>> beyond a single budget=64 cycle.
>>>>>>>
>>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>>> completion.
>>>>>>>
>>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>>
>>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>> Tested-by: Jonas Köppeler<j.koeppeler@tu-berlin.de>
>>>>>>
>>>>> Thanks for your testing!
>>>>>
>>>>> However, I have issues reproducing.
>>>>> I run bare metal (without virtme) with v6 + your pktgen patch
>>>>> and I am on the branch pktgen-and-benchmark, commit
>>>>> "results: add veth-bql measurements":
>>>>>
>>>>> 1. ping fails with 100% packet loss ~20% of the times with --pktgen.
>>>>> When this happens the avg ping of this run is mistakenly set
>>>>> to 0.0 ms, which distorts the results.
>>>>> I fixed it locally by rerunning when this happens.
>>>>>
>>>>> 2. pktgen runs with > 3 Mpps even with --nrules 10000, see log below.
>>>>> I see that this is because of qdisc drops.
>>>>> I also tried pfifo and sfq but with the same result.
>>>>> I spent quite some time on it but I do not know a fix.
>>>>>
>>>>> Do you have an idea?
>>>>> Thanks!
>>>> Hi,
>>>> yes there are some changes missing in the test script.
>>>> I have pushed it now, sorry. This should fix 1.
>>> I pulled it and ran...
>>>
>>> sudo ./veth_bql_sweep.sh --runs 1 --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
>>>
>>> ... but still 8/32=1/4 of the pings are zero, I do not see
>>> a pattern.
>>>
>>>
>>> I grabbed the logs from /tmp and this is what a failing
>>> ping looks like:
>>>
>>> PING 10.99.0.2 (10.99.0.2) 56(84) bytes of data.
>>>
>>> --- 10.99.0.2 ping statistics ---
>>> 97 packets transmitted, 0 received, 100% packet loss, time 19967ms
>>>
>>>
>>> Feels like a race or something..
>>> Can you reproduce with the exact command?
>>> I think you need --runs 1, else it just averages over multiple
>>> runs.
>> Sorry, no I could not reproduce this. I used the exact same
>> command as you did, and I am using net-next/main + v6 patches.
>> I have 0% ping loss across all tests. Does the ping loss
>> happen regardless of the qdisc?
>>
> Yes, it happens for each qdisc I tested.
> As a fix I changed the script to rerun if this happens.
I was able to reproduce it and I think it is because you are lacking
support of the qdiscs, and thus you will fallback to pfifo, this
explains why it occasionally will drop all packets.
>
> With that I ran the benchmark and also created a script to have
> the result as an ASCII table.
> I think it would make sense to include something like this in
> the commit message.
>
> Throughput (pps)
> ==================================================
> nrules | 0us | 100us | 1000us | 10000us || stock
> -------+-------+-------+--------+---------++------
> 0 | 1.65M | 1.75M | 1.74M | 1.74M || 1.73M
> 100 | 684K | 755K | 730K | 728K || 744K
> 1000 | 119K | 126K | 126K | 125K || 126K
> 10000 | 13K | 12K | 13K | 13K || 13K
>
>
> Ping RTT ms (avg)
> ==================================================
> nrules | 0us | 100us | 1000us | 10000us || stock
> -------+-------+-------+--------+---------++------
> 0 | 0.016 | 0.138 | 0.137 | 0.135 || 0.133
> 100 | 0.029 | 0.185 | 0.310 | 0.315 || 0.310
> 1000 | 0.137 | 0.321 | 1.66 | 1.81 || 1.78
> 10000 | 1.22 | 1.87 | 3.02 | 16.0 || 17.2
Yes, good idea. I think we should report p99 + max_ping and not the avg.
>
>>>> Regarding 2.: do not look at the pktgen output, in the
>>>> new version you will see something like "goodput",
>>>> which is the number you should look for.
>>>> Pktgen will report at what speed it enqueued packets in
>>>> the qdisc.
>>> Exactly. Now it works. Had a single outlier but apart from that
>>> everything is fine.
>>>
>>> Thanks,
>>> Simon
>>>
>>>> Let me know if it worked.
>>>> Best,
>>>> Jonas
>>>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-27 13:54 ` [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs hawk
2026-05-28 7:46 ` Jonas Köppeler
@ 2026-05-29 14:51 ` Simon Schippers
2026-05-30 0:50 ` sashiko-bot
` (2 subsequent siblings)
4 siblings, 0 replies; 25+ messages in thread
From: Simon Schippers @ 2026-05-29 14:51 UTC (permalink / raw)
To: hawk, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 5/27/26 15:54, hawk@kernel.org wrote:
> From: Simon Schippers <simon.schippers@tu-dortmund.de>
> + /* If the ring is now empty and the peer TX queue is stalled by DQL
> + * backpressure, release completions immediately to unblock it.
> + */
> + if (peer_txq && state->n_bql && __ptr_ring_empty(&rq->xdp_ring)) {
> + /* Pairs with smp_wmb() in __ptr_ring_produce(); ensure ring
> + * emptiness is observed before reading peer_txq->state.
> + */
> + smp_rmb();
> + if (test_bit(__QUEUE_STATE_STACK_XOFF, &peer_txq->state))
> + veth_bql_complete(state, peer_txq);
> + }
"Does this leave coalesced BQL completions pending indefinitely if the
receive ring is empty but the peer TX queue isn't stalled by DQL? [...]"
Regarding this by [1]: Yes, this is desired behavior IMO.
From my understanding there should not be a BQL stall caused by
this (needs to be tested though).
A minor nit is that the observation of the minimum amount of
slack found over several iterations of the completion processing
(saved in dql->lowest_slack) is inaccurate and therefore the limit
might be set to a too low value once (at the next completion call).
I think it does not matter though.
Overall The AI reviews [1] [2] have some valid points.
Apart from that I would consider joining patch 2 and 5.
[1] Link: https://sashiko.dev/#/patchset/20260527135418.1166665-1-hawk%40kernel.org
[2] Link: https://netdev-ai.bots.linux.dev/sashiko/#/patchset/20260527135418.1166665-1-hawk%40kernel.org
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-27 13:54 ` [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs hawk
2026-05-28 7:46 ` Jonas Köppeler
2026-05-29 14:51 ` Simon Schippers
@ 2026-05-30 0:50 ` sashiko-bot
2026-06-04 8:21 ` Paolo Abeni
2026-06-08 10:38 ` Simon Schippers
4 siblings, 0 replies; 25+ messages in thread
From: sashiko-bot @ 2026-05-30 0:50 UTC (permalink / raw)
To: hawk; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [High] Coalesced BQL completions are left pending indefinitely when the receive ring is empty and NAPI yields, corrupting DQL timing.
- [Medium] Disabling BQL coalescing at runtime (tx-usecs = 0) orphans previously accumulated completions, permanently skewing DQL state.
- [Low] Data race on `priv->tx_coal_usecs` between ethtool configuration and the NAPI poll data path.
--
commit d710fd75a7c5d61400890032084a8df84bc9498d
Author: Simon Schippers <simon.schippers@tu-dortmund.de>
veth: time-based BQL completion coalescing via ethtool tx-usecs
This patch implements time-based batching of BQL completions for the veth
driver to reduce NAPI scheduling overhead. It accumulates BQL completions
and flushes them when a configurable time threshold is exceeded, letting
DQL discover a limit that bounds actual queuing delay to the configured
interval.
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index d5675d9d52361..743d17b37223e 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
[ ... ]
> @@ -265,7 +275,30 @@ static void veth_get_channels(struct net_device *dev,
> static int veth_set_channels(struct net_device *dev,
> struct ethtool_channels *ch);
>
> +static int veth_get_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + ec->tx_coalesce_usecs = priv->tx_coal_usecs;
> + return 0;
> +}
> +
> +static int veth_set_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + priv->tx_coal_usecs = ec->tx_coalesce_usecs;
[Severity: Low]
Are we missing a WRITE_ONCE() here? priv->tx_coal_usecs is modified under
the RTNL lock but read by veth_xdp_rcv() in softirq context during NAPI
polling without any synchronization. Using WRITE_ONCE() and READ_ONCE()
would prevent compiler reordering and load tearing.
> + return 0;
> +}
[ ... ]
> static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct veth_xdp_tx_bq *bq,
> struct veth_stats *stats,
> struct netdev_queue *peer_txq)
> {
> + struct veth_bql_state *state = &rq->bql_state;
> int i, done = 0, n_xdpf = 0;
> void *xdpf[VETH_XDP_BATCH];
> + struct veth_priv *priv;
> + u64 bql_flush_ns;
> +
> + priv = netdev_priv(rq->dev);
> + bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
> +
> + /* Clamp stored timestamp in case we migrated to a CPU with a behind
> + * sched_clock(); prevents the deadline from never firing.
> + */
> + state->time = min(state->time, sched_clock());
> +
> + /* Flush completions that timed out since the previous NAPI poll. */
> + if (peer_txq && bql_flush_ns)
> + veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
[Severity: Medium]
What happens if the user disables coalescing by setting tx-usecs to 0 while
there are accumulated completions?
Because bql_flush_ns becomes 0, this pre-loop time-based flush check will
evaluate to false, skipping the flush of previously batched completions.
Inside the loop, new packets are completed individually, ignoring the old
state->n_bql count.
If those pending completions remain stranded indefinitely, won't the
in-flight byte count tracked by DQL become permanently inflated,
artificially reducing the available queue capacity?
> for (i = 0; i < budget; i++) {
> void *ptr = __ptr_ring_consume(&rq->xdp_ring);
[ ... ]
> @@ -989,6 +1064,18 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> if (n_xdpf)
> veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
>
> + /* If the ring is now empty and the peer TX queue is stalled by DQL
> + * backpressure, release completions immediately to unblock it.
> + */
> + if (peer_txq && state->n_bql && __ptr_ring_empty(&rq->xdp_ring)) {
> + /* Pairs with smp_wmb() in __ptr_ring_produce(); ensure ring
> + * emptiness is observed before reading peer_txq->state.
> + */
> + smp_rmb();
> + if (test_bit(__QUEUE_STATE_STACK_XOFF, &peer_txq->state))
> + veth_bql_complete(state, peer_txq);
> + }
[Severity: High]
Does this leave coalesced BQL completions pending indefinitely if the
receive ring is empty but the peer TX queue isn't stalled by DQL?
When the NAPI budget loop finishes and the RX ring is empty, the device may
go idle and NAPI polling stops. The batched completions (state->n_bql) are
only flushed if the TX queue is currently stalled by DQL.
If the queue is not stalled, these completions are left pending indefinitely
without a timer to flush them. Since DQL measures the time between enqueue
and completion, wouldn't this arbitrary delay (until the next packet
arrives) be perceived by DQL as a massive transmission delay?
This could cause it to incorrectly shrink the dynamic limit to its minimum
and severely throttle transmission performance.
> +
> u64_stats_update_begin(&rq->stats.syncp);
> rq->stats.vs.xdp_redirect += stats->xdp_redirect;
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260527135418.1166665-1-hawk@kernel.org?part=5
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-27 13:54 ` [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs hawk
` (2 preceding siblings ...)
2026-05-30 0:50 ` sashiko-bot
@ 2026-06-04 8:21 ` Paolo Abeni
2026-06-08 10:38 ` Simon Schippers
4 siblings, 0 replies; 25+ messages in thread
From: Paolo Abeni @ 2026-06-04 8:21 UTC (permalink / raw)
To: hawk, netdev
Cc: Simon Schippers, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
John Fastabend, Stanislav Fomichev, linux-kernel, bpf
On 5/27/26 3:54 PM, hawk@kernel.org wrote:
> +static int veth_set_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + priv->tx_coal_usecs = ec->tx_coalesce_usecs;
Sashiko noted _ONCE() annotation are needed here and for lockless reader
below.
/P
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-05-27 13:54 ` [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs hawk
` (3 preceding siblings ...)
2026-06-04 8:21 ` Paolo Abeni
@ 2026-06-08 10:38 ` Simon Schippers
2026-06-08 13:04 ` Jesper Dangaard Brouer
4 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-08 10:38 UTC (permalink / raw)
To: hawk, netdev, Jonas Köppeler
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
[-- Attachment #1: Type: text/plain, Size: 2688 bytes --]
On 5/27/26 15:54, hawk@kernel.org wrote:
> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>
> Per-packet BQL completion forces DQL to converge on limit=2, causing
> excessive NAPI scheduling overhead and qdisc requeues.
>
> Accumulate BQL completions and flush them when a configurable time
> threshold is exceeded, letting DQL discover a limit that bounds actual
> queuing delay to the configured interval. Coalescing state persists
> across NAPI polls in struct veth_rq so completions can accumulate
> beyond a single budget=64 cycle.
>
> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
> setting tx-usecs to 0 disables coalescing and falls back to per-packet
> completion.
>
> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>
> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> ---
I found the issue that n_bql may become infinitly large if producer
and consumer have the same speed (and tx_usecs is large). It could
cause a potential BUG_ON if n_bql grows beyond INT_MAX...
Also I figured that no hardware BQL driver ever completes more than
BQL limit many elements.
Therefore, I propose a simpler logic (see attachment) that completes
either on the usual bql_flush_ns or if n_bql > dql.limit.
If n_bql > dql.limit then we either have the case above that the
producer is as fast as the consumer or we have BQL starvation.
if (state->time + bql_flush_ns <= current_time ||
state->n_bql > peer_txq->dql.limit) {
It must be n_bql *bigger than* dql.limit because the producer will
always exceed the limit before it stops, see netdev_tx_sent_queue().
It is fast because peer_txq->dql.limit is in the cacheline of the
completion path, see dynamic_queue_limits.h.
Another advantage is that we avoid the snippet checking for empty
and BQL stopped which requires an smp_rmb() and an test_bit().
Apart from that I:
- Always call veth_bql_maybe_complete() in the for loop to have
more accurate completion intervals when having mixed XDP and
non-XDP packets.
- Made it so tx_usecs = 0 is now also a normal case.
- Change the type of n_bql to uint instead of int.
- Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
- Moved the bql_state init in __veth_napi_enable_range() in front
of napi_enable() to avoid a race (Sashiko).
- Moved the bql_state reset in veth_napi_del_range() after the
ptr_ring_cleanup() (probably does not matter but makes sense to me)
Benchmarks look just fine, see commit message.
WDYT?
Thanks,
Simon
[-- Attachment #2: 0005-veth-time-based-BQL-completion-coalescing-via-ethtoo.patch --]
[-- Type: text/x-patch, Size: 10426 bytes --]
From 59844f703988805ff7913989ed4dcd427ae882af Mon Sep 17 00:00:00 2001
From: Simon Schippers <simon.schippers@tu-dortmund.de>
Date: Wed, 27 May 2026 15:54:16 +0200
Subject: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing
via ethtool tx-usecs
Per-packet BQL completion forces DQL to converge on limit=2, causing
excessive NAPI scheduling overhead and qdisc requeues.
Accumulate BQL completions and flush them when a configurable time
threshold (tx-usecs) is exceeded, letting DQL discover a limit that
bounds actual queuing delay to the configured interval. Coalescing
state persists across NAPI polls in struct veth_rq so completions can
accumulate beyond a single budget=64 cycle.
The flush condition is:
state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit
Flushing when n_bql exceeds dql.limit handles two cases:
- BQL starvation
- The steady-state case where the producer and consumer run at the
same speed with a large tx-usecs, which would otherwise allow n_bql
to grow without bound (and potentially overflow int).
The comparison is strictly greater-than because netdev_tx_sent_queue()
always lets the producer exceed the limit by one before it stops, so
n_bql == dql.limit is a normal in-flight state. dql.limit lives in
the same cacheline as the completion path, so the check is cheap.
Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
setting tx-usecs to 0 disables coalescing and falls back to per-packet
completion.
ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
Benchmarks (10 runs, Ryzen 5 5600X @ 4.3 GHz, SMT off, 3200 MHz RAM):
Throughput (pps)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 1.62M | 1.89M | 1.75M | 1.73M | 1.73M | 1.73M | 1.73M || 1.76M
1 | 1.51M | 1.72M | 1.63M | 1.60M | 1.60M | 1.59M | 1.59M || 1.64M
10 | 1.33M | 1.52M | 1.47M | 1.41M | 1.41M | 1.41M | 1.41M || 1.45M
100 | 675K | 748K | 757K | 722K | 722K | 724K | 729K || 737K
1000 | 117K | 125K | 125K | 126K | 124K | 124K | 124K || 126K
10000 | 13K | 13K | 13K | 13K | 13K | 13K | 13K || 13K
Ping RTT ms (avg)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 0.017 | 0.090 | 0.137 | 0.138 | 0.138 | 0.138 | 0.138 || 0.133
1 | 0.017 | 0.097 | 0.145 | 0.146 | 0.144 | 0.148 | 0.147 || 0.143
10 | 0.018 | 0.092 | 0.158 | 0.165 | 0.165 | 0.162 | 0.167 || 0.159
100 | 0.031 | 0.104 | 0.181 | 0.317 | 0.317 | 0.317 | 0.311 || 0.305
1000 | 0.142 | 0.198 | 0.314 | 0.991 | 1.69 | 1.82 | 1.82 || 1.76
10000 | 1.12 | 1.72 | 1.74 | 1.76 | 2.88 | 9.27 | 15.9 || 17.4
Ping RTT ms (p99)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
1 | 0.027 | 0.123 | 0.170 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
10 | 0.030 | 0.117 | 0.190 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
100 | 0.045 | 0.134 | 0.231 | 0.368 | 0.365 | 0.370 | 0.361 || 0.358
1000 | 0.230 | 0.300 | 0.408 | 0.989 | 2.11 | 2.12 | 2.13 || 2.07
10000 | 0.979 | 1.59 | 1.26 | 2.06 | 3.77 | 9.87 | 20.1 || 20.3
Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
drivers/net/veth.c | 93 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 90 insertions(+), 3 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index d5675d9d5236..b9179de628a6 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -28,6 +28,7 @@
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <linux/skbuff_ref.h>
+#include <linux/sched/clock.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
@@ -44,6 +45,8 @@
#define VETH_XDP_TX_BULK_SIZE 16
#define VETH_XDP_BATCH 16
+#define VETH_BQL_COAL_TX_USECS 100 /* default tx-usecs for BQL batching */
+
struct veth_stats {
u64 rx_drops;
/* xdp */
@@ -62,6 +65,11 @@ struct veth_rq_stats {
struct u64_stats_sync syncp;
};
+struct veth_bql_state {
+ u64 time; /* sched_clock() when current coalescing window started */
+ uint n_bql; /* BQL completions batched in the current window */
+};
+
struct veth_rq {
struct napi_struct xdp_napi;
struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
@@ -69,6 +77,7 @@ struct veth_rq {
struct bpf_prog __rcu *xdp_prog;
struct xdp_mem_info xdp_mem;
struct veth_rq_stats stats;
+ struct veth_bql_state bql_state;
bool rx_notify_masked;
struct ptr_ring xdp_ring;
struct xdp_rxq_info xdp_rxq;
@@ -81,6 +90,7 @@ struct veth_priv {
struct bpf_prog *_xdp_prog;
struct veth_rq *rq;
unsigned int requested_headroom;
+ unsigned int tx_coal_usecs; /* BQL completion coalescing */
};
struct veth_xdp_tx_bq {
@@ -265,7 +275,31 @@ static void veth_get_channels(struct net_device *dev,
static int veth_set_channels(struct net_device *dev,
struct ethtool_channels *ch);
+static int veth_get_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct veth_priv *priv = netdev_priv(dev);
+
+ ec->tx_coalesce_usecs = priv->tx_coal_usecs;
+ return 0;
+}
+
+static int veth_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct veth_priv *priv = netdev_priv(dev);
+
+ /* Paired with READ_ONCE in veth_xdp_rcv(). */
+ WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+ return 0;
+}
+
static const struct ethtool_ops veth_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_strings = veth_get_strings,
@@ -275,6 +309,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
.get_channels = veth_get_channels,
.set_channels = veth_set_channels,
+ .get_coalesce = veth_get_coalesce,
+ .set_coalesce = veth_set_coalesce,
};
/* general routines */
@@ -937,13 +973,56 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
return NULL;
}
+static void veth_bql_maybe_complete(struct veth_bql_state *state,
+ struct netdev_queue *peer_txq,
+ u64 bql_flush_ns)
+{
+ u64 current_time;
+
+ /* There is no reason to complete with 0 and
+ * peer_txq could go away.
+ */
+ if (!state->n_bql || !peer_txq)
+ return;
+
+ current_time = sched_clock();
+
+ /* We complete if:
+ * 1. We reach bql_flush_ns.
+ * 2. We potentially have BQL starvation.
+ */
+ if (state->time + bql_flush_ns <= current_time ||
+ state->n_bql > peer_txq->dql.limit) {
+ netdev_tx_completed_queue(peer_txq, state->n_bql,
+ state->n_bql * VETH_BQL_UNIT);
+ state->time = current_time;
+ state->n_bql = 0;
+ }
+}
+
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats,
struct netdev_queue *peer_txq)
{
+ struct veth_bql_state *state = &rq->bql_state;
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
+ struct veth_priv *priv;
+ u64 bql_flush_ns;
+
+ priv = netdev_priv(rq->dev);
+
+ /* Paired with WRITE_ONCE() in veth_set_coalesce(). */
+ bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
+
+ /* Clamp stored timestamp in case we migrated to a CPU with a behind
+ * sched_clock(); tries to reduce late BQL flushes.
+ */
+ state->time = min(state->time, sched_clock());
+
+ /* Flush completions that timed out since the previous NAPI poll. */
+ veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
for (i = 0; i < budget; i++) {
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
@@ -968,12 +1047,10 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
}
} else {
/* ndo_start_xmit */
- bool bql_charged = veth_ptr_is_bql(ptr);
struct sk_buff *skb = veth_ptr_to_skb(ptr);
+ state->n_bql += veth_ptr_is_bql(ptr);
stats->xdp_bytes += skb->len;
- if (peer_txq && bql_charged)
- netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
if (skb) {
@@ -983,6 +1060,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
napi_gro_receive(&rq->xdp_napi, skb);
}
}
+ veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
done++;
}
@@ -1091,6 +1169,9 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
+ rq->bql_state.time = sched_clock();
+ rq->bql_state.n_bql = 0;
+
napi_enable(&rq->xdp_napi);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
@@ -1135,6 +1216,8 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
rq->rx_notify_masked = false;
ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
+ rq->bql_state.n_bql = 0;
+ rq->bql_state.time = 0;
}
/* Reset BQL and wake stopped peer txqs. A concurrent veth_xmit()
@@ -1813,6 +1896,8 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
static void veth_setup(struct net_device *dev)
{
+ struct veth_priv *priv = netdev_priv(dev);
+
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
@@ -1838,6 +1923,8 @@ static void veth_setup(struct net_device *dev)
dev->max_mtu = ETH_MAX_MTU;
dev->watchdog_timeo = msecs_to_jiffies(16000);
+ priv->tx_coal_usecs = VETH_BQL_COAL_TX_USECS;
+
dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES;
dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
--
2.43.0
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-08 10:38 ` Simon Schippers
@ 2026-06-08 13:04 ` Jesper Dangaard Brouer
2026-06-08 13:13 ` Jonas Köppeler
0 siblings, 1 reply; 25+ messages in thread
From: Jesper Dangaard Brouer @ 2026-06-08 13:04 UTC (permalink / raw)
To: Simon Schippers, netdev, Jonas Köppeler
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 08/06/2026 12.38, Simon Schippers wrote:
> On 5/27/26 15:54, hawk@kernel.org wrote:
>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>
>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>> excessive NAPI scheduling overhead and qdisc requeues.
>>
>> Accumulate BQL completions and flush them when a configurable time
>> threshold is exceeded, letting DQL discover a limit that bounds actual
>> queuing delay to the configured interval. Coalescing state persists
>> across NAPI polls in struct veth_rq so completions can accumulate
>> beyond a single budget=64 cycle.
>>
>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>> completion.
>>
>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>
>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>> ---
>
> I found the issue that n_bql may become infinitly large if producer
> and consumer have the same speed (and tx_usecs is large). It could
> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
> Also I figured that no hardware BQL driver ever completes more than
> BQL limit many elements.
>
> Therefore, I propose a simpler logic (see attachment) that completes
> either on the usual bql_flush_ns or if n_bql > dql.limit.
> If n_bql > dql.limit then we either have the case above that the
> producer is as fast as the consumer or we have BQL starvation.
>
> if (state->time + bql_flush_ns <= current_time ||
> state->n_bql > peer_txq->dql.limit) {
>
> It must be n_bql *bigger than* dql.limit because the producer will
> always exceed the limit before it stops, see netdev_tx_sent_queue().
> It is fast because peer_txq->dql.limit is in the cacheline of the
> completion path, see dynamic_queue_limits.h.
>
> Another advantage is that we avoid the snippet checking for empty
> and BQL stopped which requires an smp_rmb() and an test_bit().
>
> Apart from that I:
> - Always call veth_bql_maybe_complete() in the for loop to have
> more accurate completion intervals when having mixed XDP and
> non-XDP packets.
> - Made it so tx_usecs = 0 is now also a normal case.
> - Change the type of n_bql to uint instead of int.
> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
> - Moved the bql_state init in __veth_napi_enable_range() in front
> of napi_enable() to avoid a race (Sashiko).
> - Moved the bql_state reset in veth_napi_del_range() after the
> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>
> Benchmarks look just fine, see commit message.
>
> WDYT?
Looks good to me, I will use this in my V7 patchset.
A bike-shedding issue: We change the coalescing parameters for the veth
net_device, but should this be a TX or RX parameter?
For physical NICs adjusting TX coalescing will affect the BQL as this
affect the TX completion of the transmitted packets. For veth, it is the
veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
is where this patch adds netdev_tx_completed_queue calls for BQL.
Any opinions on the "TX" or "RX" color?
--Jesper
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-08 13:04 ` Jesper Dangaard Brouer
@ 2026-06-08 13:13 ` Jonas Köppeler
2026-06-08 14:21 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Jonas Köppeler @ 2026-06-08 13:13 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Simon Schippers, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>
>
> On 08/06/2026 12.38, Simon Schippers wrote:
>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>
>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>
>>> Accumulate BQL completions and flush them when a configurable time
>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>> queuing delay to the configured interval. Coalescing state persists
>>> across NAPI polls in struct veth_rq so completions can accumulate
>>> beyond a single budget=64 cycle.
>>>
>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>> completion.
>>>
>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>
>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>> ---
>>
>> I found the issue that n_bql may become infinitly large if producer
>> and consumer have the same speed (and tx_usecs is large). It could
>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>> Also I figured that no hardware BQL driver ever completes more than
>> BQL limit many elements.
>>
>> Therefore, I propose a simpler logic (see attachment) that completes
>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>> If n_bql > dql.limit then we either have the case above that the
>> producer is as fast as the consumer or we have BQL starvation.
>>
>> if (state->time + bql_flush_ns <= current_time ||
>> state->n_bql > peer_txq->dql.limit) {
>>
>> It must be n_bql *bigger than* dql.limit because the producer will
>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>> It is fast because peer_txq->dql.limit is in the cacheline of the
>> completion path, see dynamic_queue_limits.h.
>>
>> Another advantage is that we avoid the snippet checking for empty
>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>
>> Apart from that I:
>> - Always call veth_bql_maybe_complete() in the for loop to have
>> more accurate completion intervals when having mixed XDP and
>> non-XDP packets.
>> - Made it so tx_usecs = 0 is now also a normal case.
>> - Change the type of n_bql to uint instead of int.
>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>> - Moved the bql_state init in __veth_napi_enable_range() in front
>> of napi_enable() to avoid a race (Sashiko).
>> - Moved the bql_state reset in veth_napi_del_range() after the
>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>
>> Benchmarks look just fine, see commit message.
>>
>> WDYT?
>
> Looks good to me, I will use this in my V7 patchset.
>
> A bike-shedding issue: We change the coalescing parameters for the veth
> net_device, but should this be a TX or RX parameter?
>
Hi,
The results look a little bit suspicious, that in some cases the p99 is
smaller than the average, which can happen if you have really large max
values. It feels something is off with the methodology. I think we
should drop the avg, and include the max value. This would give a better
picture.
Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done
in the kernel? It relies on bool normalization which feels a bit
implicit to me.
> For physical NICs adjusting TX coalescing will affect the BQL as this
> affect the TX completion of the transmitted packets. For veth, it is the
> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
> is where this patch adds netdev_tx_completed_queue calls for BQL.
> Any opinions on the "TX" or "RX" color?
I think I would prefer to configure it on the tx dev, and the recv side
gets the value from the peer. Maybe something like this.
@@ -997,11 +998,12 @@ static int veth_xdp_rcv(struct veth_rq *rq, int
budget,
struct veth_bql_state *state = &rq->bql_state;
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
- struct veth_priv *priv;
- u64 bql_flush_ns;
+ u64 bql_flush_ns = 0;
- priv = netdev_priv(rq->dev);
- bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
+ if (peer_txq) {
+ struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
+ bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
+ }
/* Clamp stored timestamp in case we migrated to a CPU with a
behind
* sched_clock(); prevents the deadline from never firing.
- Jonas
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-08 13:13 ` Jonas Köppeler
@ 2026-06-08 14:21 ` Simon Schippers
2026-06-09 13:59 ` Jesper Dangaard Brouer
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-08 14:21 UTC (permalink / raw)
To: Jonas Köppeler, Jesper Dangaard Brouer, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf
On 6/8/26 15:13, Jonas Köppeler wrote:
>
> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>
>>
>> On 08/06/2026 12.38, Simon Schippers wrote:
>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>
>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>
>>>> Accumulate BQL completions and flush them when a configurable time
>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>> queuing delay to the configured interval. Coalescing state persists
>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>> beyond a single budget=64 cycle.
>>>>
>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>> completion.
>>>>
>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>
>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>> ---
>>>
>>> I found the issue that n_bql may become infinitly large if producer
>>> and consumer have the same speed (and tx_usecs is large). It could
>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>> Also I figured that no hardware BQL driver ever completes more than
>>> BQL limit many elements.
>>>
>>> Therefore, I propose a simpler logic (see attachment) that completes
>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>> If n_bql > dql.limit then we either have the case above that the
>>> producer is as fast as the consumer or we have BQL starvation.
>>>
>>> if (state->time + bql_flush_ns <= current_time ||
>>> state->n_bql > peer_txq->dql.limit) {
>>>
>>> It must be n_bql *bigger than* dql.limit because the producer will
>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>> completion path, see dynamic_queue_limits.h.
>>>
>>> Another advantage is that we avoid the snippet checking for empty
>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>
>>> Apart from that I:
>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>> more accurate completion intervals when having mixed XDP and
>>> non-XDP packets.
>>> - Made it so tx_usecs = 0 is now also a normal case.
>>> - Change the type of n_bql to uint instead of int.
>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>> of napi_enable() to avoid a race (Sashiko).
>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>
>>> Benchmarks look just fine, see commit message.
>>>
>>> WDYT?
>
>>
>> Looks good to me, I will use this in my V7 patchset.
>>
>> A bike-shedding issue: We change the coalescing parameters for the veth
>> net_device, but should this be a TX or RX parameter?
>>
>
> Hi,
>
> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
Luckily I saved the raw test result from /tmp (the raw ping output
is included).
extract_ping_p99() seems to be broken. It has issues ordering the
numbers. So if there are 2 or 3 numbers after the point I think.
avg ping is fine.
Claude Sonnet wrote a script for me to recalculate the p99 pings
from the raw ping output).
Below is the new result. Not 100% sure but this seems right now.
Most pings are the same.
Ping RTT ms (p99)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>
> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
Ok, we could swap for something like
if (veth_ptr_is_bql(ptr))
state->n_bql+;
>
>> For physical NICs adjusting TX coalescing will affect the BQL as this
>> affect the TX completion of the transmitted packets. For veth, it is the
>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>> Any opinions on the "TX" or "RX" color?
Personally I would also say TX.
> I think I would prefer to configure it on the tx dev, and the recv side gets the value from the peer. Maybe something like this.
>
> @@ -997,11 +998,12 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct veth_bql_state *state = &rq->bql_state;
> int i, done = 0, n_xdpf = 0;
> void *xdpf[VETH_XDP_BATCH];
Why this deletion?
> - struct veth_priv *priv;
> - u64 bql_flush_ns;
> + u64 bql_flush_ns = 0;
>
> - priv = netdev_priv(rq->dev);
> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
I see we should check if peer_txq exists.
> + if (peer_txq) {
> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
> + }
>
> /* Clamp stored timestamp in case we migrated to a CPU with a behind
> * sched_clock(); prevents the deadline from never firing.
>
> - Jonas
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-08 14:21 ` Simon Schippers
@ 2026-06-09 13:59 ` Jesper Dangaard Brouer
2026-06-09 15:08 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Jesper Dangaard Brouer @ 2026-06-09 13:59 UTC (permalink / raw)
To: Simon Schippers, Jonas Köppeler, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf, kernel-team
On 08/06/2026 16.21, Simon Schippers wrote:
> On 6/8/26 15:13, Jonas Köppeler wrote:
>>
>> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 08/06/2026 12.38, Simon Schippers wrote:
>>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>
>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>
>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>> beyond a single budget=64 cycle.
>>>>>
>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>> completion.
>>>>>
>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>
>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>> ---
>>>>
>>>> I found the issue that n_bql may become infinitly large if producer
>>>> and consumer have the same speed (and tx_usecs is large). It could
>>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>>> Also I figured that no hardware BQL driver ever completes more than
>>>> BQL limit many elements.
>>>>
>>>> Therefore, I propose a simpler logic (see attachment) that completes
>>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>>> If n_bql > dql.limit then we either have the case above that the
>>>> producer is as fast as the consumer or we have BQL starvation.
>>>>
>>>> if (state->time + bql_flush_ns <= current_time ||
>>>> state->n_bql > peer_txq->dql.limit) {
>>>>
>>>> It must be n_bql *bigger than* dql.limit because the producer will
>>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>>> completion path, see dynamic_queue_limits.h.
>>>>
>>>> Another advantage is that we avoid the snippet checking for empty
>>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>>
>>>> Apart from that I:
>>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>>> more accurate completion intervals when having mixed XDP and
>>>> non-XDP packets.
>>>> - Made it so tx_usecs = 0 is now also a normal case.
>>>> - Change the type of n_bql to uint instead of int.
>>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>>> of napi_enable() to avoid a race (Sashiko).
>>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>>
>>>> Benchmarks look just fine, see commit message.
>>>>
>>>> WDYT?
>>
>>>
>>> Looks good to me, I will use this in my V7 patchset.
>>>
>>> A bike-shedding issue: We change the coalescing parameters for the veth
>>> net_device, but should this be a TX or RX parameter?
>>>
>>
>> Hi,
>>
>> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
>
> Luckily I saved the raw test result from /tmp (the raw ping output
> is included).
>
> extract_ping_p99() seems to be broken. It has issues ordering the
> numbers. So if there are 2 or 3 numbers after the point I think.
> avg ping is fine.
>
> Claude Sonnet wrote a script for me to recalculate the p99 pings
> from the raw ping output).
For programming switch to Claude Opus, as Sonnet cannot code IMHO.
(Claude Opus wrote the selftest we placed in [1])
Is your script still based on our github[1] selftest ?
If so, then please make some PR(s) against my repo.
I want this to be easy reproducible for others.
[1] https://github.com/netoptimizer/veth-backpressure-performance-testing
> Below is the new result. Not 100% sure but this seems right now.
> Most pings are the same.
>
> Ping RTT ms (p99)
> ===========================================================================
> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
> -------+-------+-------+-------+-------+--------+--------+---------++------
> 0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
> 1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
> 10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
> 100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
> 1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
> 10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>
>>
>> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
>
> Ok, we could swap for something like
>
> if (veth_ptr_is_bql(ptr))
> state->n_bql+;
I'll see if I can adjust the V7 patch to this feedback.
>>
>>> For physical NICs adjusting TX coalescing will affect the BQL as this
>>> affect the TX completion of the transmitted packets. For veth, it is the
>>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>>> Any opinions on the "TX" or "RX" color?
>
> Personally I would also say TX.
>
Okay, we seem to have more votes for TX :-)
>> I think I would prefer to configure it on the tx dev, and the recv
>> side gets the value from the peer. Maybe something like this.
>>
I'm considering if we should simplify the config by having veth pairs
have the same tx-coal value. WDYT?
>> @@ -997,11 +998,12 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>> struct veth_bql_state *state = &rq->bql_state;
>> int i, done = 0, n_xdpf = 0;
>> void *xdpf[VETH_XDP_BATCH];
>
> Why this deletion?
>
>> - struct veth_priv *priv;
>> - u64 bql_flush_ns;
>> + u64 bql_flush_ns = 0;
>>
>> - priv = netdev_priv(rq->dev);
>> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
>
> I see we should check if peer_txq exists.
>
>> + if (peer_txq) {
>> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
>> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
>> + }
>>
>> /* Clamp stored timestamp in case we migrated to a CPU with a behind
>> * sched_clock(); prevents the deadline from never firing.
>>
>> - Jonas
--Jesper
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-09 13:59 ` Jesper Dangaard Brouer
@ 2026-06-09 15:08 ` Simon Schippers
2026-06-10 7:04 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-09 15:08 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Jonas Köppeler, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf, kernel-team
On 6/9/26 15:59, Jesper Dangaard Brouer wrote:
>
>
> On 08/06/2026 16.21, Simon Schippers wrote:
>> On 6/8/26 15:13, Jonas Köppeler wrote:
>>>
>>> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>>>
>>>>
>>>> On 08/06/2026 12.38, Simon Schippers wrote:
>>>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>
>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>
>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>> beyond a single budget=64 cycle.
>>>>>>
>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>> completion.
>>>>>>
>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>
>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>> ---
>>>>>
>>>>> I found the issue that n_bql may become infinitly large if producer
>>>>> and consumer have the same speed (and tx_usecs is large). It could
>>>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>>>> Also I figured that no hardware BQL driver ever completes more than
>>>>> BQL limit many elements.
>>>>>
>>>>> Therefore, I propose a simpler logic (see attachment) that completes
>>>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>>>> If n_bql > dql.limit then we either have the case above that the
>>>>> producer is as fast as the consumer or we have BQL starvation.
>>>>>
>>>>> if (state->time + bql_flush_ns <= current_time ||
>>>>> state->n_bql > peer_txq->dql.limit) {
>>>>>
>>>>> It must be n_bql *bigger than* dql.limit because the producer will
>>>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>>>> completion path, see dynamic_queue_limits.h.
>>>>>
>>>>> Another advantage is that we avoid the snippet checking for empty
>>>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>>>
>>>>> Apart from that I:
>>>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>>>> more accurate completion intervals when having mixed XDP and
>>>>> non-XDP packets.
>>>>> - Made it so tx_usecs = 0 is now also a normal case.
>>>>> - Change the type of n_bql to uint instead of int.
>>>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>>>> of napi_enable() to avoid a race (Sashiko).
>>>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>>>
>>>>> Benchmarks look just fine, see commit message.
>>>>>
>>>>> WDYT?
>>>
>>>>
>>>> Looks good to me, I will use this in my V7 patchset.
>>>>
>>>> A bike-shedding issue: We change the coalescing parameters for the veth
>>>> net_device, but should this be a TX or RX parameter?
>>>>
>>>
>>> Hi,
>>>
>>> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
>>
>> Luckily I saved the raw test result from /tmp (the raw ping output
>> is included).
>>
>> extract_ping_p99() seems to be broken. It has issues ordering the
>> numbers. So if there are 2 or 3 numbers after the point I think.
>> avg ping is fine.
>>
>> Claude Sonnet wrote a script for me to recalculate the p99 pings
>> from the raw ping output).
>
> For programming switch to Claude Opus, as Sonnet cannot code IMHO.
Already used 95% of my Copilot Pro+ Tokens this month :P
> (Claude Opus wrote the selftest we placed in [1])
>
> Is your script still based on our github[1] selftest ?
> If so, then please make some PR(s) against my repo.
> I want this to be easy reproducible for others.
>
> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing
>
I forked Jonas latest branch [1] and ran the tests with that.
But I faced 2 issues that I then fixed:
- Ping fails in some runs (~20%) -> My fix: Retry the run until it
works.
- extract_ping_p99() has a bug for me which caused wrong results,
as pointed out by Jonas -> Also fixed it
Because of the inconsistencies I will rerun over night to have
clean benchmark results.
I will make a PR tomorrow.
[1] https://github.com/jkoeppeler/veth-backpressure-performance-testing/tree/pktgen-and-benchmark
>> Below is the new result. Not 100% sure but this seems right now.
>> Most pings are the same.
>>
>> Ping RTT ms (p99)
>> ===========================================================================
>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>> -------+-------+-------+-------+-------+--------+--------+---------++------
>> 0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
>> 1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
>> 10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
>> 100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
>> 1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
>> 10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>>
>>>
>>> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
>>
>> Ok, we could swap for something like
>>
>> if (veth_ptr_is_bql(ptr))
>> state->n_bql+;
>
> I'll see if I can adjust the V7 patch to this feedback.
Yes, and also this snippet pointed out by Jonas below:
- struct veth_priv *priv;
- u64 bql_flush_ns;
+ u64 bql_flush_ns = 0;
- priv = netdev_priv(rq->dev);
- bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
+ if (peer_txq) {
+ struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
+ bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
+ }
Or is peer_txq != 0 guaranteed in veth_xdp_rcv()?
And also I would change in the commit message:
"Flushing when n_bql exceeds dql.limit handles two cases:
- BQL starvation
- The steady-state case where the producer and consumer run at the
same speed with a large tx-usecs, which would otherwise allow n_bql
to grow without bound (and potentially overflow int)."
... to just...
"Flushing when n_bql exceeds dql.limit handles BQL starvation."
... because after rethinking I think I overstated here.
n_bql should also not grow infinitely for the v6 version.
Nevertheless the new method should be simpler and faster.
>
>
>>>
>>>> For physical NICs adjusting TX coalescing will affect the BQL as this
>>>> affect the TX completion of the transmitted packets. For veth, it is the
>>>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>>>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>>>> Any opinions on the "TX" or "RX" color?
>>
>> Personally I would also say TX.
>>
>
> Okay, we seem to have more votes for TX :-)
>
>
>>> I think I would prefer to configure it on the tx dev, and the recv
>>> side gets the value from the peer. Maybe something like this.
>>>
>
> I'm considering if we should simplify the config by having veth pairs have the same tx-coal value. WDYT?
I can not think of a case where a user would like to have a different
tx-coal value on either side..
So it's fine for me.
>
>
>>> @@ -997,11 +998,12 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>>> struct veth_bql_state *state = &rq->bql_state;
>>> int i, done = 0, n_xdpf = 0;
>>> void *xdpf[VETH_XDP_BATCH];
>>
>> Why this deletion?
>>
>>> - struct veth_priv *priv;
>>> - u64 bql_flush_ns;
>>> + u64 bql_flush_ns = 0;
>>>
>>> - priv = netdev_priv(rq->dev);
>>> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
>>
>> I see we should check if peer_txq exists.
>>
>>> + if (peer_txq) {
>>> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
>>> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
>>> + }
>>>
>>> /* Clamp stored timestamp in case we migrated to a CPU with a behind
>>> * sched_clock(); prevents the deadline from never firing.
>>>
>>> - Jonas
>
> --Jesper
>
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-09 15:08 ` Simon Schippers
@ 2026-06-10 7:04 ` Simon Schippers
2026-06-10 10:15 ` Jesper Dangaard Brouer
0 siblings, 1 reply; 25+ messages in thread
From: Simon Schippers @ 2026-06-10 7:04 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Jonas Köppeler, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf, kernel-team
On 6/9/26 17:08, Simon Schippers wrote:
> On 6/9/26 15:59, Jesper Dangaard Brouer wrote:
>>
>>
>> On 08/06/2026 16.21, Simon Schippers wrote:
>>> On 6/8/26 15:13, Jonas Köppeler wrote:
>>>>
>>>> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>>>>
>>>>>
>>>>> On 08/06/2026 12.38, Simon Schippers wrote:
>>>>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>
>>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>>
>>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>>> beyond a single budget=64 cycle.
>>>>>>>
>>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>>> completion.
>>>>>>>
>>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>>
>>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>> ---
>>>>>>
>>>>>> I found the issue that n_bql may become infinitly large if producer
>>>>>> and consumer have the same speed (and tx_usecs is large). It could
>>>>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>>>>> Also I figured that no hardware BQL driver ever completes more than
>>>>>> BQL limit many elements.
>>>>>>
>>>>>> Therefore, I propose a simpler logic (see attachment) that completes
>>>>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>>>>> If n_bql > dql.limit then we either have the case above that the
>>>>>> producer is as fast as the consumer or we have BQL starvation.
>>>>>>
>>>>>> if (state->time + bql_flush_ns <= current_time ||
>>>>>> state->n_bql > peer_txq->dql.limit) {
>>>>>>
>>>>>> It must be n_bql *bigger than* dql.limit because the producer will
>>>>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>>>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>>>>> completion path, see dynamic_queue_limits.h.
>>>>>>
>>>>>> Another advantage is that we avoid the snippet checking for empty
>>>>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>>>>
>>>>>> Apart from that I:
>>>>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>>>>> more accurate completion intervals when having mixed XDP and
>>>>>> non-XDP packets.
>>>>>> - Made it so tx_usecs = 0 is now also a normal case.
>>>>>> - Change the type of n_bql to uint instead of int.
>>>>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>>>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>>>>> of napi_enable() to avoid a race (Sashiko).
>>>>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>>>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>>>>
>>>>>> Benchmarks look just fine, see commit message.
>>>>>>
>>>>>> WDYT?
>>>>
>>>>>
>>>>> Looks good to me, I will use this in my V7 patchset.
>>>>>
>>>>> A bike-shedding issue: We change the coalescing parameters for the veth
>>>>> net_device, but should this be a TX or RX parameter?
>>>>>
>>>>
>>>> Hi,
>>>>
>>>> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
>>>
>>> Luckily I saved the raw test result from /tmp (the raw ping output
>>> is included).
>>>
>>> extract_ping_p99() seems to be broken. It has issues ordering the
>>> numbers. So if there are 2 or 3 numbers after the point I think.
>>> avg ping is fine.
>>>
>>> Claude Sonnet wrote a script for me to recalculate the p99 pings
>>> from the raw ping output).
>>
>> For programming switch to Claude Opus, as Sonnet cannot code IMHO.
>
> Already used 95% of my Copilot Pro+ Tokens this month :P
>
>> (Claude Opus wrote the selftest we placed in [1])
>>
>> Is your script still based on our github[1] selftest ?
>> If so, then please make some PR(s) against my repo.
>> I want this to be easy reproducible for others.
>>
>> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing
>>
>
> I forked Jonas latest branch [1] and ran the tests with that.
>
> But I faced 2 issues that I then fixed:
> - Ping fails in some runs (~20%) -> My fix: Retry the run until it
> works.
> - extract_ping_p99() has a bug for me which caused wrong results,
> as pointed out by Jonas -> Also fixed it
>
> Because of the inconsistencies I will rerun over night to have
> clean benchmark results.
>
> I will make a PR tomorrow.
>
> [1] https://github.com/jkoeppeler/veth-backpressure-performance-testing/tree/pktgen-and-benchmark
>
I ran the benchmarks over night and they look fine,
see ASCII table below.
I do not see a regression, tx_usecs of 50us even outperforms stock.
I did a PR, see [1]. Most of the code changes are by Jonas,
because he rewrote the code to allow the usage of pktgen and
added the bql_sweep.sh script. Thanks Jonas!
My part is the 2 bugfixes, the output as ASCII table and
the new benchmark results with my new 'v7' method.
I have not tested bql_sweep.sh inside virtme-ng, I ran my benchmarks
on the host system. I guess you could mount the folder with read+write
in vng and then run bql_sweep.sh there.
[1] https://github.com/netoptimizer/veth-backpressure-performance-testing/pull/9
System information:
Ryzen 5 5600X @ fixed 4.3 GHz, 3200 MHz RAM, CPU mitigations disabled,
SMT disabled, on host system (NOT virtme-ng).
Patched benchmark command:
sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list \
"0 50 100 500 1000 5000 10000" --nrules-list "0 1 10 100 1000 10000" \
--pktgen --duration 20 --qdisc fq_codel --no-bpftrace
Stock benchmark command:
sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list "0" --nrules-list \
"0 1 10 100 1000 10000" --pktgen --duration 20 --qdisc fq_codel \
--no-bpftrace
Throughput (pps)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 1.62M | 1.89M | 1.75M | 1.73M | 1.73M | 1.73M | 1.73M || 1.76M
1 | 1.51M | 1.72M | 1.63M | 1.60M | 1.60M | 1.60M | 1.60M || 1.65M
10 | 1.33M | 1.52M | 1.47M | 1.41M | 1.41M | 1.41M | 1.41M || 1.45M
100 | 681K | 751K | 756K | 726K | 721K | 723K | 730K || 736K
1000 | 116K | 123K | 124K | 124K | 124K | 125K | 123K || 126K
10000 | 13K | 13K | 13K | 13K | 13K | 13K | 13K || 13K
Ping RTT ms (avg)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 0.016 | 0.089 | 0.136 | 0.137 | 0.138 | 0.134 | 0.135 || 0.132
1 | 0.018 | 0.097 | 0.144 | 0.146 | 0.146 | 0.150 | 0.147 || 0.142
10 | 0.019 | 0.093 | 0.156 | 0.164 | 0.164 | 0.164 | 0.167 || 0.158
100 | 0.029 | 0.104 | 0.180 | 0.315 | 0.312 | 0.314 | 0.311 || 0.307
1000 | 0.139 | 0.201 | 0.309 | 0.971 | 1.66 | 1.81 | 1.82 || 1.77
10000 | 1.12 | 1.74 | 1.72 | 1.83 | 2.90 | 9.14 | 15.9 || 17.2
Ping RTT ms (p99)
===========================================================================
nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
-------+-------+-------+-------+-------+--------+--------+---------++------
0 | 0.026 | 0.122 | 0.161 | 0.164 | 0.164 | 0.159 | 0.161 || 0.155
1 | 0.028 | 0.123 | 0.169 | 0.173 | 0.173 | 0.175 | 0.173 || 0.165
10 | 0.032 | 0.119 | 0.187 | 0.194 | 0.193 | 0.193 | 0.197 || 0.185
100 | 0.047 | 0.134 | 0.232 | 0.368 | 0.369 | 0.367 | 0.359 || 0.361
1000 | 0.232 | 0.303 | 0.409 | 1.27 | 2.13 | 2.10 | 2.13 || 2.10
10000 | 1.94 | 2.52 | 2.59 | 2.69 | 3.87 | 12.0 | 20.0 || 20.3
>>> Below is the new result. Not 100% sure but this seems right now.
>>> Most pings are the same.
>>>
>>> Ping RTT ms (p99)
>>> ===========================================================================
>>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>>> -------+-------+-------+-------+-------+--------+--------+---------++------
>>> 0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
>>> 1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
>>> 10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
>>> 100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
>>> 1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
>>> 10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>>>
>>>>
>>>> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
>>>
>>> Ok, we could swap for something like
>>>
>>> if (veth_ptr_is_bql(ptr))
>>> state->n_bql+;
>>
>> I'll see if I can adjust the V7 patch to this feedback.
>
> Yes, and also this snippet pointed out by Jonas below:
>
> - struct veth_priv *priv;
> - u64 bql_flush_ns;
> + u64 bql_flush_ns = 0;
>
> - priv = netdev_priv(rq->dev);
> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
> + if (peer_txq) {
> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
> + }
>
> Or is peer_txq != 0 guaranteed in veth_xdp_rcv()?
>
>
> And also I would change in the commit message:
>
> "Flushing when n_bql exceeds dql.limit handles two cases:
> - BQL starvation
> - The steady-state case where the producer and consumer run at the
> same speed with a large tx-usecs, which would otherwise allow n_bql
> to grow without bound (and potentially overflow int)."
>
> ... to just...
>
> "Flushing when n_bql exceeds dql.limit handles BQL starvation."
>
> ... because after rethinking I think I overstated here.
> n_bql should also not grow infinitely for the v6 version.
> Nevertheless the new method should be simpler and faster.
>
>>
>>
>>>>
>>>>> For physical NICs adjusting TX coalescing will affect the BQL as this
>>>>> affect the TX completion of the transmitted packets. For veth, it is the
>>>>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>>>>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>>>>> Any opinions on the "TX" or "RX" color?
>>>
>>> Personally I would also say TX.
>>>
>>
>> Okay, we seem to have more votes for TX :-)
>>
>>
>>>> I think I would prefer to configure it on the tx dev, and the recv
>>>> side gets the value from the peer. Maybe something like this.
>>>>
>>
>> I'm considering if we should simplify the config by having veth pairs have the same tx-coal value. WDYT?
>
> I can not think of a case where a user would like to have a different
> tx-coal value on either side..
> So it's fine for me.
>
>>
>>
>>>> @@ -997,11 +998,12 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>>>> struct veth_bql_state *state = &rq->bql_state;
>>>> int i, done = 0, n_xdpf = 0;
>>>> void *xdpf[VETH_XDP_BATCH];
>>>
>>> Why this deletion?
>>>
>>>> - struct veth_priv *priv;
>>>> - u64 bql_flush_ns;
>>>> + u64 bql_flush_ns = 0;
>>>>
>>>> - priv = netdev_priv(rq->dev);
>>>> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
>>>
>>> I see we should check if peer_txq exists.
>>>
>>>> + if (peer_txq) {
>>>> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
>>>> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
>>>> + }
>>>>
>>>> /* Clamp stored timestamp in case we migrated to a CPU with a behind
>>>> * sched_clock(); prevents the deadline from never firing.
>>>>
>>>> - Jonas
>>
>> --Jesper
>>
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-10 7:04 ` Simon Schippers
@ 2026-06-10 10:15 ` Jesper Dangaard Brouer
2026-06-10 12:00 ` Simon Schippers
0 siblings, 1 reply; 25+ messages in thread
From: Jesper Dangaard Brouer @ 2026-06-10 10:15 UTC (permalink / raw)
To: Simon Schippers, Jonas Köppeler, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf, kernel-team
On 10/06/2026 09.04, Simon Schippers wrote:
> On 6/9/26 17:08, Simon Schippers wrote:
>> On 6/9/26 15:59, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 08/06/2026 16.21, Simon Schippers wrote:
>>>> On 6/8/26 15:13, Jonas Köppeler wrote:
>>>>>
>>>>> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>>>>>
>>>>>>
>>>>>> On 08/06/2026 12.38, Simon Schippers wrote:
>>>>>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>
>>>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>>>
>>>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>>>> beyond a single budget=64 cycle.
>>>>>>>>
>>>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>>>> completion.
>>>>>>>>
>>>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>>>
>>>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>> ---
>>>>>>>
>>>>>>> I found the issue that n_bql may become infinitly large if producer
>>>>>>> and consumer have the same speed (and tx_usecs is large). It could
>>>>>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>>>>>> Also I figured that no hardware BQL driver ever completes more than
>>>>>>> BQL limit many elements.
>>>>>>>
>>>>>>> Therefore, I propose a simpler logic (see attachment) that completes
>>>>>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>>>>>> If n_bql > dql.limit then we either have the case above that the
>>>>>>> producer is as fast as the consumer or we have BQL starvation.
>>>>>>>
>>>>>>> if (state->time + bql_flush_ns <= current_time ||
>>>>>>> state->n_bql > peer_txq->dql.limit) {
>>>>>>>
>>>>>>> It must be n_bql *bigger than* dql.limit because the producer will
>>>>>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>>>>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>>>>>> completion path, see dynamic_queue_limits.h.
>>>>>>>
>>>>>>> Another advantage is that we avoid the snippet checking for empty
>>>>>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>>>>>
>>>>>>> Apart from that I:
>>>>>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>>>>>> more accurate completion intervals when having mixed XDP and
>>>>>>> non-XDP packets.
>>>>>>> - Made it so tx_usecs = 0 is now also a normal case.
>>>>>>> - Change the type of n_bql to uint instead of int.
>>>>>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>>>>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>>>>>> of napi_enable() to avoid a race (Sashiko).
>>>>>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>>>>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>>>>>
>>>>>>> Benchmarks look just fine, see commit message.
>>>>>>>
>>>>>>> WDYT?
>>>>>
>>>>>>
>>>>>> Looks good to me, I will use this in my V7 patchset.
>>>>>>
>>>>>> A bike-shedding issue: We change the coalescing parameters for the veth
>>>>>> net_device, but should this be a TX or RX parameter?
>>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
>>>>
>>>> Luckily I saved the raw test result from /tmp (the raw ping output
>>>> is included).
>>>>
>>>> extract_ping_p99() seems to be broken. It has issues ordering the
>>>> numbers. So if there are 2 or 3 numbers after the point I think.
>>>> avg ping is fine.
>>>>
>>>> Claude Sonnet wrote a script for me to recalculate the p99 pings
>>>> from the raw ping output).
>>>
>>> For programming switch to Claude Opus, as Sonnet cannot code IMHO.
>>
>> Already used 95% of my Copilot Pro+ Tokens this month :P
>>
LOL - I'm lucky to (still) have unlimited tokens.
>>> (Claude Opus wrote the selftest we placed in [1])
>>>
>>> Is your script still based on our github[1] selftest ?
>>> If so, then please make some PR(s) against my repo.
>>> I want this to be easy reproducible for others.
>>>
>>> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing
>>>
>>
>> I forked Jonas latest branch [1] and ran the tests with that.
>>
>> But I faced 2 issues that I then fixed:
>> - Ping fails in some runs (~20%) -> My fix: Retry the run until it
>> works.
>> - extract_ping_p99() has a bug for me which caused wrong results,
>> as pointed out by Jonas -> Also fixed it
>>
>> Because of the inconsistencies I will rerun over night to have
>> clean benchmark results.
>>
>> I will make a PR tomorrow.
>>
>> [1] https://github.com/jkoeppeler/veth-backpressure-performance-testing/tree/pktgen-and-benchmark
>>
>
> I ran the benchmarks over night and they look fine,
> see ASCII table below.
> I do not see a regression, tx_usecs of 50us even outperforms stock.
>
Very interesting that tx_usecs 50us is generally performing better.
It seems we should "sweep" the area between 0-100 usecs for finding best
default from the beginning. (And see cool colored graphs that Jonas
generated).
> I did a PR, see [1]. Most of the code changes are by Jonas,
> because he rewrote the code to allow the usage of pktgen and
> added the bql_sweep.sh script. Thanks Jonas!
> My part is the 2 bugfixes, the output as ASCII table and
> the new benchmark results with my new 'v7' method.
>
> I have not tested bql_sweep.sh inside virtme-ng, I ran my benchmarks
> on the host system. I guess you could mount the folder with read+write
> in vng and then run bql_sweep.sh there.
>
> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing/pull/9
>
I will take a look today.
- Will likely just merge after sanity checking as I want us iterate faster
>
> System information:
> Ryzen 5 5600X @ fixed 4.3 GHz, 3200 MHz RAM, CPU mitigations disabled,
> SMT disabled, on host system (NOT virtme-ng).
>
> Patched benchmark command:
> sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list \
> "0 50 100 500 1000 5000 10000" --nrules-list "0 1 10 100 1000 10000" \
> --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
>
> Stock benchmark command:
> sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list "0" --nrules-list \
> "0 1 10 100 1000 10000" --pktgen --duration 20 --qdisc fq_codel \
> --no-bpftrace
>
> Throughput (pps)
> ===========================================================================
> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
> -------+-------+-------+-------+-------+--------+--------+---------++------
> 0 | 1.62M | 1.89M | 1.75M | 1.73M | 1.73M | 1.73M | 1.73M || 1.76M
> 1 | 1.51M | 1.72M | 1.63M | 1.60M | 1.60M | 1.60M | 1.60M || 1.65M
> 10 | 1.33M | 1.52M | 1.47M | 1.41M | 1.41M | 1.41M | 1.41M || 1.45M
> 100 | 681K | 751K | 756K | 726K | 721K | 723K | 730K || 736K
> 1000 | 116K | 123K | 124K | 124K | 124K | 125K | 123K || 126K
> 10000 | 13K | 13K | 13K | 13K | 13K | 13K | 13K || 13K
>
>
> Ping RTT ms (avg)
> ===========================================================================
> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
> -------+-------+-------+-------+-------+--------+--------+---------++------
> 0 | 0.016 | 0.089 | 0.136 | 0.137 | 0.138 | 0.134 | 0.135 || 0.132
> 1 | 0.018 | 0.097 | 0.144 | 0.146 | 0.146 | 0.150 | 0.147 || 0.142
> 10 | 0.019 | 0.093 | 0.156 | 0.164 | 0.164 | 0.164 | 0.167 || 0.158
> 100 | 0.029 | 0.104 | 0.180 | 0.315 | 0.312 | 0.314 | 0.311 || 0.307
> 1000 | 0.139 | 0.201 | 0.309 | 0.971 | 1.66 | 1.81 | 1.82 || 1.77
> 10000 | 1.12 | 1.74 | 1.72 | 1.83 | 2.90 | 9.14 | 15.9 || 17.2
>
>
> Ping RTT ms (p99)
> ===========================================================================
> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
> -------+-------+-------+-------+-------+--------+--------+---------++------
> 0 | 0.026 | 0.122 | 0.161 | 0.164 | 0.164 | 0.159 | 0.161 || 0.155
> 1 | 0.028 | 0.123 | 0.169 | 0.173 | 0.173 | 0.175 | 0.173 || 0.165
> 10 | 0.032 | 0.119 | 0.187 | 0.194 | 0.193 | 0.193 | 0.197 || 0.185
> 100 | 0.047 | 0.134 | 0.232 | 0.368 | 0.369 | 0.367 | 0.359 || 0.361
> 1000 | 0.232 | 0.303 | 0.409 | 1.27 | 2.13 | 2.10 | 2.13 || 2.10
> 10000 | 1.94 | 2.52 | 2.59 | 2.69 | 3.87 | 12.0 | 20.0 || 20.3
>
Looks like 50us delivers better (lower) latency and higher throughput
across the range.
I'm strongly considering changing V7 to use 50usec as default.
>
>>>> Below is the new result. Not 100% sure but this seems right now.
>>>> Most pings are the same.
>>>>
>>>> Ping RTT ms (p99)
>>>> ===========================================================================
>>>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>>>> -------+-------+-------+-------+-------+--------+--------+---------++------
>>>> 0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
>>>> 1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
>>>> 10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
>>>> 100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
>>>> 1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
>>>> 10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>>>>
>>>>>
>>>>> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
>>>>
>>>> Ok, we could swap for something like
>>>>
>>>> if (veth_ptr_is_bql(ptr))
>>>> state->n_bql+;
>>>
>>> I'll see if I can adjust the V7 patch to this feedback.
>>
>> Yes, and also this snippet pointed out by Jonas below:
>>
>> - struct veth_priv *priv;
>> - u64 bql_flush_ns;
>> + u64 bql_flush_ns = 0;
>>
>> - priv = netdev_priv(rq->dev);
>> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
>> + if (peer_txq) {
>> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
>> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
>> + }
>>
>> Or is peer_txq != 0 guaranteed in veth_xdp_rcv()?
>>
>>
>> And also I would change in the commit message:
>>
>> "Flushing when n_bql exceeds dql.limit handles two cases:
>> - BQL starvation
>> - The steady-state case where the producer and consumer run at the
>> same speed with a large tx-usecs, which would otherwise allow n_bql
>> to grow without bound (and potentially overflow int)."
>>
>> ... to just...
>>
>> "Flushing when n_bql exceeds dql.limit handles BQL starvation."
>>
>> ... because after rethinking I think I overstated here.
>> n_bql should also not grow infinitely for the v6 version.
>> Nevertheless the new method should be simpler and faster.
Ok, adjusted this for V7 patch.
>>>
>>>
>>>>>
>>>>>> For physical NICs adjusting TX coalescing will affect the BQL as this
>>>>>> affect the TX completion of the transmitted packets. For veth, it is the
>>>>>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>>>>>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>>>>>> Any opinions on the "TX" or "RX" color?
>>>>
>>>> Personally I would also say TX.
>>>>
>>>
>>> Okay, we seem to have more votes for TX :-)
>>>
>>>
>>>>> I think I would prefer to configure it on the tx dev, and the recv
>>>>> side gets the value from the peer. Maybe something like this.
>>>>>
>>>
>>> I'm considering if we should simplify the config by having veth pairs have the same tx-coal value. WDYT?
>>
>> I can not think of a case where a user would like to have a different
>> tx-coal value on either side..
>> So it's fine for me.
Okay, agreed - switching V7 to have mirrored tx-coal value.
--Jesper
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH net-next v6 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
2026-06-10 10:15 ` Jesper Dangaard Brouer
@ 2026-06-10 12:00 ` Simon Schippers
0 siblings, 0 replies; 25+ messages in thread
From: Simon Schippers @ 2026-06-10 12:00 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Jonas Köppeler, netdev
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Stanislav Fomichev, linux-kernel, bpf, kernel-team
On 6/10/26 12:15, Jesper Dangaard Brouer wrote:
>
>
>
> On 10/06/2026 09.04, Simon Schippers wrote:
>> On 6/9/26 17:08, Simon Schippers wrote:
>>> On 6/9/26 15:59, Jesper Dangaard Brouer wrote:
>>>>
>>>>
>>>> On 08/06/2026 16.21, Simon Schippers wrote:
>>>>> On 6/8/26 15:13, Jonas Köppeler wrote:
>>>>>>
>>>>>> On 6/8/26 3:04 PM, Jesper Dangaard Brouer wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 08/06/2026 12.38, Simon Schippers wrote:
>>>>>>>> On 5/27/26 15:54, hawk@kernel.org wrote:
>>>>>>>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>>
>>>>>>>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>>>>>>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>>>>>>>
>>>>>>>>> Accumulate BQL completions and flush them when a configurable time
>>>>>>>>> threshold is exceeded, letting DQL discover a limit that bounds actual
>>>>>>>>> queuing delay to the configured interval. Coalescing state persists
>>>>>>>>> across NAPI polls in struct veth_rq so completions can accumulate
>>>>>>>>> beyond a single budget=64 cycle.
>>>>>>>>>
>>>>>>>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>>>>>>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>>>>>>>> completion.
>>>>>>>>>
>>>>>>>>> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
>>>>>>>>> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>>>>>>>>>
>>>>>>>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>> ---
>>>>>>>>
>>>>>>>> I found the issue that n_bql may become infinitly large if producer
>>>>>>>> and consumer have the same speed (and tx_usecs is large). It could
>>>>>>>> cause a potential BUG_ON if n_bql grows beyond INT_MAX...
>>>>>>>> Also I figured that no hardware BQL driver ever completes more than
>>>>>>>> BQL limit many elements.
>>>>>>>>
>>>>>>>> Therefore, I propose a simpler logic (see attachment) that completes
>>>>>>>> either on the usual bql_flush_ns or if n_bql > dql.limit.
>>>>>>>> If n_bql > dql.limit then we either have the case above that the
>>>>>>>> producer is as fast as the consumer or we have BQL starvation.
>>>>>>>>
>>>>>>>> if (state->time + bql_flush_ns <= current_time ||
>>>>>>>> state->n_bql > peer_txq->dql.limit) {
>>>>>>>>
>>>>>>>> It must be n_bql *bigger than* dql.limit because the producer will
>>>>>>>> always exceed the limit before it stops, see netdev_tx_sent_queue().
>>>>>>>> It is fast because peer_txq->dql.limit is in the cacheline of the
>>>>>>>> completion path, see dynamic_queue_limits.h.
>>>>>>>>
>>>>>>>> Another advantage is that we avoid the snippet checking for empty
>>>>>>>> and BQL stopped which requires an smp_rmb() and an test_bit().
>>>>>>>>
>>>>>>>> Apart from that I:
>>>>>>>> - Always call veth_bql_maybe_complete() in the for loop to have
>>>>>>>> more accurate completion intervals when having mixed XDP and
>>>>>>>> non-XDP packets.
>>>>>>>> - Made it so tx_usecs = 0 is now also a normal case.
>>>>>>>> - Change the type of n_bql to uint instead of int.
>>>>>>>> - Added _ONCE() for tx_coalesce_usecs as suggested by Paolo.
>>>>>>>> - Moved the bql_state init in __veth_napi_enable_range() in front
>>>>>>>> of napi_enable() to avoid a race (Sashiko).
>>>>>>>> - Moved the bql_state reset in veth_napi_del_range() after the
>>>>>>>> ptr_ring_cleanup() (probably does not matter but makes sense to me)
>>>>>>>>
>>>>>>>> Benchmarks look just fine, see commit message.
>>>>>>>>
>>>>>>>> WDYT?
>>>>>>
>>>>>>>
>>>>>>> Looks good to me, I will use this in my V7 patchset.
>>>>>>>
>>>>>>> A bike-shedding issue: We change the coalescing parameters for the veth
>>>>>>> net_device, but should this be a TX or RX parameter?
>>>>>>>
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> The results look a little bit suspicious, that in some cases the p99 is smaller than the average, which can happen if you have really large max values. It feels something is off with the methodology. I think we should drop the avg, and include the max value. This would give a better picture.
>>>>>
>>>>> Luckily I saved the raw test result from /tmp (the raw ping output
>>>>> is included).
>>>>>
>>>>> extract_ping_p99() seems to be broken. It has issues ordering the
>>>>> numbers. So if there are 2 or 3 numbers after the point I think.
>>>>> avg ping is fine.
>>>>>
>>>>> Claude Sonnet wrote a script for me to recalculate the p99 pings
>>>>> from the raw ping output).
>>>>
>>>> For programming switch to Claude Opus, as Sonnet cannot code IMHO.
>>>
>>> Already used 95% of my Copilot Pro+ Tokens this month :P
>>>
>
> LOL - I'm lucky to (still) have unlimited tokens.
>
>>>> (Claude Opus wrote the selftest we placed in [1])
>>>>
>>>> Is your script still based on our github[1] selftest ?
>>>> If so, then please make some PR(s) against my repo.
>>>> I want this to be easy reproducible for others.
>>>>
>>>> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing
>>>>
>>>
>>> I forked Jonas latest branch [1] and ran the tests with that.
>>>
>>> But I faced 2 issues that I then fixed:
>>> - Ping fails in some runs (~20%) -> My fix: Retry the run until it
>>> works.
>>> - extract_ping_p99() has a bug for me which caused wrong results,
>>> as pointed out by Jonas -> Also fixed it
>>>
>>> Because of the inconsistencies I will rerun over night to have
>>> clean benchmark results.
>>>
>>> I will make a PR tomorrow.
>>>
>>> [1] https://github.com/jkoeppeler/veth-backpressure-performance-testing/tree/pktgen-and-benchmark
>>>
>>
>> I ran the benchmarks over night and they look fine,
>> see ASCII table below.
>> I do not see a regression, tx_usecs of 50us even outperforms stock.
>>
>
> Very interesting that tx_usecs 50us is generally performing better.
> It seems we should "sweep" the area between 0-100 usecs for finding best
> default from the beginning. (And see cool colored graphs that Jonas
> generated).
>
For quick verification I ran 5 runs in the 10-90us range
for --nrules=0 and 20s per run. Here the results:
sudo ./veth_bql_sweep.sh --runs 5 --tx-usecs-list \
"0 10 20 30 40 50 60 70 80 90" --nrules-list "0" --pktgen \
--duration 20 --qdisc fq_codel --no-bpftrace
Throughput (pps)
================================================================================
nrules | 10us | 20us | 30us | 40us | 50us | 60us | 70us | 80us | 90us |
-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
0 | 1.75M | 1.76M | 1.76M | 1.76M | 1.76M | 1.76M | 1.77M | 1.77M | 1.76M |
Ping RTT ms (avg)
================================================================================
nrules | 10us | 20us | 30us | 40us | 50us | 60us | 70us | 80us | 90us |
-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
0 | 0.133 | 0.133 | 0.132 | 0.132 | 0.130 | 0.133 | 0.132 | 0.131 | 0.131 |
Ping RTT ms (p99)
================================================================================
nrules | 10us | 20us | 30us | 40us | 50us | 60us | 70us | 80us | 90us |
-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
0 | 0.158 | 0.156 | 0.155 | 0.159 | 0.156 | 0.156 | 0.156 | 0.153 | 0.153 |
Now 50us is just as fast as stock, even though I changed *nothing* :/
Nevertheless still great.
Even 10us seems valid. Maybe it just has to be as big as the time
for running a single veth_xdp_rcv() execution? Idk :D
>
>> I did a PR, see [1]. Most of the code changes are by Jonas,
>> because he rewrote the code to allow the usage of pktgen and
>> added the bql_sweep.sh script. Thanks Jonas!
>> My part is the 2 bugfixes, the output as ASCII table and
>> the new benchmark results with my new 'v7' method.
>>
>> I have not tested bql_sweep.sh inside virtme-ng, I ran my benchmarks
>> on the host system. I guess you could mount the folder with read+write
>> in vng and then run bql_sweep.sh there.
>>
>> [1] https://github.com/netoptimizer/veth-backpressure-performance-testing/pull/9
>>
>
> I will take a look today.
> - Will likely just merge after sanity checking as I want us iterate faster
>
>>
>> System information:
>> Ryzen 5 5600X @ fixed 4.3 GHz, 3200 MHz RAM, CPU mitigations disabled,
>> SMT disabled, on host system (NOT virtme-ng).
>>
>> Patched benchmark command:
>> sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list \
>> "0 50 100 500 1000 5000 10000" --nrules-list "0 1 10 100 1000 10000" \
>> --pktgen --duration 20 --qdisc fq_codel --no-bpftrace
>>
>> Stock benchmark command:
>> sudo ./veth_bql_sweep.sh --runs 10 --tx-usecs-list "0" --nrules-list \
>> "0 1 10 100 1000 10000" --pktgen --duration 20 --qdisc fq_codel \
>> --no-bpftrace
>>
>> Throughput (pps)
>> ===========================================================================
>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>> -------+-------+-------+-------+-------+--------+--------+---------++------
>> 0 | 1.62M | 1.89M | 1.75M | 1.73M | 1.73M | 1.73M | 1.73M || 1.76M
>> 1 | 1.51M | 1.72M | 1.63M | 1.60M | 1.60M | 1.60M | 1.60M || 1.65M
>> 10 | 1.33M | 1.52M | 1.47M | 1.41M | 1.41M | 1.41M | 1.41M || 1.45M
>> 100 | 681K | 751K | 756K | 726K | 721K | 723K | 730K || 736K
>> 1000 | 116K | 123K | 124K | 124K | 124K | 125K | 123K || 126K
>> 10000 | 13K | 13K | 13K | 13K | 13K | 13K | 13K || 13K
>>
>>
>> Ping RTT ms (avg)
>> ===========================================================================
>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>> -------+-------+-------+-------+-------+--------+--------+---------++------
>> 0 | 0.016 | 0.089 | 0.136 | 0.137 | 0.138 | 0.134 | 0.135 || 0.132
>> 1 | 0.018 | 0.097 | 0.144 | 0.146 | 0.146 | 0.150 | 0.147 || 0.142
>> 10 | 0.019 | 0.093 | 0.156 | 0.164 | 0.164 | 0.164 | 0.167 || 0.158
>> 100 | 0.029 | 0.104 | 0.180 | 0.315 | 0.312 | 0.314 | 0.311 || 0.307
>> 1000 | 0.139 | 0.201 | 0.309 | 0.971 | 1.66 | 1.81 | 1.82 || 1.77
>> 10000 | 1.12 | 1.74 | 1.72 | 1.83 | 2.90 | 9.14 | 15.9 || 17.2
>>
>>
>> Ping RTT ms (p99)
>> ===========================================================================
>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>> -------+-------+-------+-------+-------+--------+--------+---------++------
>> 0 | 0.026 | 0.122 | 0.161 | 0.164 | 0.164 | 0.159 | 0.161 || 0.155
>> 1 | 0.028 | 0.123 | 0.169 | 0.173 | 0.173 | 0.175 | 0.173 || 0.165
>> 10 | 0.032 | 0.119 | 0.187 | 0.194 | 0.193 | 0.193 | 0.197 || 0.185
>> 100 | 0.047 | 0.134 | 0.232 | 0.368 | 0.369 | 0.367 | 0.359 || 0.361
>> 1000 | 0.232 | 0.303 | 0.409 | 1.27 | 2.13 | 2.10 | 2.13 || 2.10
>> 10000 | 1.94 | 2.52 | 2.59 | 2.69 | 3.87 | 12.0 | 20.0 || 20.3
>>
>
> Looks like 50us delivers better (lower) latency and higher throughput across the range.
> I'm strongly considering changing V7 to use 50usec as default.
>
>>
>>>>> Below is the new result. Not 100% sure but this seems right now.
>>>>> Most pings are the same.
>>>>>
>>>>> Ping RTT ms (p99)
>>>>> ===========================================================================
>>>>> nrules | 0us | 50us | 100us | 500us | 1000us | 5000us | 10000us || stock
>>>>> -------+-------+-------+-------+-------+--------+--------+---------++------
>>>>> 0 | 0.028 | 0.115 | 0.159 | 0.161 | 0.163 | 0.161 | 0.163 || 0.154
>>>>> 1 | 0.027 | 0.123 | 0.169 | 0.172 | 0.169 | 0.173 | 0.172 || 0.169
>>>>> 10 | 0.030 | 0.117 | 0.189 | 0.193 | 0.195 | 0.192 | 0.196 || 0.186
>>>>> 100 | 0.045 | 0.134 | 0.233 | 0.368 | 0.365 | 0.369 | 0.361 || 0.358
>>>>> 1000 | 0.230 | 0.300 | 0.412 | 1.26 | 2.11 | 2.12 | 2.13 || 2.07
>>>>> 10000 | 2.04 | 2.55 | 2.54 | 2.72 | 3.77 | 12.1 | 20.1 || 20.3
>>>>>
>>>>>>
>>>>>> Is state->n_bql += veth_ptr_is_bql(ptr) something that is commonly done in the kernel? It relies on bool normalization which feels a bit implicit to me.
>>>>>
>>>>> Ok, we could swap for something like
>>>>>
>>>>> if (veth_ptr_is_bql(ptr))
>>>>> state->n_bql+;
>>>>
>>>> I'll see if I can adjust the V7 patch to this feedback.
>>>
>>> Yes, and also this snippet pointed out by Jonas below:
>>>
>>> - struct veth_priv *priv;
>>> - u64 bql_flush_ns;
>>> + u64 bql_flush_ns = 0;
>>>
>>> - priv = netdev_priv(rq->dev);
>>> - bql_flush_ns = (u64)priv->tx_coal_usecs * 1000;
>>> + if (peer_txq) {
>>> + struct veth_priv *peer_priv = netdev_priv(peer_txq->dev);
>>> + bql_flush_ns = (u64)READ_ONCE(peer_priv->tx_coal_usecs) * 1000;
>>> + }
>>>
>>> Or is peer_txq != 0 guaranteed in veth_xdp_rcv()?
>>>
>>>
>>> And also I would change in the commit message:
>>>
>>> "Flushing when n_bql exceeds dql.limit handles two cases:
>>> - BQL starvation
>>> - The steady-state case where the producer and consumer run at the
>>> same speed with a large tx-usecs, which would otherwise allow n_bql
>>> to grow without bound (and potentially overflow int)."
>>>
>>> ... to just...
>>>
>>> "Flushing when n_bql exceeds dql.limit handles BQL starvation."
>>>
>>> ... because after rethinking I think I overstated here.
>>> n_bql should also not grow infinitely for the v6 version.
>>> Nevertheless the new method should be simpler and faster.
>
> Ok, adjusted this for V7 patch.
>
>>>>
>>>>
>>>>>>
>>>>>>> For physical NICs adjusting TX coalescing will affect the BQL as this
>>>>>>> affect the TX completion of the transmitted packets. For veth, it is the
>>>>>>> veth-peer's RX NAPI that is "completing" or emptying the ptr_ring, which
>>>>>>> is where this patch adds netdev_tx_completed_queue calls for BQL.
>>>>>>> Any opinions on the "TX" or "RX" color?
>>>>>
>>>>> Personally I would also say TX.
>>>>>
>>>>
>>>> Okay, we seem to have more votes for TX :-)
>>>>
>>>>
>>>>>> I think I would prefer to configure it on the tx dev, and the recv
>>>>>> side gets the value from the peer. Maybe something like this.
>>>>>>
>>>>
>>>> I'm considering if we should simplify the config by having veth pairs have the same tx-coal value. WDYT?
>>>
>>> I can not think of a case where a user would like to have a different
>>> tx-coal value on either side..
>>> So it's fine for me.
>
> Okay, agreed - switching V7 to have mirrored tx-coal value.
>
> --Jesper
^ permalink raw reply [flat|nested] 25+ messages in thread