veth: time-based BQL completion coalescing via ethtool tx-usecs From: Jesper Dangaard Brouer Bufferbloat is fundamentally a latency problem -- what matters is the time packets spend waiting in queues, as perceived by users and applications. Base BQL completion coalescing on elapsed time rather than packet counts to directly control queuing delay. Add ethtool tx-usecs support to veth for tuning BQL completion coalescing. Instead of completing BQL per-packet (which forces DQL to limit=2 with high NAPI scheduling overhead) or per-NAPI-poll (which over-buffers at budget=64), accumulate completions and flush them when a configurable time threshold is exceeded. This lets DQL discover a limit that bounds the actual queuing delay to the configured interval. The default of 10 usecs (VETH_BQL_COAL_TX_USECS) provides a good balance: DQL converges to a small limit that keeps queuing delay bounded while allowing efficient NAPI batch processing. Setting tx-usecs to 0 disables coalescing and falls back to per-packet completion (limit=2, lowest latency, highest NAPI overhead). Usage: ethtool -C <dev> tx-usecs 500 # 500us coalescing ethtool -C <dev> tx-usecs 0 # per-packet (no coalescing) Uses local_clock() (rdtsc on x86, ~20ns) for sub-jiffy resolution. 
Signed-off-by: Jesper Dangaard Brouer --- drivers/net/veth.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 4103d298aa9b..6035f7ec92b4 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -43,6 +43,7 @@ #define VETH_XDP_TX_BULK_SIZE 16 #define VETH_XDP_BATCH 16 +#define VETH_BQL_COAL_TX_USECS 10 /* default tx-usecs for BQL batching */ struct veth_stats { u64 rx_drops; @@ -81,6 +82,7 @@ struct veth_priv { struct bpf_prog *_xdp_prog; struct veth_rq *rq; unsigned int requested_headroom; + unsigned int tx_coal_usecs; /* BQL completion coalescing */ }; struct veth_xdp_tx_bq { @@ -265,7 +267,30 @@ static void veth_get_channels(struct net_device *dev, static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch); +static int veth_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + + ec->tx_coalesce_usecs = priv->tx_coal_usecs; + return 0; +} + +static int veth_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + + priv->tx_coal_usecs = ec->tx_coalesce_usecs; + return 0; +} + static const struct ethtool_ops veth_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS, .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = veth_get_strings, @@ -275,6 +300,8 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, .set_channels = veth_set_channels, + .get_coalesce = veth_get_coalesce, + .set_coalesce = veth_set_coalesce, }; /* general routines */ @@ -942,8 +969,14 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct 
veth_stats *stats, struct netdev_queue *peer_txq) { + u64 bql_flush_ns, bql_flush_time = 0; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; + struct veth_priv *priv; + int n_bql = 0; + + priv = netdev_priv(rq->dev); + bql_flush_ns = (u64)priv->tx_coal_usecs * 1000; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); @@ -972,8 +1005,23 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct sk_buff *skb = veth_ptr_to_skb(ptr); stats->xdp_bytes += skb->len; - if (peer_txq && bql_charged) - netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT); + if (peer_txq && bql_charged) { + if (!bql_flush_ns) { + netdev_tx_completed_queue(peer_txq, 1, + VETH_BQL_UNIT); + } else { + u64 now = local_clock(); + n_bql++; + if (!bql_flush_time) { + bql_flush_time = now; + } else if (now - bql_flush_time >= bql_flush_ns) { + netdev_tx_completed_queue(peer_txq, n_bql, + n_bql * VETH_BQL_UNIT); + n_bql = 0; + bql_flush_time = 0; + } + } + } skb = veth_xdp_rcv_skb(rq, skb, bq, stats); if (skb) { @@ -989,6 +1037,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, if (n_xdpf) veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); + if (peer_txq && n_bql) + netdev_tx_completed_queue(peer_txq, n_bql, n_bql * VETH_BQL_UNIT); + u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; @@ -1813,6 +1864,8 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = { static void veth_setup(struct net_device *dev) { + struct veth_priv *priv = netdev_priv(dev); + ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; @@ -1838,6 +1891,8 @@ static void veth_setup(struct net_device *dev) dev->max_mtu = ETH_MAX_MTU; dev->watchdog_timeo = msecs_to_jiffies(16000); + priv->tx_coal_usecs = VETH_BQL_COAL_TX_USECS; + dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;