veth: time-based BQL completion coalescing via ethtool tx-usecs From: Jesper Dangaard Brouer Bufferbloat is fundamentally a latency problem -- what matters is the time packets spend waiting in queues, as perceived by users and applications. Base BQL completion coalescing on elapsed time rather than packet counts to directly control queuing delay. Add ethtool tx-usecs support to veth for tuning BQL completion coalescing. Instead of completing BQL per-packet (which forces DQL to limit=2 with high NAPI scheduling overhead) or per-NAPI-poll (which over-buffers at budget=64), accumulate completions and flush them when a configurable time threshold is exceeded. This lets DQL discover a limit that bounds the actual queuing delay to the configured interval. The default of 10 usecs (VETH_BQL_COAL_TX_USECS) provides a good balance: DQL converges to a small limit that keeps queuing delay bounded while allowing efficient NAPI batch processing. Setting tx-usecs to 0 disables coalescing and falls back to per-packet completion (limit=2, lowest latency, highest NAPI overhead). Usage: ethtool -C <dev> tx-usecs 500 # 500us coalescing ethtool -C <dev> tx-usecs 0 # per-packet (no coalescing) Uses local_clock() (rdtsc on x86, ~20ns) for sub-jiffy resolution. 
Signed-off-by: Jesper Dangaard Brouer --- drivers/net/veth.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 4103d298aa9b..6035f7ec92b4 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -43,6 +43,7 @@ #define VETH_XDP_TX_BULK_SIZE 16 #define VETH_XDP_BATCH 16 +#define VETH_BQL_COAL_TX_USECS 10 /* default tx-usecs for BQL batching */ struct veth_stats { u64 rx_drops; @@ -81,6 +82,7 @@ struct veth_priv { struct bpf_prog *_xdp_prog; struct veth_rq *rq; unsigned int requested_headroom; + unsigned int tx_coal_usecs; /* BQL completion coalescing */ }; struct veth_xdp_tx_bq { @@ -265,7 +267,30 @@ static void veth_get_channels(struct net_device *dev, static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch); +static int veth_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + + ec->tx_coalesce_usecs = priv->tx_coal_usecs; + return 0; +} + +static int veth_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + + priv->tx_coal_usecs = ec->tx_coalesce_usecs; + return 0; +} + static const struct ethtool_ops veth_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS, .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = veth_get_strings, @@ -275,6 +300,8 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, .set_channels = veth_set_channels, + .get_coalesce = veth_get_coalesce, + .set_coalesce = veth_set_coalesce, }; /* general routines */ @@ -942,8 +969,14 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct 
veth_stats *stats, struct netdev_queue *peer_txq) { + u64 bql_flush_ns, bql_flush_time = 0; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; + struct veth_priv *priv; + int n_bql = 0; + + priv = netdev_priv(rq->dev); + bql_flush_ns = (u64)priv->tx_coal_usecs * 1000; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); @@ -972,8 +1005,23 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct sk_buff *skb = veth_ptr_to_skb(ptr); stats->xdp_bytes += skb->len; - if (peer_txq && bql_charged) - netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT); + if (peer_txq && bql_charged) { + if (!bql_flush_ns) { + netdev_tx_completed_queue(peer_txq, 1, + VETH_BQL_UNIT); + } else { + u64 now = local_clock(); + n_bql++; + if (!bql_flush_time) { + bql_flush_time = now; + } else if (now - bql_flush_time >= bql_flush_ns) { + netdev_tx_completed_queue(peer_txq, n_bql, + n_bql * VETH_BQL_UNIT); + n_bql = 0; + bql_flush_time = 0; + } + } + } skb = veth_xdp_rcv_skb(rq, skb, bq, stats); if (skb) { @@ -989,6 +1037,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, if (n_xdpf) veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); + if (peer_txq && n_bql) + netdev_tx_completed_queue(peer_txq, n_bql, n_bql * VETH_BQL_UNIT); + u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; @@ -1813,6 +1864,8 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = { static void veth_setup(struct net_device *dev) { + struct veth_priv *priv = netdev_priv(dev); + ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; @@ -1838,6 +1891,8 @@ static void veth_setup(struct net_device *dev) dev->max_mtu = ETH_MAX_MTU; dev->watchdog_timeo = msecs_to_jiffies(16000); + priv->tx_coal_usecs = VETH_BQL_COAL_TX_USECS; + dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;