* [PATCH V2,net-next, 0/2] net: mana: Add support for coalesced RX packets
@ 2026-01-06 20:46 Haiyang Zhang
2026-01-06 20:46 ` [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE Haiyang Zhang
2026-01-06 20:46 ` [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type Haiyang Zhang
0 siblings, 2 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
To: linux-hyperv, netdev; +Cc: haiyangz, paulros
From: Haiyang Zhang <haiyangz@microsoft.com>
Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
update the RX code path and the ethtool handler. Also add counters for it.
Haiyang Zhang (2):
net: mana: Add support for coalesced RX packets on CQE
net: mana: Add ethtool counters for RX CQEs in coalesced type
drivers/net/ethernet/microsoft/mana/mana_en.c | 49 +++++++++----
.../ethernet/microsoft/mana/mana_ethtool.c | 72 +++++++++++++++++--
include/net/mana/mana.h | 12 ++--
3 files changed, 112 insertions(+), 21 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-06 20:46 [PATCH V2,net-next, 0/2] net: mana: Add support for coalesced RX packets Haiyang Zhang
@ 2026-01-06 20:46 ` Haiyang Zhang
2026-01-06 21:50 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
2026-01-06 20:46 ` [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type Haiyang Zhang
1 sibling, 2 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
linux-rdma
Cc: paulros
From: Haiyang Zhang <haiyangz@microsoft.com>
Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
check for and process CQEs of type CQE_RX_COALESCED_4. The feature is
disabled by default, to avoid a possible regression in latency.
Also add an ethtool handler to toggle this feature. To turn it on, run:
ethtool -C <nic> rx-frames 4
To turn it off:
ethtool -C <nic> rx-frames 1
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
V2:
Updated extack msg, as recommended by Jakub Kicinski and Simon Horman.
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 32 ++++++-----
.../ethernet/microsoft/mana/mana_ethtool.c | 55 +++++++++++++++++++
include/net/mana/mana.h | 2 +
3 files changed, 74 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 1ad154f9db1a..a46a1adf83bc 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1330,7 +1330,7 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
req->update_hashkey = update_key;
req->update_indir_tab = update_tab;
req->default_rxobj = apc->default_rxobj;
- req->cqe_coalescing_enable = 0;
+ req->cqe_coalescing_enable = apc->cqe_coalescing_enable;
if (update_key)
memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE);
@@ -1864,11 +1864,12 @@ static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
}
static void mana_rx_skb(void *buf_va, bool from_pool,
- struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq)
+ struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq,
+ int i)
{
struct mana_stats_rx *rx_stats = &rxq->stats;
struct net_device *ndev = rxq->ndev;
- uint pkt_len = cqe->ppi[0].pkt_len;
+ uint pkt_len = cqe->ppi[i].pkt_len;
u16 rxq_idx = rxq->rxq_idx;
struct napi_struct *napi;
struct xdp_buff xdp = {};
@@ -1912,7 +1913,7 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
}
if (cqe->rx_hashtype != 0 && (ndev->features & NETIF_F_RXHASH)) {
- hash_value = cqe->ppi[0].pkt_hash;
+ hash_value = cqe->ppi[i].pkt_hash;
if (cqe->rx_hashtype & MANA_HASH_L4)
skb_set_hash(skb, hash_value, PKT_HASH_TYPE_L4);
@@ -2047,9 +2048,11 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
struct mana_recv_buf_oob *rxbuf_oob;
struct mana_port_context *apc;
struct device *dev = gc->dev;
+ bool coalesced = false;
void *old_buf = NULL;
u32 curr, pktlen;
bool old_fp;
+ int i = 0;
apc = netdev_priv(ndev);
@@ -2064,9 +2067,8 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
goto drop;
case CQE_RX_COALESCED_4:
- netdev_err(ndev, "RX coalescing is unsupported\n");
- apc->eth_stats.rx_coalesced_err++;
- return;
+ coalesced = true;
+ break;
case CQE_RX_OBJECT_FENCE:
complete(&rxq->fence_event);
@@ -2079,14 +2081,10 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
return;
}
- pktlen = oob->ppi[0].pkt_len;
-
- if (pktlen == 0) {
- /* data packets should never have packetlength of zero */
- netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n",
- rxq->gdma_id, cq->gdma_id, rxq->rxobj);
+nextpkt:
+ pktlen = oob->ppi[i].pkt_len;
+ if (pktlen == 0)
return;
- }
curr = rxq->buf_index;
rxbuf_oob = &rxq->rx_oobs[curr];
@@ -2097,12 +2095,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
/* Unsuccessful refill will have old_buf == NULL.
* In this case, mana_rx_skb() will drop the packet.
*/
- mana_rx_skb(old_buf, old_fp, oob, rxq);
+ mana_rx_skb(old_buf, old_fp, oob, rxq, i);
drop:
mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
mana_post_pkt_rxq(rxq);
+
+ if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
+ goto nextpkt;
}
static void mana_poll_rx_cq(struct mana_cq *cq)
@@ -3276,6 +3277,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
apc->port_handle = INVALID_MANA_HANDLE;
apc->pf_filter_handle = INVALID_MANA_HANDLE;
apc->port_idx = port_idx;
+ apc->cqe_coalescing_enable = 0;
mutex_init(&apc->vport_mutex);
apc->vport_use_count = 0;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 0e2f4343ac67..b2b9bfb50396 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -397,6 +397,58 @@ static void mana_get_channels(struct net_device *ndev,
channel->combined_count = apc->num_queues;
}
+static int mana_get_coalesce(struct net_device *ndev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct mana_port_context *apc = netdev_priv(ndev);
+
+ ec->rx_max_coalesced_frames =
+ apc->cqe_coalescing_enable ? MANA_RXCOMP_OOB_NUM_PPI : 1;
+
+ return 0;
+}
+
+static int mana_set_coalesce(struct net_device *ndev,
+ struct ethtool_coalesce *ec,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct mana_port_context *apc = netdev_priv(ndev);
+ u8 saved_cqe_coalescing_enable;
+ int err;
+
+ if (ec->rx_max_coalesced_frames != 1 &&
+ ec->rx_max_coalesced_frames != MANA_RXCOMP_OOB_NUM_PPI) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "rx-frames must be 1 or %u, got %u",
+ MANA_RXCOMP_OOB_NUM_PPI,
+ ec->rx_max_coalesced_frames);
+ return -EINVAL;
+ }
+
+ saved_cqe_coalescing_enable = apc->cqe_coalescing_enable;
+ apc->cqe_coalescing_enable =
+ ec->rx_max_coalesced_frames == MANA_RXCOMP_OOB_NUM_PPI;
+
+ if (!apc->port_is_up)
+ return 0;
+
+ err = mana_config_rss(apc, TRI_STATE_TRUE, false, false);
+
+ if (err) {
+ netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
+ ec->rx_max_coalesced_frames, err);
+ NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed",
+ ec->rx_max_coalesced_frames);
+
+ apc->cqe_coalescing_enable = saved_cqe_coalescing_enable;
+ }
+
+ return err;
+}
+
static int mana_set_channels(struct net_device *ndev,
struct ethtool_channels *channels)
{
@@ -517,6 +569,7 @@ static int mana_get_link_ksettings(struct net_device *ndev,
}
const struct ethtool_ops mana_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
.get_ethtool_stats = mana_get_ethtool_stats,
.get_sset_count = mana_get_sset_count,
.get_strings = mana_get_strings,
@@ -527,6 +580,8 @@ const struct ethtool_ops mana_ethtool_ops = {
.set_rxfh = mana_set_rxfh,
.get_channels = mana_get_channels,
.set_channels = mana_set_channels,
+ .get_coalesce = mana_get_coalesce,
+ .set_coalesce = mana_set_coalesce,
.get_ringparam = mana_get_ringparam,
.set_ringparam = mana_set_ringparam,
.get_link_ksettings = mana_get_link_ksettings,
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index d7e089c6b694..51d26ebeff6c 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -556,6 +556,8 @@ struct mana_port_context {
bool port_is_up;
bool port_st_save; /* Saved port state */
+ u8 cqe_coalescing_enable;
+
struct mana_ethtool_stats eth_stats;
struct mana_ethtool_phy_stats phy_stats;
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type
2026-01-06 20:46 [PATCH V2,net-next, 0/2] net: mana: Add support for coalesced RX packets Haiyang Zhang
2026-01-06 20:46 ` [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE Haiyang Zhang
@ 2026-01-06 20:46 ` Haiyang Zhang
2026-01-06 22:10 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
1 sibling, 2 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
linux-rdma
Cc: paulros
From: Haiyang Zhang <haiyangz@microsoft.com>
For RX CQEs of type CQE_RX_COALESCED_4, to measure the coalescing
efficiency, add counters for how many of them contain 2, 3, and 4
packets, respectively.
Also, add a counter for the error case where the first packet has length == 0.
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 25 +++++++++++++++++--
.../ethernet/microsoft/mana/mana_ethtool.c | 17 ++++++++++---
include/net/mana/mana.h | 10 +++++---
3 files changed, 42 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a46a1adf83bc..78824567d80b 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2083,8 +2083,22 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
nextpkt:
pktlen = oob->ppi[i].pkt_len;
- if (pktlen == 0)
+ if (pktlen == 0) {
+ /* Collect coalesced CQE count based on packets processed.
+ * Coalesced CQEs have at least 2 packets, so index is i - 2.
+ */
+ if (i > 1) {
+ u64_stats_update_begin(&rxq->stats.syncp);
+ rxq->stats.coalesced_cqe[i - 2]++;
+ u64_stats_update_end(&rxq->stats.syncp);
+ } else if (i == 0) {
+ /* Error case stat */
+ u64_stats_update_begin(&rxq->stats.syncp);
+ rxq->stats.pkt_len0_err++;
+ u64_stats_update_end(&rxq->stats.syncp);
+ }
return;
+ }
curr = rxq->buf_index;
rxbuf_oob = &rxq->rx_oobs[curr];
@@ -2102,8 +2116,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
mana_post_pkt_rxq(rxq);
- if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
+ if (!coalesced)
+ return;
+
+ if (++i < MANA_RXCOMP_OOB_NUM_PPI)
goto nextpkt;
+
+ u64_stats_update_begin(&rxq->stats.syncp);
+ rxq->stats.coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 2]++;
+ u64_stats_update_end(&rxq->stats.syncp);
}
static void mana_poll_rx_cq(struct mana_cq *cq)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index b2b9bfb50396..635796bfdaf1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -20,8 +20,6 @@ static const struct mana_stats_desc mana_eth_stats[] = {
tx_cqe_unknown_type)},
{"tx_linear_pkt_cnt", offsetof(struct mana_ethtool_stats,
tx_linear_pkt_cnt)},
- {"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
- rx_coalesced_err)},
{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
rx_cqe_unknown_type)},
};
@@ -151,7 +149,7 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
{
struct mana_port_context *apc = netdev_priv(ndev);
unsigned int num_queues = apc->num_queues;
- int i;
+ int i, j;
if (stringset != ETH_SS_STATS)
return;
@@ -170,6 +168,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
+ ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+ for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+ ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
}
for (i = 0; i < num_queues; i++) {
@@ -203,6 +204,8 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
u64 xdp_xmit;
u64 xdp_drop;
u64 xdp_tx;
+ u64 pkt_len0_err;
+ u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
u64 tso_packets;
u64 tso_bytes;
u64 tso_inner_packets;
@@ -211,7 +214,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
u64 short_pkt_fmt;
u64 csum_partial;
u64 mana_map_err;
- int q, i = 0;
+ int q, i = 0, j;
if (!apc->port_is_up)
return;
@@ -241,6 +244,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
xdp_drop = rx_stats->xdp_drop;
xdp_tx = rx_stats->xdp_tx;
xdp_redirect = rx_stats->xdp_redirect;
+ pkt_len0_err = rx_stats->pkt_len0_err;
+ for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+ coalesced_cqe[j] = rx_stats->coalesced_cqe[j];
} while (u64_stats_fetch_retry(&rx_stats->syncp, start));
data[i++] = packets;
@@ -248,6 +254,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
data[i++] = xdp_drop;
data[i++] = xdp_tx;
data[i++] = xdp_redirect;
+ data[i++] = pkt_len0_err;
+ for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+ data[i++] = coalesced_cqe[j];
}
for (q = 0; q < num_queues; q++) {
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 51d26ebeff6c..f8dd19860103 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -61,8 +61,11 @@ enum TRI_STATE {
#define MAX_PORTS_IN_MANA_DEV 256
+/* Maximum number of packets per coalesced CQE */
+#define MANA_RXCOMP_OOB_NUM_PPI 4
+
/* Update this count whenever the respective structures are changed */
-#define MANA_STATS_RX_COUNT 5
+#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1)
#define MANA_STATS_TX_COUNT 11
#define MANA_RX_FRAG_ALIGNMENT 64
@@ -73,6 +76,8 @@ struct mana_stats_rx {
u64 xdp_drop;
u64 xdp_tx;
u64 xdp_redirect;
+ u64 pkt_len0_err;
+ u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
struct u64_stats_sync syncp;
};
@@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
u32 pkt_hash;
}; /* HW DATA */
-#define MANA_RXCOMP_OOB_NUM_PPI 4
-
/* Receive completion OOB */
struct mana_rxcomp_oob {
struct mana_cqe_header cqe_hdr;
@@ -378,7 +381,6 @@ struct mana_ethtool_stats {
u64 tx_cqe_err;
u64 tx_cqe_unknown_type;
u64 tx_linear_pkt_cnt;
- u64 rx_coalesced_err;
u64 rx_cqe_unknown_type;
};
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* RE: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-06 20:46 ` [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE Haiyang Zhang
@ 2026-01-06 21:50 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
1 sibling, 0 replies; 23+ messages in thread
From: Long Li @ 2026-01-06 21:50 UTC (permalink / raw)
To: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org
Cc: Paul Rosswurm
> Subject: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX
> packets on CQE
>
> From: Haiyang Zhang <haiyangz@microsoft.com>
>
> Our NIC can have up to 4 RX packets on 1 CQE. To support this feature, check
> and process the type CQE_RX_COALESCED_4. The default setting is disabled,
> to avoid possible regression on latency.
>
> And add ethtool handler to switch this feature. To turn it on, run:
> ethtool -C <nic> rx-frames 4
> To turn it off:
> ethtool -C <nic> rx-frames 1
>
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
> ---
> V2:
> Updated extack msg, as recommended by Jakub Kicinski, and Simon Horman.
>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 32 ++++++-----
> .../ethernet/microsoft/mana/mana_ethtool.c | 55 +++++++++++++++++++
> include/net/mana/mana.h | 2 +
> 3 files changed, 74 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 1ad154f9db1a..a46a1adf83bc 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -1330,7 +1330,7 @@ static int mana_cfg_vport_steering(struct
> mana_port_context *apc,
> req->update_hashkey = update_key;
> req->update_indir_tab = update_tab;
> req->default_rxobj = apc->default_rxobj;
> - req->cqe_coalescing_enable = 0;
> + req->cqe_coalescing_enable = apc->cqe_coalescing_enable;
>
> if (update_key)
> memcpy(&req->hashkey, apc->hashkey,
> MANA_HASH_KEY_SIZE); @@ -1864,11 +1864,12 @@ static struct sk_buff
> *mana_build_skb(struct mana_rxq *rxq, void *buf_va, }
>
> static void mana_rx_skb(void *buf_va, bool from_pool,
> - struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq)
> + struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq,
> + int i)
> {
> struct mana_stats_rx *rx_stats = &rxq->stats;
> struct net_device *ndev = rxq->ndev;
> - uint pkt_len = cqe->ppi[0].pkt_len;
> + uint pkt_len = cqe->ppi[i].pkt_len;
> u16 rxq_idx = rxq->rxq_idx;
> struct napi_struct *napi;
> struct xdp_buff xdp = {};
> @@ -1912,7 +1913,7 @@ static void mana_rx_skb(void *buf_va, bool
> from_pool,
> }
>
> if (cqe->rx_hashtype != 0 && (ndev->features & NETIF_F_RXHASH)) {
> - hash_value = cqe->ppi[0].pkt_hash;
> + hash_value = cqe->ppi[i].pkt_hash;
>
> if (cqe->rx_hashtype & MANA_HASH_L4)
> skb_set_hash(skb, hash_value, PKT_HASH_TYPE_L4);
> @@ -2047,9 +2048,11 @@ static void mana_process_rx_cqe(struct
> mana_rxq *rxq, struct mana_cq *cq,
> struct mana_recv_buf_oob *rxbuf_oob;
> struct mana_port_context *apc;
> struct device *dev = gc->dev;
> + bool coalesced = false;
> void *old_buf = NULL;
> u32 curr, pktlen;
> bool old_fp;
> + int i = 0;
>
> apc = netdev_priv(ndev);
>
> @@ -2064,9 +2067,8 @@ static void mana_process_rx_cqe(struct mana_rxq
> *rxq, struct mana_cq *cq,
> goto drop;
>
> case CQE_RX_COALESCED_4:
> - netdev_err(ndev, "RX coalescing is unsupported\n");
> - apc->eth_stats.rx_coalesced_err++;
> - return;
> + coalesced = true;
> + break;
>
> case CQE_RX_OBJECT_FENCE:
> complete(&rxq->fence_event);
> @@ -2079,14 +2081,10 @@ static void mana_process_rx_cqe(struct
> mana_rxq *rxq, struct mana_cq *cq,
> return;
> }
>
> - pktlen = oob->ppi[0].pkt_len;
> -
> - if (pktlen == 0) {
> - /* data packets should never have packetlength of zero */
> - netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u,
> rxobj=0x%llx\n",
> - rxq->gdma_id, cq->gdma_id, rxq->rxobj);
> +nextpkt:
> + pktlen = oob->ppi[i].pkt_len;
> + if (pktlen == 0)
> return;
> - }
>
> curr = rxq->buf_index;
> rxbuf_oob = &rxq->rx_oobs[curr];
> @@ -2097,12 +2095,15 @@ static void mana_process_rx_cqe(struct
> mana_rxq *rxq, struct mana_cq *cq,
> /* Unsuccessful refill will have old_buf == NULL.
> * In this case, mana_rx_skb() will drop the packet.
> */
> - mana_rx_skb(old_buf, old_fp, oob, rxq);
> + mana_rx_skb(old_buf, old_fp, oob, rxq, i);
>
> drop:
> mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob-
> >wqe_inf.wqe_size_in_bu);
>
> mana_post_pkt_rxq(rxq);
> +
> + if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
> + goto nextpkt;
> }
>
> static void mana_poll_rx_cq(struct mana_cq *cq) @@ -3276,6 +3277,7 @@
> static int mana_probe_port(struct mana_context *ac, int port_idx,
> apc->port_handle = INVALID_MANA_HANDLE;
> apc->pf_filter_handle = INVALID_MANA_HANDLE;
> apc->port_idx = port_idx;
> + apc->cqe_coalescing_enable = 0;
>
> mutex_init(&apc->vport_mutex);
> apc->vport_use_count = 0;
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index 0e2f4343ac67..b2b9bfb50396 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> @@ -397,6 +397,58 @@ static void mana_get_channels(struct net_device
> *ndev,
> channel->combined_count = apc->num_queues; }
>
> +static int mana_get_coalesce(struct net_device *ndev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack) {
> + struct mana_port_context *apc = netdev_priv(ndev);
> +
> + ec->rx_max_coalesced_frames =
> + apc->cqe_coalescing_enable ?
> MANA_RXCOMP_OOB_NUM_PPI : 1;
> +
> + return 0;
> +}
> +
> +static int mana_set_coalesce(struct net_device *ndev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack) {
> + struct mana_port_context *apc = netdev_priv(ndev);
> + u8 saved_cqe_coalescing_enable;
> + int err;
> +
> + if (ec->rx_max_coalesced_frames != 1 &&
> + ec->rx_max_coalesced_frames != MANA_RXCOMP_OOB_NUM_PPI)
> {
> + NL_SET_ERR_MSG_FMT(extack,
> + "rx-frames must be 1 or %u, got %u",
> + MANA_RXCOMP_OOB_NUM_PPI,
> + ec->rx_max_coalesced_frames);
> + return -EINVAL;
> + }
> +
> + saved_cqe_coalescing_enable = apc->cqe_coalescing_enable;
> + apc->cqe_coalescing_enable =
> + ec->rx_max_coalesced_frames ==
> MANA_RXCOMP_OOB_NUM_PPI;
> +
> + if (!apc->port_is_up)
> + return 0;
> +
> + err = mana_config_rss(apc, TRI_STATE_TRUE, false, false);
> +
> + if (err) {
> + netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
> + ec->rx_max_coalesced_frames, err);
> + NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed",
> + ec->rx_max_coalesced_frames);
> +
> + apc->cqe_coalescing_enable = saved_cqe_coalescing_enable;
> + }
> +
> + return err;
> +}
> +
> static int mana_set_channels(struct net_device *ndev,
> struct ethtool_channels *channels) { @@ -517,6
> +569,7 @@ static int mana_get_link_ksettings(struct net_device *ndev, }
>
> const struct ethtool_ops mana_ethtool_ops = {
> + .supported_coalesce_params =
> ETHTOOL_COALESCE_RX_MAX_FRAMES,
> .get_ethtool_stats = mana_get_ethtool_stats,
> .get_sset_count = mana_get_sset_count,
> .get_strings = mana_get_strings,
> @@ -527,6 +580,8 @@ const struct ethtool_ops mana_ethtool_ops = {
> .set_rxfh = mana_set_rxfh,
> .get_channels = mana_get_channels,
> .set_channels = mana_set_channels,
> + .get_coalesce = mana_get_coalesce,
> + .set_coalesce = mana_set_coalesce,
> .get_ringparam = mana_get_ringparam,
> .set_ringparam = mana_set_ringparam,
> .get_link_ksettings = mana_get_link_ksettings,
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index
> d7e089c6b694..51d26ebeff6c 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -556,6 +556,8 @@ struct mana_port_context {
> bool port_is_up;
> bool port_st_save; /* Saved port state */
>
> + u8 cqe_coalescing_enable;
> +
> struct mana_ethtool_stats eth_stats;
>
> struct mana_ethtool_phy_stats phy_stats;
> --
> 2.34.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type
2026-01-06 20:46 ` [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type Haiyang Zhang
@ 2026-01-06 22:10 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
1 sibling, 0 replies; 23+ messages in thread
From: Long Li @ 2026-01-06 22:10 UTC (permalink / raw)
To: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org
Cc: Paul Rosswurm
> Subject: [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX
> CQEs in coalesced type
>
> From: Haiyang Zhang <haiyangz@microsoft.com>
>
> For RX CQEs with type CQE_RX_COALESCED_4, to measure the coalescing
> efficiency, add counters to count how many contains 2, 3, 4 packets
> respectively.
> Also, add a counter for the error case of first packet with length == 0.
>
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 25
> +++++++++++++++++--
> .../ethernet/microsoft/mana/mana_ethtool.c | 17 ++++++++++---
> include/net/mana/mana.h | 10 +++++---
> 3 files changed, 42 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a46a1adf83bc..78824567d80b 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2083,8 +2083,22 @@ static void mana_process_rx_cqe(struct
> mana_rxq *rxq, struct mana_cq *cq,
>
> nextpkt:
> pktlen = oob->ppi[i].pkt_len;
> - if (pktlen == 0)
> + if (pktlen == 0) {
> + /* Collect coalesced CQE count based on packets processed.
> + * Coalesced CQEs have at least 2 packets, so index is i - 2.
> + */
> + if (i > 1) {
> + u64_stats_update_begin(&rxq->stats.syncp);
> + rxq->stats.coalesced_cqe[i - 2]++;
> + u64_stats_update_end(&rxq->stats.syncp);
> + } else if (i == 0) {
> + /* Error case stat */
> + u64_stats_update_begin(&rxq->stats.syncp);
> + rxq->stats.pkt_len0_err++;
> + u64_stats_update_end(&rxq->stats.syncp);
> + }
> return;
> + }
>
> curr = rxq->buf_index;
> rxbuf_oob = &rxq->rx_oobs[curr];
> @@ -2102,8 +2116,15 @@ static void mana_process_rx_cqe(struct
> mana_rxq *rxq, struct mana_cq *cq,
>
> mana_post_pkt_rxq(rxq);
>
> - if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
> + if (!coalesced)
> + return;
> +
> + if (++i < MANA_RXCOMP_OOB_NUM_PPI)
> goto nextpkt;
> +
> + u64_stats_update_begin(&rxq->stats.syncp);
> + rxq->stats.coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 2]++;
> + u64_stats_update_end(&rxq->stats.syncp);
> }
>
> static void mana_poll_rx_cq(struct mana_cq *cq) diff --git
> a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index b2b9bfb50396..635796bfdaf1 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> @@ -20,8 +20,6 @@ static const struct mana_stats_desc mana_eth_stats[] =
> {
> tx_cqe_unknown_type)},
> {"tx_linear_pkt_cnt", offsetof(struct mana_ethtool_stats,
> tx_linear_pkt_cnt)},
> - {"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
> - rx_coalesced_err)},
> {"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
> rx_cqe_unknown_type)},
> };
> @@ -151,7 +149,7 @@ static void mana_get_strings(struct net_device *ndev,
> u32 stringset, u8 *data) {
> struct mana_port_context *apc = netdev_priv(ndev);
> unsigned int num_queues = apc->num_queues;
> - int i;
> + int i, j;
>
> if (stringset != ETH_SS_STATS)
> return;
> @@ -170,6 +168,9 @@ static void mana_get_strings(struct net_device *ndev,
> u32 stringset, u8 *data)
> ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
> ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
> ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
> + ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
> + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
> + ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i,
> j + 2);
> }
>
> for (i = 0; i < num_queues; i++) {
> @@ -203,6 +204,8 @@ static void mana_get_ethtool_stats(struct net_device
> *ndev,
> u64 xdp_xmit;
> u64 xdp_drop;
> u64 xdp_tx;
> + u64 pkt_len0_err;
> + u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
> u64 tso_packets;
> u64 tso_bytes;
> u64 tso_inner_packets;
> @@ -211,7 +214,7 @@ static void mana_get_ethtool_stats(struct net_device
> *ndev,
> u64 short_pkt_fmt;
> u64 csum_partial;
> u64 mana_map_err;
> - int q, i = 0;
> + int q, i = 0, j;
>
> if (!apc->port_is_up)
> return;
> @@ -241,6 +244,9 @@ static void mana_get_ethtool_stats(struct net_device
> *ndev,
> xdp_drop = rx_stats->xdp_drop;
> xdp_tx = rx_stats->xdp_tx;
> xdp_redirect = rx_stats->xdp_redirect;
> + pkt_len0_err = rx_stats->pkt_len0_err;
> + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1;
> j++)
> + coalesced_cqe[j] = rx_stats->coalesced_cqe[j];
> } while (u64_stats_fetch_retry(&rx_stats->syncp, start));
>
> data[i++] = packets;
> @@ -248,6 +254,9 @@ static void mana_get_ethtool_stats(struct net_device
> *ndev,
> data[i++] = xdp_drop;
> data[i++] = xdp_tx;
> data[i++] = xdp_redirect;
> + data[i++] = pkt_len0_err;
> + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
> + data[i++] = coalesced_cqe[j];
> }
>
> for (q = 0; q < num_queues; q++) {
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index
> 51d26ebeff6c..f8dd19860103 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -61,8 +61,11 @@ enum TRI_STATE {
>
> #define MAX_PORTS_IN_MANA_DEV 256
>
> +/* Maximum number of packets per coalesced CQE */ #define
> +MANA_RXCOMP_OOB_NUM_PPI 4
> +
> /* Update this count whenever the respective structures are changed */ -
> #define MANA_STATS_RX_COUNT 5
> +#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1)
> #define MANA_STATS_TX_COUNT 11
>
> #define MANA_RX_FRAG_ALIGNMENT 64
> @@ -73,6 +76,8 @@ struct mana_stats_rx {
> u64 xdp_drop;
> u64 xdp_tx;
> u64 xdp_redirect;
> + u64 pkt_len0_err;
> + u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
> struct u64_stats_sync syncp;
> };
>
> @@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
> u32 pkt_hash;
> }; /* HW DATA */
>
> -#define MANA_RXCOMP_OOB_NUM_PPI 4
> -
> /* Receive completion OOB */
> struct mana_rxcomp_oob {
> struct mana_cqe_header cqe_hdr;
> @@ -378,7 +381,6 @@ struct mana_ethtool_stats {
> u64 tx_cqe_err;
> u64 tx_cqe_unknown_type;
> u64 tx_linear_pkt_cnt;
> - u64 rx_coalesced_err;
> u64 rx_cqe_unknown_type;
> };
>
> --
> 2.34.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-06 20:46 ` [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE Haiyang Zhang
2026-01-06 21:50 ` Long Li
@ 2026-01-10 1:56 ` Jakub Kicinski
2026-01-12 21:01 ` [EXTERNAL] " Haiyang Zhang
1 sibling, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-10 1:56 UTC (permalink / raw)
To: Haiyang Zhang
Cc: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
linux-rdma, paulros
On Tue, 6 Jan 2026 12:46:46 -0800 Haiyang Zhang wrote:
> From: Haiyang Zhang <haiyangz@microsoft.com>
>
> Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
> check and process the type CQE_RX_COALESCED_4. The default setting is
> disabled, to avoid possible regression on latency.
>
> And add ethtool handler to switch this feature. To turn it on, run:
> ethtool -C <nic> rx-frames 4
> To turn it off:
> ethtool -C <nic> rx-frames 1
Exposing just the rx frame count, and with only two values, is quite unusual.
Please explain in more detail the coalescing logic of the device.
> @@ -2079,14 +2081,10 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
> return;
> }
>
> - pktlen = oob->ppi[0].pkt_len;
> -
> - if (pktlen == 0) {
> - /* data packets should never have packetlength of zero */
> - netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n",
> - rxq->gdma_id, cq->gdma_id, rxq->rxobj);
> +nextpkt:
> + pktlen = oob->ppi[i].pkt_len;
> + if (pktlen == 0)
> return;
> - }
>
> curr = rxq->buf_index;
> rxbuf_oob = &rxq->rx_oobs[curr];
> @@ -2097,12 +2095,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
> /* Unsuccessful refill will have old_buf == NULL.
> * In this case, mana_rx_skb() will drop the packet.
> */
> - mana_rx_skb(old_buf, old_fp, oob, rxq);
> + mana_rx_skb(old_buf, old_fp, oob, rxq, i);
>
> drop:
> mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
>
> mana_post_pkt_rxq(rxq);
> +
> + if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
> + goto nextpkt;
Please code this up as a loop. Using gotos for control flow other than
to jump to error handling epilogues is a poor coding practice (see the
kernel coding style).
> +static int mana_set_coalesce(struct net_device *ndev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct mana_port_context *apc = netdev_priv(ndev);
> + u8 saved_cqe_coalescing_enable;
> + int err;
> +
> + if (ec->rx_max_coalesced_frames != 1 &&
> + ec->rx_max_coalesced_frames != MANA_RXCOMP_OOB_NUM_PPI) {
> + NL_SET_ERR_MSG_FMT(extack,
> + "rx-frames must be 1 or %u, got %u",
> + MANA_RXCOMP_OOB_NUM_PPI,
> + ec->rx_max_coalesced_frames);
> + return -EINVAL;
> + }
> +
> + saved_cqe_coalescing_enable = apc->cqe_coalescing_enable;
> + apc->cqe_coalescing_enable =
> + ec->rx_max_coalesced_frames == MANA_RXCOMP_OOB_NUM_PPI;
> +
> + if (!apc->port_is_up)
> + return 0;
> +
> + err = mana_config_rss(apc, TRI_STATE_TRUE, false, false);
> +
unnecessary empty line
> + if (err) {
> + netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
> + ec->rx_max_coalesced_frames, err);
> + NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed",
> + ec->rx_max_coalesced_frames);
These messages are both pointless. If HW communication has failed
presumably there will already be an error in the logs. The extack
gives the user no information they wouldn't already have.
> + apc->cqe_coalescing_enable = saved_cqe_coalescing_enable;
> + }
> +
> + return err;
> +}
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type
2026-01-06 20:46 ` [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type Haiyang Zhang
2026-01-06 22:10 ` Long Li
@ 2026-01-10 1:56 ` Jakub Kicinski
2026-01-12 21:03 ` [EXTERNAL] " Haiyang Zhang
1 sibling, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-10 1:56 UTC (permalink / raw)
To: Haiyang Zhang
Cc: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
Paolo Abeni, Konstantin Taranov, Simon Horman,
Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
linux-rdma, paulros
On Tue, 6 Jan 2026 12:46:47 -0800 Haiyang Zhang wrote:
> @@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
> u32 pkt_hash;
> }; /* HW DATA */
>
> -#define MANA_RXCOMP_OOB_NUM_PPI 4
> -
> /* Receive completion OOB */
> struct mana_rxcomp_oob {
> struct mana_cqe_header cqe_hdr;
> @@ -378,7 +381,6 @@ struct mana_ethtool_stats {
> u64 tx_cqe_err;
> u64 tx_cqe_unknown_type;
> u64 tx_linear_pkt_cnt;
> - u64 rx_coalesced_err;
> u64 rx_cqe_unknown_type;
> };
This should be deleted in the previous patch already
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-10 1:56 ` Jakub Kicinski
@ 2026-01-12 21:01 ` Haiyang Zhang
2026-01-13 1:21 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-12 21:01 UTC (permalink / raw)
To: Jakub Kicinski, Haiyang Zhang
Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
KY Srinivasan, Wei Liu, Dexuan Cui, Long Li, Andrew Lunn,
David S. Miller, Eric Dumazet, Paolo Abeni, Konstantin Taranov,
Simon Horman, Erni Sri Satya Vennela, Shradha Gupta,
Saurabh Sengar, Aditya Garg, Dipayaan Roy, Shiraz Saleem,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Friday, January 9, 2026 8:56 PM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Wei Liu
> <wei.liu@kernel.org>; Dexuan Cui <DECUI@microsoft.com>; Long Li
> <longli@microsoft.com>; Andrew Lunn <andrew+netdev@lunn.ch>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Paolo
> Abeni <pabeni@redhat.com>; Konstantin Taranov <kotaranov@microsoft.com>;
> Simon Horman <horms@kernel.org>; Erni Sri Satya Vennela
> <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support
> for coalesced RX packets on CQE
>
> On Tue, 6 Jan 2026 12:46:46 -0800 Haiyang Zhang wrote:
> > From: Haiyang Zhang <haiyangz@microsoft.com>
> >
> > Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
> > check and process the type CQE_RX_COALESCED_4. The default setting is
> > disabled, to avoid possible regression on latency.
> >
> > And add ethtool handler to switch this feature. To turn it on, run:
> > ethtool -C <nic> rx-frames 4
> > To turn it off:
> > ethtool -C <nic> rx-frames 1
>
> Exposing just rx frame count, and only two values is quite unusual.
> Please explain in more detail the coalescing logic of the device.
Our NIC device only supports coalescing on RX. When it's disabled, each
RX CQE indicates 1 RX packet; when enabled, each RX CQE indicates up to 4 packets.
>
> > @@ -2079,14 +2081,10 @@ static void mana_process_rx_cqe(struct mana_rxq
> *rxq, struct mana_cq *cq,
> > return;
> > }
> >
> > - pktlen = oob->ppi[0].pkt_len;
> > -
> > - if (pktlen == 0) {
> > - /* data packets should never have packetlength of zero */
> > - netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n",
> > - rxq->gdma_id, cq->gdma_id, rxq->rxobj);
> > +nextpkt:
> > + pktlen = oob->ppi[i].pkt_len;
> > + if (pktlen == 0)
> > return;
> > - }
> >
> > curr = rxq->buf_index;
> > rxbuf_oob = &rxq->rx_oobs[curr];
> > @@ -2097,12 +2095,15 @@ static void mana_process_rx_cqe(struct mana_rxq
> *rxq, struct mana_cq *cq,
> > /* Unsuccessful refill will have old_buf == NULL.
> > * In this case, mana_rx_skb() will drop the packet.
> > */
> > - mana_rx_skb(old_buf, old_fp, oob, rxq);
> > + mana_rx_skb(old_buf, old_fp, oob, rxq, i);
> >
> > drop:
> > mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
> >
> > mana_post_pkt_rxq(rxq);
> > +
> > + if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
> > + goto nextpkt;
>
> Please code this up as a loop. Using gotos for control flow other than
> to jump to error handling epilogues is a poor coding practice (see the
> kernel coding style).
Will do.
>
> > +static int mana_set_coalesce(struct net_device *ndev,
> > + struct ethtool_coalesce *ec,
> > + struct kernel_ethtool_coalesce *kernel_coal,
> > + struct netlink_ext_ack *extack)
> > +{
> > + struct mana_port_context *apc = netdev_priv(ndev);
> > + u8 saved_cqe_coalescing_enable;
> > + int err;
> > +
> > + if (ec->rx_max_coalesced_frames != 1 &&
> > + ec->rx_max_coalesced_frames != MANA_RXCOMP_OOB_NUM_PPI) {
> > + NL_SET_ERR_MSG_FMT(extack,
> > + "rx-frames must be 1 or %u, got %u",
> > + MANA_RXCOMP_OOB_NUM_PPI,
> > + ec->rx_max_coalesced_frames);
> > + return -EINVAL;
> > + }
> > +
> > + saved_cqe_coalescing_enable = apc->cqe_coalescing_enable;
> > + apc->cqe_coalescing_enable =
> > + ec->rx_max_coalesced_frames == MANA_RXCOMP_OOB_NUM_PPI;
> > +
> > + if (!apc->port_is_up)
> > + return 0;
> > +
> > + err = mana_config_rss(apc, TRI_STATE_TRUE, false, false);
> > +
>
> unnecessary empty line
Will remove.
>
> > + if (err) {
> > + netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
> > + ec->rx_max_coalesced_frames, err);
> > + NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed",
> > + ec->rx_max_coalesced_frames);
>
> These messages are both pointless. If HW communication has failed
> presumably there will already be an error in the logs. The extack
> gives the user no information they wouldn't already have.
Will remove.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type
2026-01-10 1:56 ` Jakub Kicinski
@ 2026-01-12 21:03 ` Haiyang Zhang
0 siblings, 0 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-12 21:03 UTC (permalink / raw)
To: Jakub Kicinski, Haiyang Zhang
Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
KY Srinivasan, Wei Liu, Dexuan Cui, Long Li, Andrew Lunn,
David S. Miller, Eric Dumazet, Paolo Abeni, Konstantin Taranov,
Simon Horman, Erni Sri Satya Vennela, Shradha Gupta,
Saurabh Sengar, Aditya Garg, Dipayaan Roy, Shiraz Saleem,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Friday, January 9, 2026 8:56 PM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Wei Liu
> <wei.liu@kernel.org>; Dexuan Cui <DECUI@microsoft.com>; Long Li
> <longli@microsoft.com>; Andrew Lunn <andrew+netdev@lunn.ch>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Paolo
> Abeni <pabeni@redhat.com>; Konstantin Taranov <kotaranov@microsoft.com>;
> Simon Horman <horms@kernel.org>; Erni Sri Satya Vennela
> <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: [EXTERNAL] Re: [PATCH V2,net-next, 2/2] net: mana: Add ethtool
> counters for RX CQEs in coalesced type
>
> On Tue, 6 Jan 2026 12:46:47 -0800 Haiyang Zhang wrote:
> > @@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
> > u32 pkt_hash;
> > }; /* HW DATA */
> >
> > -#define MANA_RXCOMP_OOB_NUM_PPI 4
> > -
> > /* Receive completion OOB */
> > struct mana_rxcomp_oob {
> > struct mana_cqe_header cqe_hdr;
> > @@ -378,7 +381,6 @@ struct mana_ethtool_stats {
> > u64 tx_cqe_err;
> > u64 tx_cqe_unknown_type;
> > u64 tx_linear_pkt_cnt;
> > - u64 rx_coalesced_err;
> > u64 rx_cqe_unknown_type;
> > };
>
> This should be deleted in the previous patch already
Will do.
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-12 21:01 ` [EXTERNAL] " Haiyang Zhang
@ 2026-01-13 1:21 ` Jakub Kicinski
2026-01-13 15:09 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-13 1:21 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Mon, 12 Jan 2026 21:01:59 +0000 Haiyang Zhang wrote:
> > > Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
> > > check and process the type CQE_RX_COALESCED_4. The default setting is
> > > disabled, to avoid possible regression on latency.
> > >
> > > And add ethtool handler to switch this feature. To turn it on, run:
> > > ethtool -C <nic> rx-frames 4
> > > To turn it off:
> > > ethtool -C <nic> rx-frames 1
> >
> > Exposing just rx frame count, and only two values is quite unusual.
> > Please explain in more detail the coalescing logic of the device.
> Our NIC device only supports coalescing on RX. And when it's disabled each
> RX CQE indicates 1 RX packet; when enabled each RX CQE indicates up to 4 packets.
I get that. What is the logic for combining 4 packets into a single
completion? How does it work? Your commit message mentions "regression
on latency" - what is the bound on that regression?
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-13 1:21 ` Jakub Kicinski
@ 2026-01-13 15:09 ` Haiyang Zhang
2026-01-13 15:13 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-13 15:09 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Monday, January 12, 2026 8:22 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Mon, 12 Jan 2026 21:01:59 +0000 Haiyang Zhang wrote:
> > > > Our NIC can have up to 4 RX packets on 1 CQE. To support this
> feature,
> > > > check and process the type CQE_RX_COALESCED_4. The default setting
> is
> > > > disabled, to avoid possible regression on latency.
> > > >
> > > > And add ethtool handler to switch this feature. To turn it on, run:
> > > > ethtool -C <nic> rx-frames 4
> > > > To turn it off:
> > > > ethtool -C <nic> rx-frames 1
> > >
> > > Exposing just rx frame count, and only two values is quite unusual.
> > > Please explain in more detail the coalescing logic of the device.
> > Our NIC device only supports coalescing on RX. And when it's disabled
> each
> > RX CQE indicates 1 RX packet; when enabled each RX CQE indicates up to 4
> packets.
>
> I get that. What is the logic for combining 4 packets into a single
> completion? How does it work? Your commit message mentions "regression
> on latency" - what is the bound on that regression?
When we receive a CQE of type CQE_RX_COALESCED_4, it is a coalesced CQE. In the CQE
OOB, there is an array of 4 PPI elements, each holding one pkt's length:
oob->ppi[i].pkt_len.
So we read the related WQEs and the DMA buffers for the RX pkt payloads, up to 4.
If fewer than 4 pkts were coalesced, pkt_len will be 0 after the last pkt, so we
know when to stop reading the WQEs.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-13 15:09 ` Haiyang Zhang
@ 2026-01-13 15:13 ` Haiyang Zhang
2026-01-14 1:09 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-13 15:13 UTC (permalink / raw)
To: Haiyang Zhang, Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Haiyang Zhang <haiyangz@microsoft.com>
> Sent: Tuesday, January 13, 2026 10:09 AM
> To: Jakub Kicinski <kuba@kernel.org>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
>
>
> > -----Original Message-----
> > From: Jakub Kicinski <kuba@kernel.org>
> > Sent: Monday, January 12, 2026 8:22 PM
> > To: Haiyang Zhang <haiyangz@microsoft.com>
> > Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> > hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> > <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> > <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> > <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> > Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>;
> Konstantin
> > Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> > Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> > <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> > <ssengar@linux.microsoft.com>; Aditya Garg
> > <gargaditya@linux.microsoft.com>; Dipayaan Roy
> > <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> > <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> > rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> > Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> > support for coalesced RX packets on CQE
> >
> > On Mon, 12 Jan 2026 21:01:59 +0000 Haiyang Zhang wrote:
> > > > > Our NIC can have up to 4 RX packets on 1 CQE. To support this
> > feature,
> > > > > check and process the type CQE_RX_COALESCED_4. The default setting
> > is
> > > > > disabled, to avoid possible regression on latency.
> > > > >
> > > > > And add ethtool handler to switch this feature. To turn it on,
> run:
> > > > > ethtool -C <nic> rx-frames 4
> > > > > To turn it off:
> > > > > ethtool -C <nic> rx-frames 1
> > > >
> > > > Exposing just rx frame count, and only two values is quite unusual.
> > > > Please explain in more detail the coalescing logic of the device.
> > > Our NIC device only supports coalescing on RX. And when it's disabled
> > each
> > > RX CQE indicates 1 RX packet; when enabled each RX CQE indicates up to
> 4
> > packets.
> >
> > I get that. What is the logic for combining 4 packets into a single
> > completion? How does it work? Your commit message mentions "regression
> > on latency" - what is the bound on that regression?
>
> When we received CQE type CQE_RX_COALESCED_4, it's a coalesced CQE. And in
> the CQE
> OOB, there is an array with 4 PPI elements, with each pkt's length:
> oob->ppi[i].pkt_len.
>
> So we read the related WQE and the DMA buffers for the RX pkt payloads, up
> to 4.
> But, if the coalesced pkts <4, the pkt_len will be 0 after the last pkt,
> so we
> know when to stop reading the WQEs.
Also, the coalescing can add up to 2 microseconds to one-way latency.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-13 15:13 ` Haiyang Zhang
@ 2026-01-14 1:09 ` Jakub Kicinski
2026-01-14 18:27 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-14 1:09 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Tue, 13 Jan 2026 15:13:24 +0000 Haiyang Zhang wrote:
> > > I get that. What is the logic for combining 4 packets into a single
> > > completion? How does it work? Your commit message mentions "regression
> > > on latency" - what is the bound on that regression?
> >
> > When we received CQE type CQE_RX_COALESCED_4, it's a coalesced CQE. And in
> > the CQE OOB, there is an array with 4 PPI elements, with each pkt's length:
> > oob->ppi[i].pkt_len.
> >
> > So we read the related WQE and the DMA buffers for the RX pkt payloads, up
> > to 4.
> > But, if the coalesced pkts <4, the pkt_len will be 0 after the last pkt,
> > so we know when to stop reading the WQEs.
>
> And, the coalescing can add up to 2 microseconds into one-way latency.
I am asking you how the _device_ (hypervisor?) decides when to coalesce
and when to send a partial CQE (<4 packets in 4 pkt CQE). You are using
the coalescing uAPI, so I'm trying to make sure this is the correct API.
CQE configuration can also be done via ringparam.
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-14 1:09 ` Jakub Kicinski
@ 2026-01-14 18:27 ` Haiyang Zhang
2026-01-15 2:54 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-14 18:27 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Tuesday, January 13, 2026 8:10 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Tue, 13 Jan 2026 15:13:24 +0000 Haiyang Zhang wrote:
> > > > I get that. What is the logic for combining 4 packets into a single
> > > > completion? How does it work? Your commit message mentions
> "regression
> > > > on latency" - what is the bound on that regression?
> > >
> > > When we received CQE type CQE_RX_COALESCED_4, it's a coalesced CQE.
> And in
> > > the CQE OOB, there is an array with 4 PPI elements, with each pkt's
> length:
> > > oob->ppi[i].pkt_len.
> > >
> > > So we read the related WQE and the DMA buffers for the RX pkt
> payloads, up
> > > to 4.
> > > But, if the coalesced pkts <4, the pkt_len will be 0 after the last
> pkt,
> > > so we know when to stop reading the WQEs.
> >
> > And, the coalescing can add up to 2 microseconds into one-way latency.
>
> I am asking you how the _device_ (hypervisor?) decides when to coalesce
> and when to send a partial CQE (<4 packets in 4 pkt CQE). You are using
> the coalescing uAPI, so I'm trying to make sure this is the correct API.
> CQE configuration can also be done via ringparam.
When coalescing is enabled, the device waits for packets which can
have the CQE coalesced with previous packet(s). That coalescing process
is finished (and a CQE written to the appropriate CQ) when the CQE is
filled with 4 pkts, or time expired, or other device specific logic is
satisfied.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-14 18:27 ` Haiyang Zhang
@ 2026-01-15 2:54 ` Jakub Kicinski
2026-01-15 19:57 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-15 2:54 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Wed, 14 Jan 2026 18:27:50 +0000 Haiyang Zhang wrote:
> > > And, the coalescing can add up to 2 microseconds into one-way latency.
> >
> > I am asking you how the _device_ (hypervisor?) decides when to coalesce
> > and when to send a partial CQE (<4 packets in 4 pkt CQE). You are using
> > the coalescing uAPI, so I'm trying to make sure this is the correct API.
> > CQE configuration can also be done via ringparam.
>
> When coalescing is enabled, the device waits for packets which can
> have the CQE coalesced with previous packet(s). That coalescing process
> is finished (and a CQE written to the appropriate CQ) when the CQE is
> filled with 4 pkts, or time expired, or other device specific logic is
> satisfied.
See, what I'm afraid is happening here is that you are enabling
completion coalescing (how long the device keeps the CQE pending).
Which is _not_ what rx_max_coalesced_frames controls for most NICs.
For most NICs rx_max_coalesced_frames controls IRQ generation logic.
The NIC first buffers up CQEs for typically single digit usecs, and
then once the CQE timer has expired and writeback has happened it starts an IRQ
coalescing timer. Once the IRQ coalescing timer expires IRQ is
triggered, which schedules NAPI. (broad strokes, obviously many
differences and optimizations exist)
Is my guess correct? Are you controlling CQE coalescing?
Can you control the timeout instead of the frame count?
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-15 2:54 ` Jakub Kicinski
@ 2026-01-15 19:57 ` Haiyang Zhang
2026-01-16 2:14 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-15 19:57 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Wednesday, January 14, 2026 9:55 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Wed, 14 Jan 2026 18:27:50 +0000 Haiyang Zhang wrote:
> > > > And, the coalescing can add up to 2 microseconds into one-way
> latency.
> > >
> > > I am asking you how the _device_ (hypervisor?) decides when to
> coalesce
> > > and when to send a partial CQE (<4 packets in 4 pkt CQE). You are
> using
> > > the coalescing uAPI, so I'm trying to make sure this is the correct
> API.
> > > CQE configuration can also be done via ringparam.
> >
> > When coalescing is enabled, the device waits for packets which can
> > have the CQE coalesced with previous packet(s). That coalescing process
> > is finished (and a CQE written to the appropriate CQ) when the CQE is
> > filled with 4 pkts, or time expired, or other device specific logic is
> > satisfied.
>
> See, what I'm afraid is happening here is that you are enabling
> completion coalescing (how long the device keeps the CQE pending).
> Which is _not_ what rx_max_coalesced_frames controls for most NICs.
> For most NICs rx_max_coalesced_frames controls IRQ generation logic.
>
> The NIC first buffers up CQEs for typically single digit usecs, and
> then once CQE timer expired and writeback happened it starts an IRQ
> coalescing timer. Once the IRQ coalescing timer expires IRQ is
> triggered, which schedules NAPI. (broad strokes, obviously many
> differences and optimizations exist)
>
> Is my guess correct? Are you controlling CQE coalescing?
>
> Can you control the timeout instead of the frame count?
Our NIC's timeout value cannot be controlled by the driver. Also, the
timeout may be changed in future NIC HW.
So, I use the ethtool/rx-frames, which is either 1 or 4 on our
NIC, to switch the CQE coalescing feature on/off.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-15 19:57 ` Haiyang Zhang
@ 2026-01-16 2:14 ` Jakub Kicinski
2026-01-16 16:44 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-16 2:14 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Thu, 15 Jan 2026 19:57:44 +0000 Haiyang Zhang wrote:
> > > When coalescing is enabled, the device waits for packets which can
> > > have the CQE coalesced with previous packet(s). That coalescing process
> > > is finished (and a CQE written to the appropriate CQ) when the CQE is
> > > filled with 4 pkts, or time expired, or other device specific logic is
> > > satisfied.
> >
> > See, what I'm afraid is happening here is that you are enabling
> > completion coalescing (how long the device keeps the CQE pending).
> > Which is _not_ what rx_max_coalesced_frames controls for most NICs.
> > For most NICs rx_max_coalesced_frames controls IRQ generation logic.
> >
> > The NIC first buffers up CQEs for typically single digit usecs, and
> > then once CQE timer expired and writeback happened it starts an IRQ
> > coalescing timer. Once the IRQ coalescing timer expires IRQ is
> > triggered, which schedules NAPI. (broad strokes, obviously many
> > differences and optimizations exist)
> >
> > Is my guess correct? Are you controlling CQE coalescing?
> >
> > Can you control the timeout instead of the frame count?
>
> Our NIC's timeout value cannot be controlled by driver. Also, the
> timeout may be changed in future NIC HW.
>
> So, I use the ethtool/rx-frames, which is either 1 or 4 on our
> NIC, to switch the CQE coalescing feature on/off.
I feel like this is not the first time I'm having a conversation with
you where you are not answering my direct questions, not just one
sliver. IDK why you're doing this, but being able to participate
in an email exchange is a bare minimum for participating upstream.
Please consider this a warning.
If I interpret your reply correctly you are indeed coalescing writeback.
You need to add a new param to the uAPI. Please add both size and
timeout. Expose the timeout as read only if your device doesn't support
controlling it per queue.
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-16 2:14 ` Jakub Kicinski
@ 2026-01-16 16:44 ` Haiyang Zhang
2026-01-17 16:58 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-16 16:44 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Thursday, January 15, 2026 9:15 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Thu, 15 Jan 2026 19:57:44 +0000 Haiyang Zhang wrote:
> > > > When coalescing is enabled, the device waits for packets which can
> > > > have the CQE coalesced with previous packet(s). That coalescing
> process
> > > > is finished (and a CQE written to the appropriate CQ) when the CQE
> is
> > > > filled with 4 pkts, or time expired, or other device specific logic
> is
> > > > satisfied.
> > >
> > > See, what I'm afraid is happening here is that you are enabling
> > > completion coalescing (how long the device keeps the CQE pending).
> > > Which is _not_ what rx_max_coalesced_frames controls for most NICs.
> > > For most NICs rx_max_coalesced_frames controls IRQ generation logic.
> > >
> > > The NIC first buffers up CQEs for typically single digit usecs, and
> > > then once CQE timer expired and writeback happened it starts an IRQ
> > > coalescing timer. Once the IRQ coalescing timer expires IRQ is
> > > triggered, which schedules NAPI. (broad strokes, obviously many
> > > differences and optimizations exist)
> > >
> > > Is my guess correct? Are you controlling CQE coalescing?
> > >
> > > Can you control the timeout instead of the frame count?
> >
> > Our NIC's timeout value cannot be controlled by driver. Also, the
> > timeout may be changed in future NIC HW.
> >
> > So, I use the ethtool/rx-frames, which is either 1 or 4 on our
> > NIC, to switch the CQE coalescing feature on/off.
>
> I feel like this is not the first time I'm having a conversation with
> you where you are not answering my direct questions, not just one
> sliver. IDK why you're doing this, but being able to participate
> in an email exchange is a bare minimum for participating upstream.
> Please consider this a warning.
Sure, let me try to reply again -- does this (see below) answer all
your questions? And, feel free to ask any further questions, we are
willing to collaborate with you and other upstream people at any time :)
> The NIC first buffers up CQEs for typically single digit usecs, and
> then once CQE timer expired and writeback happened it starts an IRQ
> coalescing timer. Once the IRQ coalescing timer expires IRQ is
> triggered, which schedules NAPI. (broad strokes, obviously many
> differences and optimizations exist)
> Is my guess correct? Are you controlling CQE coalescing?
Yes, it's correct. And we are controlling "CQE coalescing".
>
> If I interpret your reply correctly you are indeed coalescing writeback.
Yes, we are coalescing CQE writeback.
> You need to add a new param to the uAPI.
Since this feature is not common to other NICs, can we use an
ethtool private flag instead?
When the flag is set, the CQE coalescing will be enabled and put
up to 4 pkts in a CQE.
> Please add both size and
> timeout. Expose the timeout as read only if your device doesn't support
> controlling it per queue.
Does the "size" mean the max pkts per CQE (1 or 4)?
The timeout value is not even exposed to driver, and subject to change
in the future. Also the HW mechanism is proprietary... So, can we not
"expose" the timeout value in "ethtool -c" outputs, because it's not
available at driver level?
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-16 16:44 ` Haiyang Zhang
@ 2026-01-17 16:58 ` Jakub Kicinski
2026-01-17 18:01 ` Haiyang Zhang
0 siblings, 1 reply; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-17 16:58 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Fri, 16 Jan 2026 16:44:33 +0000 Haiyang Zhang wrote:
> > You need to add a new param to the uAPI.
>
> Since this feature is not common to other NICs, can we use an
> ethtool private flag instead?
It's extremely common. Descriptor writeback at the granularity of one
packet would kill PCIe performance. We just don't have uAPI so NICs
either don't expose the knob or "reuse" another coalescing param.
> When the flag is set, the CQE coalescing will be enabled and put
> up to 4 pkts in a CQE.
>
> > Please add both size and
> > timeout. Expose the timeout as read only if your device doesn't support
> > controlling it per queue.
>
> Does the "size" mean the max pks per CQE (1 or 4)?
The definition of "size" is always a little funny when it comes to
coalescing and ringparam. In Tx does one frame mean one wire frame
or one TSO superframe? I wouldn't worry about the exact meaning of
size too much. Important thing is that user knows what making this
param smaller or larger will do.
> The timeout value is not even exposed to driver, and subject to change
> in the future. Also the HW mechanism is proprietary... So, can we not
> "expose" the timeout value in "ethtool -c" outputs, because it's not
> available at driver level?
Add it to the FW API and have FW send the current value to the driver?
You were concerned (in the commit msg) that there's a latency cost,
which is fair but I think for 99% of users 2usec is absolutely
not detectable (it takes longer for the CPU to wake). So I think it'd
be very valuable to the user to understand the order of magnitude of
latency we're talking about here.
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-17 16:58 ` Jakub Kicinski
@ 2026-01-17 18:01 ` Haiyang Zhang
2026-01-17 22:48 ` Jakub Kicinski
0 siblings, 1 reply; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-17 18:01 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Saturday, January 17, 2026 11:59 AM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Fri, 16 Jan 2026 16:44:33 +0000 Haiyang Zhang wrote:
> > > You need to add a new param to the uAPI.
> >
> > Since this feature is not common to other NICs, can we use an
> > ethtool private flag instead?
>
> It's extremely common. Descriptor writeback at the granularity of one
> packet would kill PCIe performance. We just don't have uAPI so NICs
> either don't expose the knob or "reuse" another coalescing param.
I see. So how about adding a new param like below to "ethtool -C"?
ethtool -C|--coalesce devname [rx-cqe-coalesce on|off]
> > When the flag is set, the CQE coalescing will be enabled and put
> > up to 4 pkts in a CQE.
> >
> > > Please add both size and
> > > timeout. Expose the timeout as read only if your device doesn't
> support
> > > controlling it per queue.
> >
> > Does the "size" mean the max pks per CQE (1 or 4)?
>
> The definition of "size" is always a little funny when it comes to
> coalescing and ringparam. In Tx does one frame mean one wire frame
> or one TSO superframe? I wouldn't worry about the exact meaning of
> size too much. Important thing is that user knows what making this
> param smaller or larger will do.
In "ethtool -c" output, add a new value like this?
rx-cqe-frames: (1 or 4 frames/CQE for this NIC)
> > The timeout value is not even exposed to driver, and subject to change
> > in the future. Also the HW mechanism is proprietary... So, can we not
> > "expose" the timeout value in "ethtool -c" outputs, because it's not
> > available at driver level?
>
> Add it to the FW API and have FW send the current value to the driver?
I don't know where the timeout value is in the HW / FW layers. Adding
new info to the HW/FW API needs other team's approval, and their work,
which will need a complex process and a long time.
> You were concerned (in the commit msg) that there's a latency cost,
> which is fair but I think for 99% of users 2usec is absolutely
> not detectable (it takes longer for the CPU to wake). So I think it'd
> be very valuable to the user to understand the order of magnitude of
> latency we're talking about here.
For now, may I document the 2us in the patch description? And add a
new item to the "ethtool -c" output, like "rx-cqe-usecs", label it as
"n/a" for now, while we work out with other teams on the time value
API at HW/FW layers? So, this CQE coalescing feature support won't be
blocked by this "2usec" info API for a long time?
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-17 18:01 ` Haiyang Zhang
@ 2026-01-17 22:48 ` Jakub Kicinski
2026-01-18 18:31 ` Haiyang Zhang
2026-02-22 21:32 ` Haiyang Zhang
0 siblings, 2 replies; 23+ messages in thread
From: Jakub Kicinski @ 2026-01-17 22:48 UTC (permalink / raw)
To: Haiyang Zhang
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
On Sat, 17 Jan 2026 18:01:18 +0000 Haiyang Zhang wrote:
> > > Since this feature is not common to other NICs, can we use an
> > > ethtool private flag instead?
> >
> > It's extremely common. Descriptor writeback at the granularity of one
> > packet would kill PCIe performance. We just don't have uAPI so NICs
> > either don't expose the knob or "reuse" another coalescing param.
>
> I see. So how about adding a new param like below to "ethtool -C"?
> ethtool -C|--coalesce devname [rx-cqe-coalesce on|off]
I don't think we need on / off, just the params.
If someone needs on / off - setting the size to 1 is basically off.
> > > When the flag is set, the CQE coalescing will be enabled and put
> > > up to 4 pkts in a CQE.
> > > Does the "size" mean the max pks per CQE (1 or 4)?
> [...]
>
> In "ethtool -c" output, add a new value like this?
> rx-cqe-frames: (1 or 4 frames/CQE for this NIC)
SG
> > > The timeout value is not even exposed to driver, and subject to change
> > > in the future. Also the HW mechanism is proprietary... So, can we not
> > > "expose" the timeout value in "ethtool -c" outputs, because it's not
> > > available at driver level?
> >
> > Add it to the FW API and have FW send the current value to the driver?
>
> I don't know where is the timeout value in the HW / FW layers. Adding
> new info to the HW/FW API needs other team's approval, and their work,
> which will need a complex process and a long time.
>
> > You were concerned (in the commit msg) that there's a latency cost,
> > which is fair but I think for 99% of users 2usec is absolutely
> > not detectable (it takes longer for the CPU to wake). So I think it'd
> > be very valuable to the user to understand the order of magnitude of
> > latency we're talking about here.
>
> For now, may I document the 2us in the patch description? And add a
> new item to the "ethtool -c" output, like "rx-cqe-usecs", label is as
> "n/a" for now, while we work out with other teams on the time value
> API at HW/FW layers? So, this CQE coalescing feature support won't be
> blocked by this "2usec" info API for a long time?
Please do it right. We are in no rush upstream. It can't be that hard
to add a single API to the FW within a single organization..
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-17 22:48 ` Jakub Kicinski
@ 2026-01-18 18:31 ` Haiyang Zhang
2026-02-22 21:32 ` Haiyang Zhang
1 sibling, 0 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-01-18 18:31 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Saturday, January 17, 2026 5:49 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Sat, 17 Jan 2026 18:01:18 +0000 Haiyang Zhang wrote:
> > > > Since this feature is not common to other NICs, can we use an
> > > > ethtool private flag instead?
> > >
> > > It's extremely common. Descriptor writeback at the granularity of one
> > > packet would kill PCIe performance. We just don't have uAPI so NICs
> > > either don't expose the knob or "reuse" another coalescing param.
> >
> > I see. So how about adding a new param like below to "ethtool -C"?
> > ethtool -C|--coalesce devname [rx-cqe-coalesce on|off]
>
> I don't think we need on / off, just the params.
> If someone needs on / off setting - the size to 1 is basically off.
Ok --
I will add a numerical param "rx-cqe-frames" to "ethtool -C":
ethtool -C|--coalesce devname [rx-cqe-frames N]
//Accepts 1 or 4 frames/CQE for this NIC
>
> > > > When the flag is set, the CQE coalescing will be enabled and put
> > > > up to 4 pkts in a CQE.
> > > > Does the "size" mean the max pks per CQE (1 or 4)?
> > [...]
> >
> > In "ethtool -c" output, add a new value like this?
> > rx-cqe-frames: (1 or 4 frames/CQE for this NIC)
>
> SG
Thanks.
> > > > The timeout value is not even exposed to driver, and subject to
> change
> > > > in the future. Also the HW mechanism is proprietary... So, can we
> not
> > > > "expose" the timeout value in "ethtool -c" outputs, because it's not
> > > > available at driver level?
> > >
> > > Add it to the FW API and have FW send the current value to the driver?
> >
> > I don't know where is the timeout value in the HW / FW layers. Adding
> > new info to the HW/FW API needs other team's approval, and their work,
> > which will need a complex process and a long time.
> >
> > > You were concerned (in the commit msg) that there's a latency cost,
> > > which is fair but I think for 99% of users 2usec is absolutely
> > > not detectable (it takes longer for the CPU to wake). So I think it'd
> > > be very valuable to the user to understand the order of magnitude of
> > > latency we're talking about here.
> >
> > For now, may I document the 2us in the patch description? And add a
> > new item to the "ethtool -c" output, like "rx-cqe-usecs", label is as
> > "n/a" for now, while we work out with other teams on the time value
> > API at HW/FW layers? So, this CQE coalescing feature support won't be
> > blocked by this "2usec" info API for a long time?
>
> Please do it right. We are in no rush upstream. It can't be that hard
> to add a single API to the FW within a single organization..
I will discuss this with our HW/FW teams.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
2026-01-17 22:48 ` Jakub Kicinski
2026-01-18 18:31 ` Haiyang Zhang
@ 2026-02-22 21:32 ` Haiyang Zhang
1 sibling, 0 replies; 23+ messages in thread
From: Haiyang Zhang @ 2026-02-22 21:32 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Wei Liu, Dexuan Cui,
Long Li, Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Shradha Gupta, Saurabh Sengar, Aditya Garg, Dipayaan Roy,
Shiraz Saleem, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org, Paul Rosswurm
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Saturday, January 17, 2026 5:49 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Simon Horman <horms@kernel.org>; Erni
> Sri Satya Vennela <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: Re: [EXTERNAL] Re: [PATCH V2,net-next, 1/2] net: mana: Add
> support for coalesced RX packets on CQE
>
> On Sat, 17 Jan 2026 18:01:18 +0000 Haiyang Zhang wrote:
> > > > Since this feature is not common to other NICs, can we use an
> > > > ethtool private flag instead?
> > >
> > > It's extremely common. Descriptor writeback at the granularity of one
> > > packet would kill PCIe performance. We just don't have uAPI so NICs
> > > either don't expose the knob or "reuse" another coalescing param.
> >
> > I see. So how about adding a new param like below to "ethtool -C"?
> > ethtool -C|--coalesce devname [rx-cqe-coalesce on|off]
>
> I don't think we need on / off, just the params.
> If someone needs on / off setting - the size to 1 is basically off.
>
> > > > When the flag is set, the CQE coalescing will be enabled and put
> > > > up to 4 pkts in a CQE.
> > > > Does the "size" mean the max pks per CQE (1 or 4)?
> > [...]
> >
> > In "ethtool -c" output, add a new value like this?
> > rx-cqe-frames: (1 or 4 frames/CQE for this NIC)
>
> SG
>
> > > > The timeout value is not even exposed to driver, and subject to
> change
> > > > in the future. Also the HW mechanism is proprietary... So, can we
> not
> > > > "expose" the timeout value in "ethtool -c" outputs, because it's not
> > > > available at driver level?
> > >
> > > Add it to the FW API and have FW send the current value to the driver?
> >
> > I don't know where is the timeout value in the HW / FW layers. Adding
> > new info to the HW/FW API needs other team's approval, and their work,
> > which will need a complex process and a long time.
> >
> > > You were concerned (in the commit msg) that there's a latency cost,
> > > which is fair but I think for 99% of users 2usec is absolutely
> > > not detectable (it takes longer for the CPU to wake). So I think it'd
> > > be very valuable to the user to understand the order of magnitude of
> > > latency we're talking about here.
> >
> > For now, may I document the 2us in the patch description? And add a
> > new item to the "ethtool -c" output, like "rx-cqe-usecs", label is as
> > "n/a" for now, while we work out with other teams on the time value
> > API at HW/FW layers? So, this CQE coalescing feature support won't be
> > blocked by this "2usec" info API for a long time?
>
> Please do it right. We are in no rush upstream. It can't be that hard
> to add a single API to the FW within a single organization..
I have sent out a patch to add two parameters for ethtool:
COALESCE_RX_CQE_FRAMES/NSECS
I will send out ethtool user cmd patch, and driver patches later, after
the new parameters are added to kernel.
Thanks,
- Haiyang
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2026-02-22 21:32 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-06 20:46 [PATCH V2,net-next, 0/2] net: mana: Add support for coalesced RX packets Haiyang Zhang
2026-01-06 20:46 ` [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE Haiyang Zhang
2026-01-06 21:50 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
2026-01-12 21:01 ` [EXTERNAL] " Haiyang Zhang
2026-01-13 1:21 ` Jakub Kicinski
2026-01-13 15:09 ` Haiyang Zhang
2026-01-13 15:13 ` Haiyang Zhang
2026-01-14 1:09 ` Jakub Kicinski
2026-01-14 18:27 ` Haiyang Zhang
2026-01-15 2:54 ` Jakub Kicinski
2026-01-15 19:57 ` Haiyang Zhang
2026-01-16 2:14 ` Jakub Kicinski
2026-01-16 16:44 ` Haiyang Zhang
2026-01-17 16:58 ` Jakub Kicinski
2026-01-17 18:01 ` Haiyang Zhang
2026-01-17 22:48 ` Jakub Kicinski
2026-01-18 18:31 ` Haiyang Zhang
2026-02-22 21:32 ` Haiyang Zhang
2026-01-06 20:46 ` [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type Haiyang Zhang
2026-01-06 22:10 ` Long Li
2026-01-10 1:56 ` Jakub Kicinski
2026-01-12 21:03 ` [EXTERNAL] " Haiyang Zhang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox