* [PATCH net-next v4 5/6] net: mana: Allocate interrupt context for each EQ when creating vPort
From: Long Li @ 2026-03-20 23:54 UTC (permalink / raw)
To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
Dexuan Cui
Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <cover.1774049761.git.longli@microsoft.com>
Use GIC functions to create a dedicated interrupt context or acquire a
shared interrupt context for each EQ when setting up a vPort.
Signed-off-by: Long Li <longli@microsoft.com>
---
drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +-
drivers/net/ethernet/microsoft/mana/mana_en.c | 17 ++++++++++++++++-
include/net/mana/gdma.h | 1 +
3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index e7d5e589a217..34b19e0740e1 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -826,7 +826,6 @@ static void mana_gd_deregister_irq(struct gdma_queue *queue)
}
spin_unlock_irqrestore(&gic->lock, flags);
- queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
synchronize_rcu();
}
@@ -941,6 +940,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
out:
dev_err(dev, "Failed to create EQ: %d\n", err);
mana_gd_destroy_eq(gc, false, queue);
+ queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
return err;
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 87a444a6c297..22444c7530a5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1606,6 +1606,7 @@ void mana_destroy_eq(struct mana_port_context *apc)
struct gdma_context *gc = ac->gdma_dev->gdma_context;
struct gdma_queue *eq;
int i;
+ unsigned int msi;
if (!apc->eqs)
return;
@@ -1618,7 +1619,9 @@ void mana_destroy_eq(struct mana_port_context *apc)
if (!eq)
continue;
+ msi = eq->eq.msix_index;
mana_gd_destroy_queue(gc, eq);
+ mana_gd_put_gic(gc, !gc->msi_sharing, msi);
}
kfree(apc->eqs);
@@ -1635,6 +1638,7 @@ static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
+ debugfs_create_u32("irq", 0400, eq.mana_eq_debugfs, &eq.eq->eq.irq);
debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
}
@@ -1645,6 +1649,7 @@ int mana_create_eq(struct mana_port_context *apc)
struct gdma_queue_spec spec = {};
int err;
int i;
+ struct gdma_irq_context *gic;
WARN_ON(apc->eqs);
apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
@@ -1661,12 +1666,22 @@ int mana_create_eq(struct mana_port_context *apc)
apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs);
for (i = 0; i < apc->num_queues; i++) {
- spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+ if (gc->msi_sharing)
+ spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+
+ gic = mana_gd_get_gic(gc, !gc->msi_sharing, &spec.eq.msix_index);
+ if (!gic) {
+ err = -ENOMEM;
+ goto out;
+ }
+
err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
if (err) {
dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
+ mana_gd_put_gic(gc, !gc->msi_sharing, spec.eq.msix_index);
goto out;
}
+ apc->eqs[i].eq->eq.irq = gic->irq;
mana_create_eq_debugfs(apc, i);
}
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 4614a6a7271b..84f85b2299b4 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -342,6 +342,7 @@ struct gdma_queue {
void *context;
unsigned int msix_index;
+ unsigned int irq;
u32 log2_throttle_limit;
} eq;
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v4 6/6] RDMA/mana_ib: Allocate interrupt contexts on EQs
From: Long Li @ 2026-03-20 23:54 UTC (permalink / raw)
To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
Dexuan Cui
Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <cover.1774049761.git.longli@microsoft.com>
Use the GIC functions to allocate interrupt contexts for RDMA EQs. These
interrupt contexts may be shared with Ethernet EQs when MSI-X vectors
are limited.
The driver now supports allocating dedicated MSI-X for each EQ. Indicate
this capability through driver capability bits.
Signed-off-by: Long Li <longli@microsoft.com>
---
drivers/infiniband/hw/mana/main.c | 33 ++++++++++++++++++++++++++-----
include/net/mana/gdma.h | 7 +++++--
2 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index d51dd0ee85f4..0b74dd093b41 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -787,6 +787,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
{
struct gdma_context *gc = mdev_to_gc(mdev);
struct gdma_queue_spec spec = {};
+ struct gdma_irq_context *gic;
int err, i;
spec.type = GDMA_EQ;
@@ -797,9 +798,15 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
spec.eq.msix_index = 0;
+ gic = mana_gd_get_gic(gc, false, &spec.eq.msix_index);
+ if (!gic)
+ return -ENOMEM;
+
err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq);
- if (err)
+ if (err) {
+ mana_gd_put_gic(gc, false, 0);
return err;
+ }
mdev->eqs = kzalloc_objs(struct gdma_queue *,
mdev->ib_dev.num_comp_vectors);
@@ -810,31 +817,47 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
spec.eq.callback = NULL;
for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+
+ gic = mana_gd_get_gic(gc, false, &spec.eq.msix_index);
+ if (!gic) {
+ err = -ENOMEM;
+ goto destroy_eqs;
+ }
+
err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]);
- if (err)
+ if (err) {
+ mana_gd_put_gic(gc, false, spec.eq.msix_index);
goto destroy_eqs;
+ }
}
return 0;
destroy_eqs:
- while (i-- > 0)
+ while (i-- > 0) {
mana_gd_destroy_queue(gc, mdev->eqs[i]);
+ mana_gd_put_gic(gc, false, (i + 1) % gc->num_msix_usable);
+ }
kfree(mdev->eqs);
destroy_fatal_eq:
mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+ mana_gd_put_gic(gc, false, 0);
return err;
}
void mana_ib_destroy_eqs(struct mana_ib_dev *mdev)
{
struct gdma_context *gc = mdev_to_gc(mdev);
- int i;
+ int i, msi;
mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+ mana_gd_put_gic(gc, false, 0);
- for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++)
+ for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
mana_gd_destroy_queue(gc, mdev->eqs[i]);
+ msi = (i + 1) % gc->num_msix_usable;
+ mana_gd_put_gic(gc, false, msi);
+ }
kfree(mdev->eqs);
}
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 84f85b2299b4..9faa072e779e 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -615,6 +615,7 @@ enum {
#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
+#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
/* Driver can handle holes (zeros) in the device list */
#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
@@ -631,7 +632,8 @@ enum {
/* Driver detects stalled send queues and recovers them */
#define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
-#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
+/* Driver supports separate EQ/MSIs for each vPort */
+#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19)
/* Driver supports linearizing the skb when num_sge exceeds hardware limit */
#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
@@ -659,7 +661,8 @@ enum {
GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
- GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
+ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY | \
+ GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT)
#define GDMA_DRV_CAP_FLAGS2 0
--
2.43.0
^ permalink raw reply related
* [PATCH rdma] RDMA/mana_ib: Disable RX steering on RSS QP destroy
From: Long Li @ 2026-03-21 0:28 UTC (permalink / raw)
To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
Dexuan Cui
Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel,
stable
When an RSS QP is destroyed (e.g. DPDK exit), mana_ib_destroy_qp_rss()
destroys the RX WQ objects but does not disable vPort RX steering in
firmware. This leaves stale steering configuration that still points to
the destroyed RX objects.
If traffic continues to arrive (e.g. peer VM is still transmitting) and
the VF interface is subsequently brought up (mana_open), the firmware
may deliver completions using stale CQ IDs from the old RX objects.
These CQ IDs can be reused by the ethernet driver for new TX CQs,
causing RX completions to land on TX CQs:
WARNING: mana_poll_tx_cq+0x1b8/0x220 [mana] (is_sq == false)
WARNING: mana_gd_process_eq_events+0x209/0x290 (cq_table lookup fails)
Fix this by disabling vPort RX steering before destroying RX WQ objects.
Note that mana_fence_rqs() cannot be used here because the fence
completion is delivered on the CQ, which is polled by user-mode (e.g.
DPDK) and not visible to the kernel driver.
Refactor the disable logic into a shared mana_disable_vport_rx() in
mana_en, exported for use by mana_ib, replacing the duplicate code.
The ethernet driver's mana_dealloc_queues() is also updated to call
this common function.
Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter")
Cc: stable@vger.kernel.org
Signed-off-by: Long Li <longli@microsoft.com>
---
drivers/infiniband/hw/mana/qp.c | 17 ++++++++++++++++-
drivers/net/ethernet/microsoft/mana/mana_en.c | 11 ++++++++++-
include/net/mana/mana.h | 1 +
3 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 80cf4ade4b75..b27084c53a14 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -829,11 +829,26 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
struct net_device *ndev;
struct mana_ib_wq *wq;
struct ib_wq *ibwq;
- int i;
+ int i, err;
ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port);
mpc = netdev_priv(ndev);
+ /* Disable vPort RX steering before destroying RX WQ objects.
+ * Otherwise firmware still routes traffic to the destroyed queues,
+ * which can cause bogus completions on reused CQ IDs when the
+ * ethernet driver later creates new queues on mana_open().
+ *
+ * Unlike the ethernet teardown path, mana_fence_rqs() cannot be
+ * used here because the fence completion CQE is delivered on the
+ * CQ which is polled by userspace (e.g. DPDK), so there is no way
+ * for the kernel to wait for fence completion.
+ */
+ err = mana_disable_vport_rx(mpc);
+ if (err)
+ ibdev_err(&mdev->ib_dev,
+ "Failed to disable vPort RX: %d\n", err);
+
for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
ibwq = ind_tbl->ind_tbl[i];
wq = container_of(ibwq, struct mana_ib_wq, ibwq);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 22444c7530a5..51719ef1c09b 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2934,6 +2934,13 @@ static void mana_rss_table_init(struct mana_port_context *apc)
ethtool_rxfh_indir_default(i, apc->num_queues);
}
+int mana_disable_vport_rx(struct mana_port_context *apc)
+{
+ return mana_cfg_vport_steering(apc, TRI_STATE_FALSE, false, false,
+ false);
+}
+EXPORT_SYMBOL_NS(mana_disable_vport_rx, "NET_MANA");
+
int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
bool update_hash, bool update_tab)
{
@@ -3339,10 +3346,12 @@ static int mana_dealloc_queues(struct net_device *ndev)
*/
apc->rss_state = TRI_STATE_FALSE;
- err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
+ err = mana_disable_vport_rx(apc);
if (err && mana_en_need_log(apc, err))
netdev_err(ndev, "Failed to disable vPort: %d\n", err);
+ mana_fence_rqs(apc);
+
/* Even in err case, still need to cleanup the vPort */
mana_destroy_rxqs(apc);
mana_destroy_txq(apc);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 204c2b612a62..2634e9135eed 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -574,6 +574,7 @@ struct mana_port_context {
netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
bool update_hash, bool update_tab);
+int mana_disable_vport_rx(struct mana_port_context *apc);
int mana_alloc_queues(struct net_device *ndev);
int mana_attach(struct net_device *ndev);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
From: Jakub Kicinski @ 2026-03-21 0:29 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
In-Reply-To: <ab2T8LgRiDHDIUHV@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
On Fri, 20 Mar 2026 11:37:36 -0700 Dipayaan Roy wrote:
> On Sat, Mar 14, 2026 at 12:50:53PM -0700, Jakub Kicinski wrote:
> > On Tue, 10 Mar 2026 21:00:49 -0700 Dipayaan Roy wrote:
> > > On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
> > > fragments for RX buffers results in a significant throughput regression.
> > > Profiling reveals that this regression correlates with high overhead in the
> > > fragment allocation and reference counting paths on these specific
> > > platforms, rendering the multi-buffer-per-page strategy counterproductive.
> >
> > Can you say more ? We could technically take two references on the page
> > right away if MTU is small and avoid some of the cost.
>
> There is a 15-20% shortfall in achieving line rate for MANA (180+ Gbps)
> on a particular ARM64 SKU. The issue is only specific to this processor SKU —
> not seen on other ARM64 SKUs (e.g., GB200) or x86 SKUs. Critically, the
> regression only manifests beyond 16 TCP connections, which strongly indicates
> that it is seen when there is high contention and traffic.
>
> no. of | rx buf backed | rx buf backed
> connections | with page fragments | with full page
> -------------+---------------------+---------------
> 4 | 139 Gbps | 138 Gbps
> 8 | 140 Gbps | 162 Gbps
> 16 | 186 Gbps | 186 Gbps
These results look a bit odd, 4 and 16 streams have the same perf,
while all other cases indeed show a delta. What I was hoping for was
a more precise attribution of the performance issue. Like perf top
showing that it's indeed the atomic ops on the refcount that stall.
> 32 | 136 Gbps | 183 Gbps
> 48 | 159 Gbps | 185 Gbps
> 64 | 165 Gbps | 184 Gbps
> 128 | 170 Gbps | 180 Gbps
>
> HW team is still working to RCA this hw behaviour.
>
> Regarding "We could technically take two references on the page right
> away", are you suggesting moving the page reference counting logic into
> the driver instead of relying on page pool?
Yes, either that or adjust the page pool APIs.
page_pool_alloc_frag_netmem() currently sets the refcount to BIAS
which it then has to subtract later. So we get:
set(BIAS)
.. driver allocates chunks ..
sub(BIAS_MAX - pool->frag_users)
Instead of using BIAS we could make the page pool guess that the caller
will keep asking for the same frame size. So initially take
(PAGE_SIZE/size) references.
> > The driver doesn't seem to set skb->truesize accordingly after this
> > change. So you're lying to the stack about how much memory each packet
> > consumes. This is a blocker for the change.
> >
> ACK. I will send out a separate patch with a Fixes tag to fix the skb
> truesize.
>
> > > To mitigate this, bypass the page_pool fragment path and force a single RX
> > > packet per page allocation when all the following conditions are met:
> > > 1. The system is configured with a 4K PAGE_SIZE.
> > > 2. A processor-specific quirk is detected via SMBIOS Type 4 data.
> >
> > I don't think we want the kernel to be in the business of carrying
> > matching on platform names and providing optimal config by default.
> > This sort of logic needs to live in user space or the hypervisor
> > (which can then pass a single bit to the driver to enable the behavior)
> >
> As per our internal discussion the hypervisor cannot provide the CPU
> version info (in VM as well as in bare metal offerings).
Why? I suppose it's much more effort for you but it's much more effort
for the community to carry the workaround. So..
> On handling it from the user side, are you suggesting introducing a new
> ethtool private flag and having udev rules for the driver that set the
> private flag and switch to full-page RX buffers? Given the wide number of
> distros we support, this might be harder to maintain/backport.
>
> Also, the DMI parsing design was influenced by other net wireless
> drivers such as /wireless/ath/ath10k/core.c. If this approach is not
> acceptable for the MANA driver then we will have to take an alternate
> route based on the discussion right above it.
Plenty of ugly hacks in the kernel, it's no excuse.
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources
From: Long Li @ 2026-03-21 0:49 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Jason Gunthorpe, Konstantin Taranov, Jakub Kicinski,
David S . Miller, Paolo Abeni, Eric Dumazet, Andrew Lunn,
Haiyang Zhang, KY Srinivasan, Wei Liu, Dexuan Cui, Simon Horman,
netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260318144927.GB352386@unreal>
> On Tue, Mar 17, 2026 at 11:43:49PM +0000, Long Li wrote:
> > >
> > > On Fri, Mar 13, 2026 at 01:59:28PM -0300, Jason Gunthorpe wrote:
> > > > On Sat, Mar 07, 2026 at 07:38:14PM +0200, Leon Romanovsky wrote:
> > > > > On Fri, Mar 06, 2026 at 05:47:14PM -0800, Long Li wrote:
> > > > > > When the MANA hardware undergoes a service reset, the ETH
> > > > > > auxiliary device
> > > > > > (mana.eth) used by DPDK persists across the reset cycle — it
> > > > > > is not removed and re-added like RC/UD/GSI QPs. This means
> > > > > > userspace RDMA consumers such as DPDK have no way of knowing
> > > > > > that firmware handles for their PD, CQ, WQ, QP and MR resources
> > > > > > have become stale.
> > > > >
> > > > > NAK to any of this.
> > > > >
> > > > > In case of hardware reset, mana_ib AUX device needs to be
> > > > > destroyed and recreated later.
> > > >
> > > > Yeah, that is our general model for any serious RAS event where
> > > > the driver's view of resources becomes out of sync with the HW.
> > > >
> > > > You have to tear down the ib_device by removing the aux and then
> > > > bring back a new one.
> > > >
> > > > There is an IB_EVENT_DEVICE_FATAL, but the purpose of that event
> > > > is to tell userspace to close and re-open their uverbs FD.
> > > >
> > > > We don't have a model where a uverbs FD in userspace can continue
> > > > to work after the device has a catastrophic RAS event.
> > > >
> > > > There may be room to have a model where the ib device doesn't
> > > > fully unplug/replug so it retains its name and things, but that is
> > > > core code not driver stuff.
> > >
> > > Good luck with that model. It is going to break RDMA-CM hotplug support.
> > >
> >
> > I think we can preserve RDMA-CM behavior without requiring ib_device
> > unregister/re-register.
> >
> > On device reset, the driver can dispatch IB_EVENT_DEVICE_FATAL (or a
> > new reset event) through ib_dispatch_event(). RDMA-CM already handles
> > device events — we would add a handler that iterates all rdma_cm_ids
> > on the device and sends RDMA_CM_EVENT_DEVICE_REMOVAL to each, same
> > as cma_process_remove() does today. The difference: cma_device stays
> > alive, so applications can reconnect on the same device after recovery
> > instead of waiting for a new one to appear.
> >
> > The motivation for keeping ib_device alive is that some RDMA consumers
> > — DPDK and NCCL — don't use RDMA-CM at all. They use raw verbs and
> > manage QP state themselves.
>
> RDMA-CM provides an "external QP" model where the QP is managed by the
> rdma-cm user.
>
> As Jason noted, you should propose the core changes together with the
> corresponding librdmacm updates. The final result must ensure that legacy
> applications continue to function correctly with the new kernel.
>
> Thanks
Will send RFC patches.
Thank you,
Long
^ permalink raw reply
* RE: [EXTERNAL] [PATCH] net: mana: fix use-after-free in add_adev() error path
From: Long Li @ 2026-03-21 0:54 UTC (permalink / raw)
To: Guangshuo Li, KY Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Saurabh Sengar, Erni Sri Satya Vennela,
Shradha Gupta, Dipayaan Roy, Aditya Garg, Shiraz Saleem,
Leon Romanovsky, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
In-Reply-To: <20260318154041.638747-1-lgs201920130244@gmail.com>
> If auxiliary_device_add() fails, add_adev() calls auxiliary_device_uninit(adev),
> whose release callback adev_release() frees the containing struct mana_adev.
>
> The current error path then falls through to init_fail and accesses
> adev->id. Since adev is embedded in struct mana_adev, this may lead
> to a use-after-free.
>
> Fix it by storing the allocated auxiliary device id in a local variable and using that
> saved id in the cleanup path after auxiliary_device_uninit().
>
> Fixes: a69839d4327d ("net: mana: Add support for auxiliary device")
> Cc: stable@vger.kernel.org
> Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
Reviewed-by: Long Li <longli@microsoft.com>
Thank you.
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 1ad154f9db1a..70d71594c599 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -3362,6 +3362,7 @@ static int add_adev(struct gdma_dev *gd, const char
> *name) {
> struct auxiliary_device *adev;
> struct mana_adev *madev;
> + int id;
> int ret;
>
> madev = kzalloc(sizeof(*madev), GFP_KERNEL); @@ -3372,7 +3373,8 @@
> static int add_adev(struct gdma_dev *gd, const char *name)
> ret = mana_adev_idx_alloc();
> if (ret < 0)
> goto idx_fail;
> - adev->id = ret;
> + id = ret;
> + adev->id = id;
>
> adev->name = name;
> adev->dev.parent = gd->gdma_context->dev; @@ -3398,7 +3400,7 @@
> static int add_adev(struct gdma_dev *gd, const char *name)
> auxiliary_device_uninit(adev);
>
> init_fail:
> - mana_adev_idx_free(adev->id);
> + mana_adev_idx_free(id);
>
> idx_fail:
> kfree(madev);
> --
> 2.43.0
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH rdma-next v2] RDMA/mana_ib: hardening: Clamp adapter capability values from MANA_IB_GET_ADAPTER_CAP
From: Long Li @ 2026-03-21 0:56 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Erni Sri Satya Vennela, Konstantin Taranov, Jason Gunthorpe,
linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org
In-Reply-To: <20260317094408.GR61385@unreal>
> Subject: Re: [PATCH rdma-next v2] RDMA/mana_ib: hardening:
> Clamp adapter capability values from MANA_IB_GET_ADAPTER_CAP
>
> On Mon, Mar 16, 2026 at 08:50:39PM +0000, Long Li wrote:
> > > On Thu, Mar 12, 2026 at 11:16:41AM -0700, Erni Sri Satya Vennela wrote:
> > > > As part of MANA hardening for CVM, clamp hardware-reported adapter
> > > > capability values from the MANA_IB_GET_ADAPTER_CAP response before
> > > > they are used by the IB subsystem.
> > > >
> > > > The response fields (max_qp_count, max_cq_count, max_mr_count,
> > > > max_pd_count, max_inbound_read_limit, max_outbound_read_limit,
> > > > max_qp_wr, max_send_sge_count, max_recv_sge_count) are u32 but are
> > > > assigned to signed int members in struct ib_device_attr. If
> > > > hardware returns a value exceeding INT_MAX, the implicit
> > > > u32-to-int conversion produces a negative value, which can cause
> > > > incorrect behavior in the IB core and userspace applications.
> > >
> > > This sentence does not make sense in the context of the Linux kernel.
> > > The fundamental assumption is that the underlying hardware behaves
> > > correctly, and driver code should not attempt to guard against
> > > purely hypothetical failures. The kernel only implements such
> > > self-protection when there is a documented hardware issue accompanied
> > > by official errata.
> > >
> > > Thanks
> >
> > The idea is that malicious hardware can't corrupt and steal other data
> > from the kernel.
> >
> > The assumption is that in a public cloud environment, you can't trust the
> > hardware 100%.
>
> You cannot separate functionality and claim that one line of code is trusted
> while another is not.
>
> Thanks
How about we rephrase it this way: the driver should not corrupt or overflow other parts of the kernel if its device is misbehaving (or has a bug).
Long
^ permalink raw reply
* Re: [PATCH v4 00/21] mm: expand mmap_prepare functionality and usage
From: Andrew Morton @ 2026-03-21 2:42 UTC (permalink / raw)
To: Lorenzo Stoakes (Oracle)
Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
Alexandre Torgue, Miquel Raynal, Richard Weinberger,
Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>
On Fri, 20 Mar 2026 22:39:26 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
> This series expands the mmap_prepare functionality, which is intended to
> replace the deprecated f_op->mmap hook which has been the source of bugs
> and security issues for some time.
Thanks, I updated mm-unstable to this version. Here's how that altered
mm.git:
Documentation/filesystems/mmap_prepare.rst | 4
include/linux/mm.h | 3
mm/internal.h | 27 ++++-
mm/util.c | 87 +++++++++----------
mm/vma.c | 41 +-------
tools/testing/vma/include/dup.h | 44 +--------
tools/testing/vma/include/stubs.h | 3
7 files changed, 80 insertions(+), 129 deletions(-)
--- a/Documentation/filesystems/mmap_prepare.rst~b
+++ a/Documentation/filesystems/mmap_prepare.rst
@@ -123,8 +123,8 @@ When implementing mmap_prepare(), refere
as a ``VMA_xxx_BIT`` macro, e.g. ``VMA_READ_BIT``, ``VMA_WRITE_BIT`` etc.,
and use one of (where ``desc`` is a pointer to struct vm_area_desc):
-* ``vma_desc_test_flags(desc, ...)`` - Specify a comma-separated list of flags
- you wish to test for (whether _any_ are set), e.g. - ``vma_desc_test_flags(
+* ``vma_desc_test_any(desc, ...)`` - Specify a comma-separated list of flags
+ you wish to test for (whether _any_ are set), e.g. - ``vma_desc_test_any(
desc, VMA_WRITE_BIT, VMA_MAYWRITE_BIT)`` - returns ``true`` if either are set,
otherwise ``false``.
* ``vma_desc_set_flags(desc, ...)`` - Update the VMA descriptor flags to set
--- a/include/linux/mm.h~b
+++ a/include/linux/mm.h
@@ -4394,8 +4394,7 @@ static inline void mmap_action_map_kerne
int mmap_action_prepare(struct vm_area_desc *desc);
int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action,
- bool rmap_lock_held);
+ struct mmap_action *action);
/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
--- a/mm/internal.h~b
+++ a/mm/internal.h
@@ -202,14 +202,6 @@ static inline void vma_close(struct vm_a
/* unmap_vmas is in mm/memory.c */
void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
-static inline void unmap_vma_locked(struct vm_area_struct *vma)
-{
- const size_t len = vma_pages(vma) << PAGE_SHIFT;
-
- mmap_assert_write_locked(vma->vm_mm);
- do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
-}
-
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
@@ -1826,6 +1818,25 @@ static inline int io_remap_pfn_range_pre
return 0;
}
+/*
+ * When we succeed an mmap action or just before we unmap a VMA on error, we
+ * need to ensure any rmap lock held is released. On unmap it's required to
+ * avoid a deadlock.
+ */
+static inline void maybe_rmap_unlock_action(struct vm_area_struct *vma,
+ struct mmap_action *action)
+{
+ struct file *file;
+
+ if (!action->hide_from_rmap_until_complete)
+ return;
+
+ VM_WARN_ON_ONCE(vma_is_anonymous(vma));
+ file = vma->vm_file;
+ i_mmap_unlock_write(file->f_mapping);
+ action->hide_from_rmap_until_complete = false;
+}
+
#ifdef CONFIG_MMU_NOTIFIER
static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep, unsigned int nr)
--- a/mm/util.c~b
+++ a/mm/util.c
@@ -1198,25 +1198,6 @@ void compat_set_desc_from_vma(struct vm_
}
EXPORT_SYMBOL(compat_set_desc_from_vma);
-static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
-{
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
- void *vm_private_data = vma->vm_private_data;
- int err;
-
- if (!vm_ops || !vm_ops->mapped)
- return 0;
-
- err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
- &vm_private_data);
- if (err)
- unmap_vma_locked(vma);
- else if (vm_private_data != vma->vm_private_data)
- vma->vm_private_data = vm_private_data;
-
- return err;
-}
-
/**
* __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows
* flexibility as to how the mmap_prepare callback is invoked, which is useful
@@ -1251,13 +1232,7 @@ int __compat_vma_mmap(struct vm_area_des
/* Update the VMA from the descriptor. */
compat_set_vma_from_desc(vma, desc);
/* Complete any specified mmap actions. */
- err = mmap_action_complete(vma, &desc->action,
- /*rmap_lock_held=*/false);
- if (err)
- return err;
-
- /* Invoke vm_ops->mapped callback. */
- return __compat_vma_mapped(desc->file, vma);
+ return mmap_action_complete(vma, &desc->action);
}
EXPORT_SYMBOL(__compat_vma_mmap);
@@ -1290,12 +1265,17 @@ EXPORT_SYMBOL(__compat_vma_mmap);
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
{
struct vm_area_desc desc;
+ struct mmap_action *action;
int err;
compat_set_desc_from_vma(&desc, file, vma);
err = vfs_mmap_prepare(file, &desc);
if (err)
return err;
+ action = &desc.action;
+
+ /* being invoked from .mmap means we don't have to enforce this. */
+ action->hide_from_rmap_until_complete = false;
return __compat_vma_mmap(&desc, vma);
}
@@ -1399,25 +1379,47 @@ again:
}
}
+static int call_vma_mapped(struct vm_area_struct *vma)
+{
+ const struct vm_operations_struct *vm_ops = vma->vm_ops;
+ void *vm_private_data = vma->vm_private_data;
+ int err;
+
+ if (!vm_ops || !vm_ops->mapped)
+ return 0;
+
+ err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_file, &vm_private_data);
+ if (err)
+ return err;
+
+ if (vm_private_data != vma->vm_private_data)
+ vma->vm_private_data = vm_private_data;
+ return 0;
+}
+
static int mmap_action_finish(struct vm_area_struct *vma,
- struct mmap_action *action, int err,
- bool rmap_lock_held)
+ struct mmap_action *action, int err)
{
- if (rmap_lock_held)
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ size_t len;
- if (!err) {
- if (action->success_hook)
- return action->success_hook(vma);
+ if (!err)
+ err = call_vma_mapped(vma);
+ if (!err && action->success_hook)
+ err = action->success_hook(vma);
+
+ /* do_munmap() might take rmap lock, so release if held. */
+ maybe_rmap_unlock_action(vma, action);
+ if (!err)
return 0;
- }
/*
* If an error occurs, unmap the VMA altogether and return an error. We
* only clear the newly allocated VMA, since this function is only
* invoked if we do NOT merge, so we only clean up the VMA we created.
*/
- unmap_vma_locked(vma);
+ len = vma_pages(vma) << PAGE_SHIFT;
+ do_munmap(current->mm, vma->vm_start, len, NULL);
if (action->error_hook) {
/* We may want to filter the error. */
err = action->error_hook(err);
@@ -1459,16 +1461,13 @@ EXPORT_SYMBOL(mmap_action_prepare);
* mmap_action_complete - Execute VMA descriptor action.
* @vma: The VMA to perform the action upon.
* @action: The action to perform.
- * @rmap_lock_held: Is the file rmap lock held?
*
* Similar to mmap_action_prepare().
*
* Return: 0 on success, or error, at which point the VMA will be unmapped.
*/
int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action,
- bool rmap_lock_held)
-
+ struct mmap_action *action)
{
int err = 0;
@@ -1489,8 +1488,7 @@ int mmap_action_complete(struct vm_area_
break;
}
- return mmap_action_finish(vma, action, err,
- rmap_lock_held);
+ return mmap_action_finish(vma, action, err);
}
EXPORT_SYMBOL(mmap_action_complete);
#else
@@ -1512,8 +1510,7 @@ int mmap_action_prepare(struct vm_area_d
EXPORT_SYMBOL(mmap_action_prepare);
int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action,
- bool rmap_lock_held)
+ struct mmap_action *action)
{
int err = 0;
@@ -1523,14 +1520,14 @@ int mmap_action_complete(struct vm_area_
case MMAP_REMAP_PFN:
case MMAP_IO_REMAP_PFN:
case MMAP_SIMPLE_IO_REMAP:
- casr MMAP_MAP_KERNEL_PAGES:
+ case MMAP_MAP_KERNEL_PAGES:
WARN_ON_ONCE(1); /* nommu cannot handle this. */
err = -EINVAL;
break;
}
- return mmap_action_finish(vma, action, err, rmap_lock_held);
+ return mmap_action_finish(vma, action, err);
}
EXPORT_SYMBOL(mmap_action_complete);
#endif
--- a/mm/vma.c~b
+++ a/mm/vma.c
@@ -38,8 +38,6 @@ struct mmap_state {
/* Determine if we can check KSM flags early in mmap() logic. */
bool check_ksm_early :1;
- /* If we map new, hold the file rmap lock on mapping. */
- bool hold_file_rmap_lock :1;
/* If .mmap_prepare changed the file, we don't need to pin. */
bool file_doesnt_need_get :1;
};
@@ -2530,10 +2528,12 @@ static int __mmap_new_file_vma(struct mm
*
* @map: Mapping state.
* @vmap: Output pointer for the new VMA.
+ * @action: Any mmap_prepare action that is still to complete.
*
* Returns: Zero on success, or an error.
*/
-static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
+static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap,
+ struct mmap_action *action)
{
struct vma_iterator *vmi = map->vmi;
int error = 0;
@@ -2582,7 +2582,7 @@ static int __mmap_new_vma(struct mmap_st
vma_start_write(vma);
vma_iter_store_new(vmi, vma);
map->mm->map_count++;
- vma_link_file(vma, map->hold_file_rmap_lock);
+ vma_link_file(vma, action->hide_from_rmap_until_complete);
/*
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
@@ -2649,8 +2649,6 @@ static int call_action_prepare(struct mm
if (err)
return err;
- if (desc->action.hide_from_rmap_until_complete)
- map->hold_file_rmap_lock = true;
return 0;
}
@@ -2731,30 +2729,6 @@ static bool can_set_ksm_flags_early(stru
return false;
}
-static int call_mapped_hook(struct mmap_state *map,
- struct vm_area_struct *vma)
-{
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
- void *vm_private_data = vma->vm_private_data;
- int err;
-
- if (!vm_ops || !vm_ops->mapped)
- return 0;
- err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_file, &vm_private_data);
- if (err) {
- if (map->hold_file_rmap_lock)
- i_mmap_unlock_write(vma->vm_file->f_mapping);
-
- unmap_vma_locked(vma);
- return err;
- }
- /* Update private data if changed. */
- if (vm_private_data != vma->vm_private_data)
- vma->vm_private_data = vm_private_data;
- return 0;
-}
-
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vma_flags_t vma_flags,
unsigned long pgoff, struct list_head *uf)
@@ -2794,7 +2768,7 @@ static unsigned long __mmap_region(struc
/* ...but if we can't, allocate a new VMA. */
if (!vma) {
- error = __mmap_new_vma(&map, &vma);
+ error = __mmap_new_vma(&map, &vma, &desc.action);
if (error)
goto unacct_error;
allocated_new = true;
@@ -2806,10 +2780,7 @@ static unsigned long __mmap_region(struc
__mmap_complete(&map, vma);
if (have_mmap_prepare && allocated_new) {
- error = mmap_action_complete(vma, &desc.action,
- map.hold_file_rmap_lock);
- if (!error)
- error = call_mapped_hook(&map, vma);
+ error = mmap_action_complete(vma, &desc.action);
if (error)
return error;
}
--- a/tools/testing/vma/include/dup.h~b
+++ a/tools/testing/vma/include/dup.h
@@ -1313,27 +1313,9 @@ static inline unsigned long vma_pages(co
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
-static inline void unmap_vma_locked(struct vm_area_struct *vma)
-{
- const size_t len = vma_pages(vma) << PAGE_SHIFT;
-
- mmap_assert_write_locked(vma->vm_mm);
- do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
-}
-
-static inline int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
{
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
- int err;
-
- if (!vm_ops->mapped)
- return 0;
-
- err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
- &vma->vm_private_data);
- if (err)
- unmap_vma_locked(vma);
- return err;
+ return file->f_op->mmap_prepare(desc);
}
static inline int __compat_vma_mmap(struct vm_area_desc *desc,
@@ -1348,35 +1330,27 @@ static inline int __compat_vma_mmap(stru
/* Update the VMA from the descriptor. */
compat_set_vma_from_desc(vma, desc);
/* Complete any specified mmap actions. */
- err = mmap_action_complete(vma, &desc->action,
- /*rmap_lock_held=*/false);
- if (err)
- return err;
-
- /* Invoke vm_ops->mapped callback. */
- return __compat_vma_mapped(desc->file, vma);
-}
-
-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
- return file->f_op->mmap_prepare(desc);
+ return mmap_action_complete(vma, &desc->action);
}
-static inline int compat_vma_mmap(struct file *file,
- struct vm_area_struct *vma)
+static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
{
struct vm_area_desc desc;
+ struct mmap_action *action;
int err;
compat_set_desc_from_vma(&desc, file, vma);
err = vfs_mmap_prepare(file, &desc);
if (err)
return err;
+ action = &desc.action;
+
+ /* being invoked from .mmap means we don't have to enforce this. */
+ action->hide_from_rmap_until_complete = false;
return __compat_vma_mmap(&desc, vma);
}
-
static inline void vma_iter_init(struct vma_iterator *vmi,
struct mm_struct *mm, unsigned long addr)
{
--- a/tools/testing/vma/include/stubs.h~b
+++ a/tools/testing/vma/include/stubs.h
@@ -87,8 +87,7 @@ static inline int mmap_action_prepare(st
}
static inline int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action,
- bool rmap_lock_held)
+ struct mmap_action *action)
{
return 0;
}
_
^ permalink raw reply
* [PATCH net v2] net: mana: fix use-after-free in add_adev() error path
From: Guangshuo Li @ 2026-03-21 5:39 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Saurabh Sengar, Erni Sri Satya Vennela,
Shradha Gupta, Aditya Garg, Dipayaan Roy, Shiraz Saleem,
Leon Romanovsky, linux-hyperv, netdev, linux-kernel
Cc: Guangshuo Li, stable
If auxiliary_device_add() fails, add_adev() jumps to add_fail and calls
auxiliary_device_uninit(adev).
The auxiliary device has its release callback set to adev_release(),
which frees the containing struct mana_adev. Since adev is embedded in
struct mana_adev, the subsequent fall-through to init_fail and access
to adev->id may result in a use-after-free.
Fix this by saving the allocated auxiliary device id in a local
variable before calling auxiliary_device_add(), and use that saved id
in the cleanup path after auxiliary_device_uninit().
Fixes: a69839d4327d ("net: mana: Add support for auxiliary device")
Cc: stable@vger.kernel.org
Reviewed-by: Long Li <longli@microsoft.com>
Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
---
v2:
- explain the UAF in more detail
- retarget to net
- preserve reverse xmas tree order for local variables
drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 1ad154f9db1a..70d71594c599 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3362,6 +3362,7 @@ static int add_adev(struct gdma_dev *gd, const char *name)
{
struct auxiliary_device *adev;
struct mana_adev *madev;
+ int id;
int ret;
madev = kzalloc(sizeof(*madev), GFP_KERNEL);
@@ -3372,7 +3373,8 @@ static int add_adev(struct gdma_dev *gd, const char *name)
ret = mana_adev_idx_alloc();
if (ret < 0)
goto idx_fail;
- adev->id = ret;
+ id = ret;
+ adev->id = id;
adev->name = name;
adev->dev.parent = gd->gdma_context->dev;
@@ -3398,7 +3400,7 @@ static int add_adev(struct gdma_dev *gd, const char *name)
auxiliary_device_uninit(adev);
init_fail:
- mana_adev_idx_free(adev->id);
+ mana_adev_idx_free(id);
idx_fail:
kfree(madev);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH net-next] net: mana: Use at least SZ_4K in doorbell ID range check
From: Simon Horman @ 2026-03-21 10:04 UTC (permalink / raw)
To: Erni Sri Satya Vennela
Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, shradhagupta, dipayanroy, shirazsaleem,
kotaranov, yury.norov, kees, linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260320122107.1560839-1-ernis@linux.microsoft.com>
On Fri, Mar 20, 2026 at 05:21:01AM -0700, Erni Sri Satya Vennela wrote:
> mana_gd_ring_doorbell() accesses doorbell offsets up to 0xFF8 + 8 = 4KB
> within a doorbell page. When db_page_size is zero, the validation check
> in mana_gd_register_device() reduces to:
> db_page_off + 0 > bar0_size
> which passes, even though mana_gd_ring_doorbell() will access
> [db_page_off, db_page_off + 4KB) and may go beyond BAR0.
>
> Use max(SZ_4K, db_page_size) in the range check so that a zero or
> unexpectedly small db_page_size still results in a rejection when the
> doorbell page would fall outside BAR0.
Thanks Erni,
I understand the maths here. And to that extent this change makes sense to me.
But I am curious to know how a db_page_size of zero works. I was expecting
some space is required there.
>
> Fixes: 89fe91c65992 ("net: mana: hardening: Validate doorbell ID from GDMA_REGISTER_DEVICE response")
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
...
^ permalink raw reply
* [PATCH] hv_sock: update outdated comment for renamed vsock_stream_recvmsg()
From: Kexin Sun @ 2026-03-21 10:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, sgarzare, davem, edumazet,
kuba, pabeni, horms, linux-hyperv, virtualization, netdev,
linux-kernel
Cc: julia.lawall, xutong.ma, kexinsun, yunbolyu, ratnadiraw
The function vsock_stream_recvmsg() was renamed to
vsock_connectible_recvmsg() by commit a9e29e5511b9 ("af_vsock:
update functions for connectible socket"). Update the comment
accordingly.
Assisted-by: unnamed:deepseek-v3.2 coccinelle
Signed-off-by: Kexin Sun <kexinsun@smail.nju.edu.cn>
---
net/vmw_vsock/hyperv_transport.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 069386a74557..2b7c0b5896ed 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -196,7 +196,7 @@ static int hvs_channel_readable_payload(struct vmbus_channel *chan)
if (readable > HVS_PKT_LEN(0)) {
/* At least we have 1 byte to read. We don't need to return
- * the exact readable bytes: see vsock_stream_recvmsg() ->
+ * the exact readable bytes: see vsock_connectible_recvmsg() ->
* vsock_stream_has_data().
*/
return 1;
--
2.25.1
^ permalink raw reply related
* Re: [PATCH rdma] RDMA/mana_ib: Disable RX steering on RSS QP destroy
From: Leon Romanovsky @ 2026-03-22 18:48 UTC (permalink / raw)
To: Long Li
Cc: Konstantin Taranov, Jakub Kicinski, David S . Miller, Paolo Abeni,
Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Haiyang Zhang,
K . Y . Srinivasan, Wei Liu, Dexuan Cui, Simon Horman, netdev,
linux-rdma, linux-hyperv, linux-kernel, stable
In-Reply-To: <20260321002842.1607179-1-longli@microsoft.com>
On Fri, Mar 20, 2026 at 05:28:42PM -0700, Long Li wrote:
> When an RSS QP is destroyed (e.g. DPDK exit), mana_ib_destroy_qp_rss()
> destroys the RX WQ objects but does not disable vPort RX steering in
> firmware. This leaves stale steering configuration that still points to
> the destroyed RX objects.
>
> If traffic continues to arrive (e.g. peer VM is still transmitting) and
> the VF interface is subsequently brought up (mana_open), the firmware
> may deliver completions using stale CQ IDs from the old RX objects.
> These CQ IDs can be reused by the ethernet driver for new TX CQs,
> causing RX completions to land on TX CQs:
>
> WARNING: mana_poll_tx_cq+0x1b8/0x220 [mana] (is_sq == false)
> WARNING: mana_gd_process_eq_events+0x209/0x290 (cq_table lookup fails)
>
> Fix this by disabling vPort RX steering before destroying RX WQ objects.
> Note that mana_fence_rqs() cannot be used here because the fence
> completion is delivered on the CQ, which is polled by user-mode (e.g.
> DPDK) and not visible to the kernel driver.
>
> Refactor the disable logic into a shared mana_disable_vport_rx() in
> mana_en, exported for use by mana_ib, replacing the duplicate code.
> The ethernet driver's mana_dealloc_queues() is also updated to call
> this common function.
>
> Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter")
> Cc: stable@vger.kernel.org
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> drivers/infiniband/hw/mana/qp.c | 17 ++++++++++++++++-
> drivers/net/ethernet/microsoft/mana/mana_en.c | 11 ++++++++++-
> include/net/mana/mana.h | 1 +
> 3 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> index 80cf4ade4b75..b27084c53a14 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -829,11 +829,26 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
> struct net_device *ndev;
> struct mana_ib_wq *wq;
> struct ib_wq *ibwq;
> - int i;
> + int i, err;
>
> ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port);
> mpc = netdev_priv(ndev);
>
> + /* Disable vPort RX steering before destroying RX WQ objects.
> + * Otherwise firmware still routes traffic to the destroyed queues,
> + * which can cause bogus completions on reused CQ IDs when the
> + * ethernet driver later creates new queues on mana_open().
> + *
> + * Unlike the ethernet teardown path, mana_fence_rqs() cannot be
> + * used here because the fence completion CQE is delivered on the
> + * CQ which is polled by userspace (e.g. DPDK), so there is no way
> + * for the kernel to wait for fence completion.
> + */
> + err = mana_disable_vport_rx(mpc);
> + if (err)
> + ibdev_err(&mdev->ib_dev,
> + "Failed to disable vPort RX: %d\n", err);
mana_cfg_vport_steering() already prints an error in all failure scenarios.
Thanks
> +
> for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
> ibwq = ind_tbl->ind_tbl[i];
> wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 22444c7530a5..51719ef1c09b 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2934,6 +2934,13 @@ static void mana_rss_table_init(struct mana_port_context *apc)
> ethtool_rxfh_indir_default(i, apc->num_queues);
> }
>
> +int mana_disable_vport_rx(struct mana_port_context *apc)
> +{
> + return mana_cfg_vport_steering(apc, TRI_STATE_FALSE, false, false,
> + false);
> +}
> +EXPORT_SYMBOL_NS(mana_disable_vport_rx, "NET_MANA");
> +
> int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
> bool update_hash, bool update_tab)
> {
> @@ -3339,10 +3346,12 @@ static int mana_dealloc_queues(struct net_device *ndev)
> */
>
> apc->rss_state = TRI_STATE_FALSE;
> - err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
> + err = mana_disable_vport_rx(apc);
> if (err && mana_en_need_log(apc, err))
> netdev_err(ndev, "Failed to disable vPort: %d\n", err);
>
> + mana_fence_rqs(apc);
> +
> /* Even in err case, still need to cleanup the vPort */
> mana_destroy_rxqs(apc);
> mana_destroy_txq(apc);
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> index 204c2b612a62..2634e9135eed 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -574,6 +574,7 @@ struct mana_port_context {
> netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
> int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
> bool update_hash, bool update_tab);
> +int mana_disable_vport_rx(struct mana_port_context *apc);
>
> int mana_alloc_queues(struct net_device *ndev);
> int mana_attach(struct net_device *ndev);
> --
> 2.43.0
>
^ permalink raw reply
* Re: [EXTERNAL] Re: [PATCH rdma-next v2] RDMA/mana_ib: hardening: Clamp adapter capability values from MANA_IB_GET_ADAPTER_CAP
From: Leon Romanovsky @ 2026-03-22 18:50 UTC (permalink / raw)
To: Long Li
Cc: Erni Sri Satya Vennela, Konstantin Taranov, Jason Gunthorpe,
linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org
In-Reply-To: <SA1PR21MB66833EBAF447BA0B102862FCCE4DA@SA1PR21MB6683.namprd21.prod.outlook.com>
On Sat, Mar 21, 2026 at 12:56:39AM +0000, Long Li wrote:
> -next v2] RDMA/mana_ib: hardening:
> > Clamp adapter capability values from MANA_IB_GET_ADAPTER_CAP
> >
> > On Mon, Mar 16, 2026 at 08:50:39PM +0000, Long Li wrote:
> > > > On Thu, Mar 12, 2026 at 11:16:41AM -0700, Erni Sri Satya Vennela wrote:
> > > > > As part of MANA hardening for CVM, clamp hardware-reported adapter
> > > > > capability values from the MANA_IB_GET_ADAPTER_CAP response before
> > > > > they are used by the IB subsystem.
> > > > >
> > > > > The response fields (max_qp_count, max_cq_count, max_mr_count,
> > > > > max_pd_count, max_inbound_read_limit, max_outbound_read_limit,
> > > > > max_qp_wr, max_send_sge_count, max_recv_sge_count) are u32 but are
> > > > > assigned to signed int members in struct ib_device_attr. If
> > > > > hardware returns a value exceeding INT_MAX, the implicit
> > > > > u32-to-int conversion produces a negative value, which can cause
> > > > > incorrect behavior in the IB core and userspace applications.
> > > >
> > > > This sentence does not make sense in the context of the Linux kernel.
> > > > The fundamental assumption is that the underlying hardware behaves
> > > > correctly, and driver code should not attempt to guard against
> > > > purely hypothetical failures. The kernel only implements such
> > > > self‑protection when there is a documented hardware issue accompanied by
> > official errata.
> > > >
> > > > Thanks
> > >
> > > The idea is that a malicious hardware can't corrupt and steal other data from
> > the kernel.
> > >
> > > The assumption is that in a public cloud environment, you can't trust the
> > hardware 100%.
> >
> > You cannot separate functionality and claim that one line of code is trusted while
> > another is not.
> >
> > Thanks
>
> How about we rephrase it this way: the driver should not corrupt or overflow other parts of the kernel if its device is misbehaving (or has a bug).
It shouldn't be a theoretical claim; do you have errata?
Thanks
>
> Long
^ permalink raw reply
* RE: [PATCH v4 18/21] drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare
From: Michael Kelley @ 2026-03-23 4:16 UTC (permalink / raw)
To: Long Li, Lorenzo Stoakes (Oracle), Andrew Morton
Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Alexander Shishkin, Maxime Coquelin, Alexandre Torgue,
Miquel Raynal, Richard Weinberger, Vignesh Raghavendra,
Bodo Stroesser, Martin K . Petersen, David Howells, Marc Dionne,
Alexander Viro, Christian Brauner, Jan Kara, David Hildenbrand,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org,
linux-hyperv@vger.kernel.org,
linux-stm32@st-md-mailman.stormreply.com,
linux-arm-kernel@lists.infradead.org,
linux-mtd@lists.infradead.org, linux-staging@lists.linux.dev,
linux-scsi@vger.kernel.org, target-devel@vger.kernel.org,
linux-afs@lists.infradead.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, Ryan Roberts
In-Reply-To: <05467cb62267d750e5c770147517d4df0246cda6.1774045440.git.ljs@kernel.org>
From: Lorenzo Stoakes (Oracle) <ljs@kernel.org> Sent: Friday, March 20, 2026 3:40 PM
>
> The f_op->mmap interface is deprecated, so update the vmbus driver to use
> its successor, mmap_prepare.
>
> This updates all callbacks which referenced the function pointer
> hv_mmap_ring_buffer to instead reference hv_mmap_prepare_ring_buffer,
> utilising the newly introduced compat_set_desc_from_vma() and
> __compat_vma_mmap() to be able to implement this change.
>
> The UIO HV generic driver is the only user of hv_create_ring_sysfs(),
> which is the only function which references
> vmbus_channel->mmap_prepare_ring_buffer which, in turn, is the only
> external interface to hv_mmap_prepare_ring_buffer.
>
> This patch therefore updates this caller to use mmap_prepare instead,
> which also previously used vm_iomap_memory(), so this change replaces it
> with its mmap_prepare equivalent, mmap_action_simple_ioremap().
>
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ---
> drivers/hv/hyperv_vmbus.h | 4 ++--
> drivers/hv/vmbus_drv.c | 31 +++++++++++++++++++------------
> drivers/uio/uio_hv_generic.c | 11 ++++++-----
> include/linux/hyperv.h | 4 ++--
> 4 files changed, 29 insertions(+), 21 deletions(-)
>
There are two mmap() code paths in the Hyper-V UIO code. One path is
to mmap() the file descriptor for /dev/uio<n>, and the other is to mmap()
the "ring" entry under /sys/devices/vmbus/devices/<uuid>. The former is
done by uio_mmap(), and the latter by hv_uio_ring_mmap_prepare().
I tested both these paths using a combination of two methods in a
x86/x64 VM on Hyper-V:
1) Using the fcopy daemon, which maps the ring buffer for the primary
channel and sends/receives messages with the Hyper-V host. This
method tests only the 1st path because the fcopy daemon doesn't create
any subchannels that would use the "ring" entry.
2) Using a custom-built test program. This program doesn't communicate
with the Hyper-V host, but allows mostly verifying both code paths for the
primary channel. As a sanity check, it verifies that the two mmaps are
mapping the same memory, as expected.
As such,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
The most robust test would be to run DPDK networking against
UIO, as it would communicate with the Hyper-V host and use
multiple subchannels, resulting in mmap'ing the "ring"
entry under /sys.
@Long Li -- I'll leave it to your discretion as to whether you want
to test DPDK against these mmap() changes.
I've noted one minor issue below.
[snip]
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1015,8 +1015,8 @@ struct vmbus_channel {
/* The max size of a packet on this channel */
u32 max_pkt_size;
- /* function to mmap ring buffer memory to the channel's sysfs ring attribute */
- int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma);
+ /* function to mmap_prepare ring buffer memory to the channel's sysfs ring attribute */
Changing the comment from "mmap ring buffer" to "mmap_prepare ring buffer"
produces awkward wording since "mmap" is used here as a verb. It might be better
to just leave the comment unchanged.
Michael
+ int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc);
/* boolean to control visibility of sysfs for ring buffer */
bool ring_sysfs_visible;
^ permalink raw reply
* [PATCH net] net: mana: Fix RX skb truesize accounting
From: Dipayaan Roy @ 2026-03-23 8:21 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, dipayanroy
MANA passes rxq->alloc_size to napi_build_skb() for all RX buffers.
It is correct for fragment-backed RX buffers, where alloc_size matches
the actual backing allocation used for each packet buffer. However, in
the non-fragment RX path mana allocates a full page, or a higher-order
page, per RX buffer. In that case alloc_size only reflects the usable
packet area and not the actual backing memory.
This causes napi_build_skb() to underestimate the skb backing allocation
in the single-buffer RX path, so skb->truesize is derived from a value
smaller than the real RX buffer allocation.
Fix this by updating alloc_size in the non-fragment RX path to the
actual backing allocation size before it is passed to napi_build_skb().
Fixes: 730ff06d3f5c ("net: mana: Use page pool fragments for RX buffers instead of full pages to improve memory efficiency.")
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ea71de39f996..884f8e548174 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -766,6 +766,13 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
}
*frag_count = 1;
+
+ /* In the single-buffer path, napi_build_skb() must see the
+ * actual backing allocation size so skb->truesize reflects
+ * the full page (or higher-order page), not just the usable
+ * packet area.
+ */
+ *alloc_size = PAGE_SIZE << get_order(*alloc_size);
return;
}
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v4 18/21] drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare
From: Lorenzo Stoakes (Oracle) @ 2026-03-23 9:13 UTC (permalink / raw)
To: Michael Kelley
Cc: Long Li, Andrew Morton, Jonathan Corbet, Clemens Ladisch,
Arnd Bergmann, Greg Kroah-Hartman, K . Y . Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Alexander Shishkin,
Maxime Coquelin, Alexandre Torgue, Miquel Raynal,
Richard Weinberger, Vignesh Raghavendra, Bodo Stroesser,
Martin K . Petersen, David Howells, Marc Dionne, Alexander Viro,
Christian Brauner, Jan Kara, David Hildenbrand, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
Jann Horn, Pedro Falcato, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, linux-hyperv@vger.kernel.org,
linux-stm32@st-md-mailman.stormreply.com,
linux-arm-kernel@lists.infradead.org,
linux-mtd@lists.infradead.org, linux-staging@lists.linux.dev,
linux-scsi@vger.kernel.org, target-devel@vger.kernel.org,
linux-afs@lists.infradead.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, Ryan Roberts
In-Reply-To: <SN6PR02MB41573DF211DA2469D7FFE892D44BA@SN6PR02MB4157.namprd02.prod.outlook.com>
On Mon, Mar 23, 2026 at 04:16:20AM +0000, Michael Kelley wrote:
> From: Lorenzo Stoakes (Oracle) <ljs@kernel.org> Sent: Friday, March 20, 2026 3:40 PM
> >
> > The f_op->mmap interface is deprecated, so update the vmbus driver to use
> > its successor, mmap_prepare.
> >
> > This updates all callbacks which referenced the function pointer
> > hv_mmap_ring_buffer to instead reference hv_mmap_prepare_ring_buffer,
> > utilising the newly introduced compat_set_desc_from_vma() and
> > __compat_vma_mmap() to be able to implement this change.
> >
> > The UIO HV generic driver is the only user of hv_create_ring_sysfs(),
> > which is the only function which references
> > vmbus_channel->mmap_prepare_ring_buffer which, in turn, is the only
> > external interface to hv_mmap_prepare_ring_buffer.
> >
> > This patch therefore updates this caller to use mmap_prepare instead,
> > which also previously used vm_iomap_memory(), so this change replaces it
> > with its mmap_prepare equivalent, mmap_action_simple_ioremap().
> >
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > ---
> > drivers/hv/hyperv_vmbus.h | 4 ++--
> > drivers/hv/vmbus_drv.c | 31 +++++++++++++++++++------------
> > drivers/uio/uio_hv_generic.c | 11 ++++++-----
> > include/linux/hyperv.h | 4 ++--
> > 4 files changed, 29 insertions(+), 21 deletions(-)
> >
>
> There are two mmap() code paths in the Hyper-V UIO code. One path is
> to mmap() the file descriptor for /dev/uio<n>, and the other is to mmap()
> the "ring" entry under /sys/devices/vmbus/devices/<uuid>. The former is
> done by uio_mmap(), and the latter by hv_uio_ring_mmap_prepare().
>
> I tested both these paths using a combination of two methods in a
> x86/x64 VM on Hyper-V:
>
> 1) Using the fcopy daemon, which maps the ring buffer for the primary
> channel and sends/receives messages with the Hyper-V host. This
> method tests only the 1st path because the fcopy daemon doesn't create
> any subchannels that would use the "ring" entry.
>
> 2) Using a custom-built test program. This program doesn't communicate
> with the Hyper-V host, but allows mostly verifying both code paths for the
> primary channel. As a sanity check, it verifies that the two mmaps are
> mapping the same memory, as expected.
>
> As such,
>
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
Perfect, thanks so much for this!
It is tricky for me to test these, beyond fairly exhaustive logical
confirmation of equivalence, so this is _hugely_ helpful.
>
> The most robust test would be to run DPDK networking against
> UIO, as it would communicate with the Hyper-V host and use
> multiple subchannels, resulting in mmap'ing the "ring"
> entry under /sys.
>
> @Long Li -- I'll leave it to your discretion as to whether you want
> to test DPDK against these mmap() changes.
Thanks in advance for taking a look on this also!
>
> I've noted one minor issue below.
>
> [snip]
>
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1015,8 +1015,8 @@ struct vmbus_channel {
> /* The max size of a packet on this channel */
> u32 max_pkt_size;
>
> - /* function to mmap ring buffer memory to the channel's sysfs ring attribute */
> - int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma);
> + /* function to mmap_prepare ring buffer memory to the channel's sysfs ring attribute */
>
> Changing the comment from "mmap ring buffer" to "mmap_prepare ring buffer"
> produces awkward wording since "mmap" is used here as a verb. It might be better
> to just leave the comment unchanged.
Sure am happy with that of course, I think Sashiko moaned about this but
it's obviously fine either way.
Andrew - do you mind restoring the comment to its original form above? Thanks!
>
> Michael
>
>
> + int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc);
>
> /* boolean to control visibility of sysfs for ring buffer */
> bool ring_sysfs_visible;
Cheers, Lorenzo
^ permalink raw reply
* Re: [PATCH net-next v4 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Simon Horman @ 2026-03-23 13:43 UTC (permalink / raw)
To: Long Li
Cc: Konstantin Taranov, Jakub Kicinski, David S . Miller, Paolo Abeni,
Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Leon Romanovsky,
Haiyang Zhang, K . Y . Srinivasan, Wei Liu, Dexuan Cui, netdev,
linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <cover.1774049761.git.longli@microsoft.com>
On Fri, Mar 20, 2026 at 04:54:13PM -0700, Long Li wrote:
> This series adds per-vPort Event Queue (EQ) allocation and MSI-X interrupt
> management for the MANA driver. Previously, all vPorts shared a single set
> of EQs. This change enables dedicated EQs per vPort with support for both
> dedicated and shared MSI-X vector allocation modes.
...
Hi Long Li,
Unfortunately this series did not apply to net-next cleanly.
Which breaks our CI.
Please rebase and repost.
Thanks!
--
pw-bot: changes-requested
^ permalink raw reply
* Re: [PATCH net-next] net: mana: Set default number of queues to 16
From: Simon Horman @ 2026-03-23 13:58 UTC (permalink / raw)
To: Long Li
Cc: Konstantin Taranov, Jakub Kicinski, David S . Miller, Paolo Abeni,
Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Leon Romanovsky,
Haiyang Zhang, K . Y . Srinivasan, Wei Liu, Dexuan Cui, netdev,
linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260320233027.1603495-1-longli@microsoft.com>
On Fri, Mar 20, 2026 at 04:30:27PM -0700, Long Li wrote:
> Set the default number of queues per vPort to MANA_DEF_NUM_QUEUES (16),
> as 16 queues can achieve optimal throughput for typical workloads. Users
> can increase the number of queues up to max_queues via ethtool if needed.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +-
> include/net/mana/mana.h | 1 +
> 2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 49c65cc1697c..7cae8a7b9f31 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -3357,7 +3357,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
> apc->ac = ac;
> apc->ndev = ndev;
> apc->max_queues = gc->max_num_queues;
> - apc->num_queues = gc->max_num_queues;
> + apc->num_queues = min(gc->max_num_queues, MANA_DEF_NUM_QUEUES);
Hi Long Li,
Maybe I am misunderstanding things. But it seems to me that this patch
sets a ceiling on the default number of queues. Which is subtly different
to setting the default. Even if not in practice if max_num_queues is never
less than MANA_DEF_NUM_QUEUES.
If so I'm wondering if you could tweak the commit message accordingly.
> apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE;
> apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE;
> apc->port_handle = INVALID_MANA_HANDLE;
...
^ permalink raw reply
* [PATCH 6.19 200/220] x86/hyperv: Use __naked attribute to fix stackless C function
From: Greg Kroah-Hartman @ 2026-03-23 13:46 UTC (permalink / raw)
To: stable
Cc: Greg Kroah-Hartman, patches, Mukesh Rathor, Uros Bizjak, Wei Liu,
linux-hyperv, Ard Biesheuvel, Sasha Levin, Andrew Cooper
In-Reply-To: <20260323134504.575022936@linuxfoundation.org>
6.19-stable review patch. If anyone has any objections, please let me know.
------------------
From: Ard Biesheuvel <ardb@kernel.org>
[ Upstream commit 3fde5281b805370a6c3bd2ef462ebff70a0ea2c6 ]
hv_crash_c_entry() is a C function that is entered without a stack,
and this is only allowed for functions that have the __naked attribute,
which informs the compiler that it must not emit the usual prologue and
epilogue or emit any other kind of instrumentation that relies on a
stack frame.
So split up the function, and set the __naked attribute on the initial
part that sets up the stack, GDT, IDT and other pieces that are needed
for ordinary C execution. Given that function calls are not permitted
either, use the existing long return coded in an asm() block to call the
second part of the function, which is an ordinary function that is
permitted to call other functions as usual.
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> # asm parts, not hv parts
Reviewed-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Acked-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: linux-hyperv@vger.kernel.org
Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
arch/x86/hyperv/hv_crash.c | 82 ++++++++++++++++++++------------------
1 file changed, 43 insertions(+), 39 deletions(-)
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
index a78e4fed57203..1d91051daa3de 100644
--- a/arch/x86/hyperv/hv_crash.c
+++ b/arch/x86/hyperv/hv_crash.c
@@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
cpu_relax();
}
-/* This cannot be inlined as it needs stack */
-static noinline __noclone void hv_crash_restore_tss(void)
+static void hv_crash_restore_tss(void)
{
load_TR_desc();
}
-/* This cannot be inlined as it needs stack */
-static noinline void hv_crash_clear_kernpt(void)
+static void hv_crash_clear_kernpt(void)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
native_p4d_clear(p4d);
}
+
+static void __noreturn hv_crash_handle(void)
+{
+ hv_crash_restore_tss();
+ hv_crash_clear_kernpt();
+
+ /* we are now fully in devirtualized normal kernel mode */
+ __crash_kexec(NULL);
+
+ hv_panic_timeout_reboot();
+}
+
+/*
+ * __naked functions do not permit function calls, not even to __always_inline
+ * functions that only contain asm() blocks themselves. So use a macro instead.
+ */
+#define hv_wrmsr(msr, val) \
+ asm volatile("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
+
/*
* This is the C entry point from the asm glue code after the disable hypercall.
* We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
@@ -133,51 +150,38 @@ static noinline void hv_crash_clear_kernpt(void)
* available. We restore kernel GDT, and rest of the context, and continue
* to kexec.
*/
-static asmlinkage void __noreturn hv_crash_c_entry(void)
+static void __naked hv_crash_c_entry(void)
{
- struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
-
/* first thing, restore kernel gdt */
- native_load_gdt(&ctxt->gdtr);
+ asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
- asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
- asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+ asm volatile("movw %0, %%ss\n\t"
+ "movq %1, %%rsp"
+ :: "m"(hv_crash_ctxt.ss), "m"(hv_crash_ctxt.rsp));
- asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
- asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
- asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
- asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+ asm volatile("movw %0, %%ds" : : "m"(hv_crash_ctxt.ds));
+ asm volatile("movw %0, %%es" : : "m"(hv_crash_ctxt.es));
+ asm volatile("movw %0, %%fs" : : "m"(hv_crash_ctxt.fs));
+ asm volatile("movw %0, %%gs" : : "m"(hv_crash_ctxt.gs));
- native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
- asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+ hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
+ asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
- asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
- asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
- asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
+ asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
+ asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
+ asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr2));
- native_load_idt(&ctxt->idtr);
- native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
- native_wrmsrq(MSR_EFER, ctxt->efer);
+ asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
+ hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
+ hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
/* restore the original kernel CS now via far return */
- asm volatile("movzwq %0, %%rax\n\t"
- "pushq %%rax\n\t"
- "pushq $1f\n\t"
- "lretq\n\t"
- "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
-
- /* We are in asmlinkage without stack frame, hence make C function
- * calls which will buy stack frames.
- */
- hv_crash_restore_tss();
- hv_crash_clear_kernpt();
-
- /* we are now fully in devirtualized normal kernel mode */
- __crash_kexec(NULL);
-
- hv_panic_timeout_reboot();
+ asm volatile("pushq %q0\n\t"
+ "pushq %q1\n\t"
+ "lretq"
+ :: "r"(hv_crash_ctxt.cs), "r"(hv_crash_handle));
}
-/* Tell gcc we are using lretq long jump in the above function intentionally */
+/* Tell objtool we are using lretq long jump in the above function intentionally */
STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
static void hv_mark_tss_not_busy(void)
--
2.51.0
^ permalink raw reply related
* Re: [PATCH net v2] net: mana: fix use-after-free in add_adev() error path
From: Simon Horman @ 2026-03-23 14:26 UTC (permalink / raw)
To: Guangshuo Li
Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Saurabh Sengar, Erni Sri Satya Vennela,
Shradha Gupta, Aditya Garg, Dipayaan Roy, Shiraz Saleem,
Leon Romanovsky, linux-hyperv, netdev, linux-kernel, stable
In-Reply-To: <20260321053918.791068-1-lgs201920130244@gmail.com>
On Sat, Mar 21, 2026 at 01:39:18PM +0800, Guangshuo Li wrote:
> If auxiliary_device_add() fails, add_adev() jumps to add_fail and calls
> auxiliary_device_uninit(adev).
>
> The auxiliary device has its release callback set to adev_release(),
> which frees the containing struct mana_adev. Since adev is embedded in
> struct mana_adev, the subsequent fall-through to init_fail and access
> to adev->id may result in a use-after-free.
>
> Fix this by saving the allocated auxiliary device id in a local
> variable before calling auxiliary_device_add(), and use that saved id
> in the cleanup path after auxiliary_device_uninit().
>
> Fixes: a69839d4327d ("net: mana: Add support for auxiliary device")
> Cc: stable@vger.kernel.org
> Reviewed-by: Long Li <longli@microsoft.com>
> Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
> ---
> v2:
> - explain the UAF in more detail
> - retarget to net
> - preserve reverse xmas tree order for local variables
Thanks for the update.
Unfortunately the patch doesn't apply cleanly against net,
which breaks our CI.
Please rebase and repost.
--
pw-bot: changes-requested
^ permalink raw reply
* Re: [PATCH] hv_sock: update outdated comment for renamed vsock_stream_recvmsg()
From: Simon Horman @ 2026-03-23 15:26 UTC (permalink / raw)
To: Kexin Sun
Cc: kys, haiyangz, wei.liu, decui, longli, sgarzare, davem, edumazet,
kuba, pabeni, linux-hyperv, virtualization, netdev, linux-kernel,
julia.lawall, xutong.ma, yunbolyu, ratnadiraw
In-Reply-To: <20260321105753.6751-1-kexinsun@smail.nju.edu.cn>
On Sat, Mar 21, 2026 at 06:57:53PM +0800, Kexin Sun wrote:
> The function vsock_stream_recvmsg() was renamed to
> vsock_connectible_recvmsg() by commit a9e29e5511b9 ("af_vsock:
> update functions for connectible socket"). Update the comment
> accordingly.
>
> Assisted-by: unnamed:deepseek-v3.2 coccinelle
> Signed-off-by: Kexin Sun <kexinsun@smail.nju.edu.cn>
Reviewed-by: Simon Horman <horms@kernel.org>
^ permalink raw reply
* Re: [PATCH] mshv: Fix error handling in mshv_region_populate_pages
From: Stanislav Kinsburskii @ 2026-03-23 16:09 UTC (permalink / raw)
To: Wei Liu
Cc: Michael Kelley, kys@microsoft.com, haiyangz@microsoft.com,
decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260318162003.GB262287@liuwe-devbox-debian-v2.local>
On Wed, Mar 18, 2026 at 04:20:03PM +0000, Wei Liu wrote:
> On Wed, Mar 18, 2026 at 02:38:49PM +0000, Michael Kelley wrote:
> > From: Wei Liu <wei.liu@kernel.org> Sent: Tuesday, March 17, 2026 11:20 PM
> > >
> > > On Tue, Mar 17, 2026 at 09:56:07PM +0000, Michael Kelley wrote:
> > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, March 17, 2026 8:05 AM
> > > > >
> > > > > The current error handling has two issues:
> > > > >
> > > > > First, pin_user_pages_fast() can return a short pin count (less than
> > > > > requested but greater than zero) when it cannot pin all requested pages.
> > > > > This is treated as success, leading to partially pinned regions being
> > > > > used, which causes memory corruption.
> > > > >
> > > > > Second, when an error occurs mid-loop, already pinned pages from the
> > > > > current batch are not released before calling mshv_region_evict_pages(),
> > > > > causing a page reference leak.
> > > >
> > > > There's now an online LLM-based tool that is automatically reviewing
> > > > kernel patches. For this patch, the results are here:
> > > >
> > > >
> > > https://sashiko.dev/#/patchset/177375989324.25621.6532741522672582851.stgit
> > > %40skinsburskii-cloud-desktop.internal.cloudapp.net
> > > >
> > > > It has flagged the commit message as incorrectly referencing the
> > > > function mshv_region_evict_pages(), which doesn't exist.
> > > >
> > > > FWIW, the announcement about sashiko.dev is here:
> > > >
> > > > https://lore.kernel.org/lkml/7ia4o6kmpj5s.fsf@castle.c.googlers.com/
> > > >
> > > > Other than the commit message reference, this looks good to me.
> > > >
> > > > Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> > >
> > > The second point is written as if the code here should release the
> > > already pinned pages before calling mshv_region_invalidate_pages(), but
> > > the code actually relies on mshv_mem_region_invalidate_pages() to
> > > release the pages. The change here fixes the accounting.
> > >
> > > Second, when an error occurs mid-loop, already pinned pages from the
> > > current batch are not accounted for before calling
> > > mshv_region_invalidate_pages(), causing a page reference leak.
> > >
> > > And queued up the patch to hyperv-fixes.
> >
> > One other thing I noticed: The "Subject" of the patch is wrong. It
> > mentions mshv_region_populate_pages(), but the function being
> > modified is actually mshv_region_pin().
>
> Good catch. I have updated the subject line and pushed to hyperv-fixes.
>
Thank you Michael and Wei.
Thanks,
Stanislav
> Wei
>
> >
> > Michael
> >
> > >
> > > Wei
> > >
> > > >
> > > > >
> > > > > Fix by treating short pins as errors and explicitly unpinning the
> > > > > partial batch before cleanup.
> > > > >
> > > > > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > > > > ---
> > > > > drivers/hv/mshv_regions.c | 6 ++++--
> > > > > 1 file changed, 4 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
> > > > > index c28aac0726de..fdffd4f002f6 100644
> > > > > --- a/drivers/hv/mshv_regions.c
> > > > > +++ b/drivers/hv/mshv_regions.c
> > > > > @@ -314,15 +314,17 @@ int mshv_region_pin(struct mshv_mem_region *region)
> > > > > ret = pin_user_pages_fast(userspace_addr, nr_pages,
> > > > > FOLL_WRITE | FOLL_LONGTERM,
> > > > > pages);
> > > > > - if (ret < 0)
> > > > > + if (ret != nr_pages)
> > > > > goto release_pages;
> > > > > }
> > > > >
> > > > > return 0;
> > > > >
> > > > > release_pages:
> > > > > + if (ret > 0)
> > > > > + done_count += ret;
> > > > > mshv_region_invalidate_pages(region, 0, done_count);
> > > > > - return ret;
> > > > > + return ret < 0 ? ret : -ENOMEM;
> > > > > }
> > > > >
> > > > > static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
> > > > >
> > > > >
> > > >
> >
^ permalink raw reply
* [PATCH net v3] net: mana: fix use-after-free in add_adev() error path
From: Guangshuo Li @ 2026-03-23 16:57 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Erni Sri Satya Vennela, Dipayaan Roy, Aditya Garg,
Shiraz Saleem, Kees Cook, Leon Romanovsky, linux-hyperv, netdev,
linux-kernel
Cc: Guangshuo Li, stable
If auxiliary_device_add() fails, add_adev() jumps to add_fail and calls
auxiliary_device_uninit(adev).
The auxiliary device has its release callback set to adev_release(),
which frees the containing struct mana_adev. Since adev is embedded in
struct mana_adev, the subsequent fall-through to init_fail and access
to adev->id may result in a use-after-free.
Fix this by saving the allocated auxiliary device id in a local
variable before calling auxiliary_device_add(), and use that saved id
in the cleanup path after auxiliary_device_uninit().
Fixes: a69839d4327d ("net: mana: Add support for auxiliary device")
Cc: stable@vger.kernel.org
Reviewed-by: Long Li <longli@microsoft.com>
Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
---
v2:
- explain the UAF in more detail
- retarget to net
- preserve reverse xmas tree order for local variables
v3:
- rebase onto the current net tree
drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9017e806ecda..d03f42245ab8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3424,6 +3424,7 @@ static int add_adev(struct gdma_dev *gd, const char *name)
{
struct auxiliary_device *adev;
struct mana_adev *madev;
+ int id;
int ret;
madev = kzalloc_obj(*madev);
@@ -3434,7 +3435,8 @@ static int add_adev(struct gdma_dev *gd, const char *name)
ret = mana_adev_idx_alloc();
if (ret < 0)
goto idx_fail;
- adev->id = ret;
+ id = ret;
+ adev->id = id;
adev->name = name;
adev->dev.parent = gd->gdma_context->dev;
@@ -3460,7 +3462,7 @@ static int add_adev(struct gdma_dev *gd, const char *name)
auxiliary_device_uninit(adev);
init_fail:
- mana_adev_idx_free(adev->id);
+ mana_adev_idx_free(id);
idx_fail:
kfree(madev);
--
2.43.0
^ permalink raw reply related
* RE: [EXTERNAL] Re: [PATCH net-next] net: mana: Set default number of queues to 16
From: Long Li @ 2026-03-23 17:54 UTC (permalink / raw)
To: Simon Horman
Cc: Konstantin Taranov, Jakub Kicinski, David S . Miller, Paolo Abeni,
Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Leon Romanovsky,
Haiyang Zhang, KY Srinivasan, Wei Liu, Dexuan Cui,
netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260323135848.GA81558@horms.kernel.org>
>
> On Fri, Mar 20, 2026 at 04:30:27PM -0700, Long Li wrote:
> > Set the default number of queues per vPort to MANA_DEF_NUM_QUEUES
> > (16), as 16 queues can achieve optimal throughput for typical
> > workloads. Users can increase the number of queues up to max_queues via
> ethtool if needed.
> >
> > Signed-off-by: Long Li <longli@microsoft.com>
> > ---
> > drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +-
> > include/net/mana/mana.h | 1 +
> > 2 files changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 49c65cc1697c..7cae8a7b9f31 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -3357,7 +3357,7 @@ static int mana_probe_port(struct mana_context
> *ac, int port_idx,
> > apc->ac = ac;
> > apc->ndev = ndev;
> > apc->max_queues = gc->max_num_queues;
> > - apc->num_queues = gc->max_num_queues;
> > + apc->num_queues = min(gc->max_num_queues,
> MANA_DEF_NUM_QUEUES);
>
> Hi Long Li,
>
> Maybe I am misunderstanding things. But it seems to me that this patch sets a
> ceiling on the default number of queues. Which is subtly different to setting the
> default. Even if not in practice if max_num_queues is never less than
> MANA_DEF_NUM_QUEUES.
>
> If so I'm wondering if you could tweak the commit message accordingly.
Yes, will tweak the commit message and resend patch.
Thanks,
Long
>
> > apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE;
> > apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE;
> > apc->port_handle = INVALID_MANA_HANDLE;
>
> ...
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH rdma] RDMA/mana_ib: Disable RX steering on RSS QP destroy
From: Long Li @ 2026-03-23 18:03 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Konstantin Taranov, Jakub Kicinski, David S . Miller, Paolo Abeni,
Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Haiyang Zhang,
KY Srinivasan, Wei Liu, Dexuan Cui, Simon Horman,
netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
stable@vger.kernel.org
In-Reply-To: <20260322184848.GC814676@unreal>
> On Fri, Mar 20, 2026 at 05:28:42PM -0700, Long Li wrote:
> > When an RSS QP is destroyed (e.g. DPDK exit), mana_ib_destroy_qp_rss()
> > destroys the RX WQ objects but does not disable vPort RX steering in
> > firmware. This leaves stale steering configuration that still points
> > to the destroyed RX objects.
> >
> > If traffic continues to arrive (e.g. peer VM is still transmitting)
> > and the VF interface is subsequently brought up (mana_open), the
> > firmware may deliver completions using stale CQ IDs from the old RX objects.
> > These CQ IDs can be reused by the ethernet driver for new TX CQs,
> > causing RX completions to land on TX CQs:
> >
> > WARNING: mana_poll_tx_cq+0x1b8/0x220 [mana] (is_sq == false)
> > WARNING: mana_gd_process_eq_events+0x209/0x290 (cq_table lookup
> > fails)
> >
> > Fix this by disabling vPort RX steering before destroying RX WQ objects.
> > Note that mana_fence_rqs() cannot be used here because the fence
> > completion is delivered on the CQ, which is polled by user-mode (e.g.
> > DPDK) and not visible to the kernel driver.
> >
> > Refactor the disable logic into a shared mana_disable_vport_rx() in
> > mana_en, exported for use by mana_ib, replacing the duplicate code.
> > The ethernet driver's mana_dealloc_queues() is also updated to call
> > this common function.
> >
> > Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure
> > Network Adapter")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Long Li <longli@microsoft.com>
> > ---
> > drivers/infiniband/hw/mana/qp.c | 17 ++++++++++++++++-
> > drivers/net/ethernet/microsoft/mana/mana_en.c | 11 ++++++++++-
> > include/net/mana/mana.h | 1 +
> > 3 files changed, 27 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/mana/qp.c
> > b/drivers/infiniband/hw/mana/qp.c index 80cf4ade4b75..b27084c53a14
> > 100644
> > --- a/drivers/infiniband/hw/mana/qp.c
> > +++ b/drivers/infiniband/hw/mana/qp.c
> > @@ -829,11 +829,26 @@ static int mana_ib_destroy_qp_rss(struct
> mana_ib_qp *qp,
> > struct net_device *ndev;
> > struct mana_ib_wq *wq;
> > struct ib_wq *ibwq;
> > - int i;
> > + int i, err;
> >
> > ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port);
> > mpc = netdev_priv(ndev);
> >
> > + /* Disable vPort RX steering before destroying RX WQ objects.
> > + * Otherwise firmware still routes traffic to the destroyed queues,
> > + * which can cause bogus completions on reused CQ IDs when the
> > + * ethernet driver later creates new queues on mana_open().
> > + *
> > + * Unlike the ethernet teardown path, mana_fence_rqs() cannot be
> > + * used here because the fence completion CQE is delivered on the
> > + * CQ which is polled by userspace (e.g. DPDK), so there is no way
> > + * for the kernel to wait for fence completion.
> > + */
> > + err = mana_disable_vport_rx(mpc);
> > + if (err)
> > + ibdev_err(&mdev->ib_dev,
> > + "Failed to disable vPort RX: %d\n", err);
>
> > mana_cfg_vport_steering() already prints in all failure scenarios.
>
> Thanks
I'm sending v2 with this message removed.
Thanks,
Long
>
> > +
> > for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
> > ibwq = ind_tbl->ind_tbl[i];
> > wq = container_of(ibwq, struct mana_ib_wq, ibwq); diff --git
> > a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 22444c7530a5..51719ef1c09b 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -2934,6 +2934,13 @@ static void mana_rss_table_init(struct
> mana_port_context *apc)
> > ethtool_rxfh_indir_default(i, apc->num_queues); }
> >
> > +int mana_disable_vport_rx(struct mana_port_context *apc) {
> > + return mana_cfg_vport_steering(apc, TRI_STATE_FALSE, false, false,
> > + false);
> > +}
> > +EXPORT_SYMBOL_NS(mana_disable_vport_rx, "NET_MANA");
> > +
> > int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
> > bool update_hash, bool update_tab) { @@ -3339,10 +3346,12
> @@
> > static int mana_dealloc_queues(struct net_device *ndev)
> > */
> >
> > apc->rss_state = TRI_STATE_FALSE;
> > - err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
> > + err = mana_disable_vport_rx(apc);
> > if (err && mana_en_need_log(apc, err))
> > netdev_err(ndev, "Failed to disable vPort: %d\n", err);
> >
> > + mana_fence_rqs(apc);
> > +
> > /* Even in err case, still need to cleanup the vPort */
> > mana_destroy_rxqs(apc);
> > mana_destroy_txq(apc);
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index
> > 204c2b612a62..2634e9135eed 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -574,6 +574,7 @@ struct mana_port_context { netdev_tx_t
> > mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); int
> > mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
> > bool update_hash, bool update_tab);
> > +int mana_disable_vport_rx(struct mana_port_context *apc);
> >
> > int mana_alloc_queues(struct net_device *ndev); int
> > mana_attach(struct net_device *ndev);
> > --
> > 2.43.0
> >
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox